#!/usr/bin/env /usr/bin/python3
# -*- coding:utf-8 -*-
"""Enterprise-library uniqueness adjustment: UDF helpers for offline data.

Provides name normalisation for Indian company names (stripping the
``M S`` / ``MS`` head and common legal-form suffixes) and generation of a
deterministic ``IND``-prefixed identifier from a business number or a
company name.
"""
import hashlib
from typing import Iterable, Optional

from dw_base.spark.udf.customs.common_clean import clean_company_name

# Leading tokens removed from the cleaned name. Only the first matching
# entry is removed, so the order of this list matters.
ind_head = [
    'M S',
    'MS',
]

# Legal-form suffixes, tried in this order; the first one that applies wins
# (see india_truncate_at_suffix for which ones must sit at the very end of
# the name).
india_suffix_list = [
    ' CO I PVT L',
    ' CO PVT L',
    ' CO PRIVATE L',
    ' CO I LTD',
    ' I LTD',
    ' I LIMITED',
    ' I PVT L',
    ' I PRIVATE L',
    ' COMPANY PRIVATE L',
    ' COMPANY PVT L',
    ' P LTD',
    ' PRIVATE L',
    ' PVT L',
    ' CO',
    ' INC',
    ' CO LIMITED',
    ' LTD',
    ' LIMITED',
    ' CO I',
    ' I',
]

# Suffixes that only trigger truncation when the name ends with them; they
# are too short / too common to be cut at when they appear mid-name.
_END_ANCHORED_SUFFIXES = frozenset({
    ' CO',
    ' INC',
    ' LTD',
    ' LIMITED',
    ' CO I',
    ' I',
})


def generate_md5_hash(input_str: str) -> str:
    """Return the hexadecimal MD5 digest of *input_str* encoded as UTF-8."""
    return hashlib.md5(input_str.encode('utf-8')).hexdigest()


def generate_tid_ind(company_name: str, business_number: str) -> Optional[str]:
    """Build the ``IND``-prefixed identifier for a company.

    The business number is preferred as the hash seed; the company name is
    used only when no business number is given. Different salts (``AAA`` /
    ``BBB``) are appended so the two kinds of seed hash differently.

    :param company_name: company name; when empty/None no id is produced.
    :param business_number: business number, may be empty/None.
    :return: ``'IND' + md5 hex digest``, or None when *company_name* is falsy.
    """
    if not company_name:
        return None
    if business_number:
        seed = business_number + 'AAA'
    else:
        seed = company_name + 'BBB'
    return 'IND' + generate_md5_hash(seed)


def clean_company_name_ind(company_name: str) -> Optional[str]:
    """Normalise an Indian company name.

    The name is upper-cased and passed through ``clean_company_name``; then
    the first matching entry of ``ind_head`` is removed from the front and
    the name is truncated at the first applicable entry of
    ``india_suffix_list``. If fewer than 8 characters remain after
    stripping whitespace, the cleaned (un-truncated) name is returned
    instead.

    :param company_name: raw company name.
    :return: the normalised name, or None when *company_name* is falsy.
    """
    if not company_name:
        return None

    cleaned = clean_company_name(company_name.upper())
    if not cleaned:
        # clean_company_name's return contract is not visible in this file;
        # guard so that a None/empty result is passed through instead of
        # raising AttributeError on the string methods below.
        return cleaned

    # NOTE(review): startswith() has no word-boundary check, so a cleaned
    # name such as 'MSC ...' also loses its leading 'MS'. Left unchanged to
    # keep existing outputs stable -- confirm whether this is intended.
    without_head = cleaned
    for head in ind_head:
        if without_head.startswith(head):
            without_head = remove_prefix(without_head, head)
            break

    core_name = india_truncate_at_suffix(without_head, india_suffix_list).strip()
    if len(core_name) < 8:
        # Too little left to be a usable name: fall back to the cleaned
        # full name. This reuses the value computed above rather than
        # cleaning the same string a second time (assumes
        # clean_company_name is deterministic -- TODO confirm).
        return cleaned
    return core_name


def india_truncate_at_suffix(text: str, suffix_list: Iterable[str]) -> Optional[str]:
    """Cut *text* at the first applicable suffix from *suffix_list*.

    Suffixes are tried in list order. A suffix that occurs in *text*
    applies when:

    * it is one of the end-anchored suffixes (`` CO``, `` INC``, `` LTD``,
      `` LIMITED``, `` CO I``, `` I``) and *text* ends with it;
    * it is `` CO LIMITED`` and *text* does not contain
      `` AND CO LIMITED``;
    * it is any other suffix (no further condition).

    :param text: the name to truncate.
    :param suffix_list: candidate suffixes, in priority order.
    :return: *text* up to the last occurrence of the first applicable
        suffix, or *text* unchanged when none applies.
    """
    for suffix in suffix_list:
        if suffix not in text:
            continue
        if suffix in _END_ANCHORED_SUFFIXES:
            applies = text.endswith(suffix)
        elif suffix == ' CO LIMITED':
            applies = ' AND CO LIMITED' not in text
        else:
            applies = True
        if applies:
            return split_last(text, suffix)
    return text


def split_last(text: str, suffix: str) -> Optional[str]:
    """Return *text* up to the last occurrence of *suffix*.

    :return: None when *text* is falsy; *text* unchanged when *suffix* does
        not occur in it; otherwise everything before the last occurrence.
    """
    if not text:
        return None
    cut_at = text.rfind(suffix)
    if cut_at == -1:
        return text
    return text[:cut_at]


def remove_prefix(text: str, prefix: str) -> str:
    """Return *text* without a leading *prefix*, or unchanged if absent."""
    if text.startswith(prefix):
        return text[len(prefix):]
    return text


if __name__ == '__main__':
    # Manual smoke check.
    name = 'P.T.ACEH KIAT BEUTARI JL INDONESI'
    print(clean_company_name_ind(name))