| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113 |
- #!/usr/bin/env /usr/bin/python3
- # -*- coding:utf-8 -*-
import hashlib
from typing import Optional

# Company-library uniqueness adjustment; UDFs for offline data.
from dw_base.spark.udf.customs.common_clean import clean_company_name
# Leading tokens dropped from the start of a cleaned name. They are tried in
# order and only the first one that matches is removed.
# NOTE(review): presumably the "M/S" (Messrs) honorific after punctuation has
# been stripped -- confirm with the data owner.
ind_head = ['M S', 'MS']

# Legal-form markers at which an Indian company name is truncated.
# Order is significant: india_truncate_at_suffix walks this list from top to
# bottom and stops at the first marker that applies. Markers are located with
# a substring test, so an entry ending in " L" also hits "... LTD" and
# "... LIMITED".
india_suffix_list = [
    # Private-limited / India-limited forms.
    ' CO I PVT L', ' CO PVT L', ' CO PRIVATE L',
    ' CO I LTD', ' I LTD', ' I LIMITED',
    ' I PVT L', ' I PRIVATE L',
    ' COMPANY PRIVATE L', ' COMPANY PVT L',
    ' P LTD', ' PRIVATE L', ' PVT L',
    # Short generic forms.
    ' CO', ' INC', ' CO LIMITED',
    ' LTD', ' LIMITED',
    ' CO I', ' I',
]
def generate_md5_hash(input_str: str):
    """Return the hexadecimal MD5 digest of ``input_str`` encoded as UTF-8."""
    return hashlib.md5(input_str.encode('utf-8')).hexdigest()
def generate_tid_ind(company_name: str,
                     business_number: Optional[str]) -> Optional[str]:
    """Build the ``IND``-prefixed identifier for a company record.

    The identifier is ``'IND'`` followed by the MD5 hex digest of a seed
    string. The seed is ``business_number + 'AAA'`` when a business number
    is present, otherwise ``company_name + 'BBB'``.

    :param company_name: company name; a falsy value yields ``None``.
    :param business_number: registration number; may be ``None`` or empty.
    :return: the identifier, or ``None`` when ``company_name`` is falsy
        (even if a business number was supplied).
    """
    if not company_name:
        return None

    # NOTE(review): the 'AAA' / 'BBB' tails presumably keep number-based and
    # name-based seeds from colliding -- confirm with the data owner.
    seed = (business_number + 'AAA') if business_number else (company_name + 'BBB')
    return 'IND' + generate_md5_hash(seed)
def clean_company_name_ind(company_name: str) -> Optional[str]:
    """Reduce an Indian company name to its core for uniqueness matching.

    Steps:
      1. upper-case the name and pass it through ``clean_company_name``;
      2. drop the first matching leading token from ``ind_head``;
      3. truncate at the first applicable marker in ``india_suffix_list``;
      4. strip surrounding whitespace.

    If the stripped core is shorter than 8 characters, the cleaned name from
    step 1 is returned instead (leading token and legal form kept).

    :param company_name: raw company name; a falsy value yields ``None``.
    :return: the normalised name, or ``None`` when there is no input.
    """
    if not company_name:
        return None

    # Assumes clean_company_name is deterministic, so the result can be kept
    # for the short-name fallback instead of calling it a second time.
    cleaned = clean_company_name(company_name.upper())
    if not cleaned:
        # An empty result has nothing to trim; a None result would otherwise
        # raise AttributeError on .startswith below.
        return cleaned

    core = cleaned
    for head in ind_head:
        # NOTE(review): this is a plain prefix test with no word boundary, so
        # a name such as "MSC ..." loses its leading "MS". Left unchanged to
        # keep existing output stable -- confirm whether that is intended.
        if core.startswith(head):
            core = remove_prefix(core, head)
            break

    core = india_truncate_at_suffix(core, india_suffix_list).strip()
    return core if len(core) >= 8 else cleaned
def india_truncate_at_suffix(text, suffix_list):
    """Cut ``text`` at the first applicable legal-form marker.

    Markers are tried in ``suffix_list`` order. A marker that does not occur
    in ``text`` is skipped. One that does occur applies only if:

    * ``' CO'``, ``' INC'``, ``' LTD'``, ``' LIMITED'``, ``' CO I'``, ``' I'``
      -- ``text`` ends with the marker;
    * ``' CO LIMITED'`` -- ``text`` does not contain ``' AND CO LIMITED'``;
    * any other marker -- always.

    The first marker that applies decides the result: everything from its
    last occurrence onward is removed (via ``split_last``). If none applies,
    ``text`` is returned unchanged.
    """
    end_anchored = (' CO', ' INC', ' LTD', ' LIMITED', ' CO I', ' I')

    for marker in suffix_list:
        if marker not in text:
            continue

        if marker == ' CO LIMITED':
            applies = ' AND CO LIMITED' not in text
        elif marker in end_anchored:
            applies = text.endswith(marker)
        else:
            applies = True

        if applies:
            return split_last(text, marker)

    return text
def split_last(text, suffix):
    """Return ``text`` up to the last occurrence of ``suffix``.

    A falsy ``text`` (``None`` or empty string) yields ``None``. When
    ``suffix`` does not occur, ``text`` is returned unchanged.
    """
    if not text:
        return None

    cut_at = text.rfind(suffix)
    return text if cut_at == -1 else text[:cut_at]
def remove_prefix(text, prefix):
    """Return ``text`` without a leading ``prefix``; unchanged if absent."""
    return text[len(prefix):] if text.startswith(prefix) else text
if __name__ == '__main__':
    # Ad-hoc manual check: print the normalised form of one sample name.
    sample_name = 'P.T.ACEH KIAT BEUTARI JL INDONESI'
    print(clean_company_name_ind(sample_name))
|