ent_offline_udf_india.py 3.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113
  1. #!/usr/bin/env /usr/bin/python3
  2. # -*- coding:utf-8 -*-
  3. import hashlib
# Enterprise-database uniqueness adjustment: UDFs for offline data.
  5. from dw_base.spark.udf.customs.common_clean import clean_company_name
  6. ind_head = [
  7. 'M S',
  8. 'MS'
  9. ]
  10. india_suffix_list = [
  11. ' CO I PVT L',
  12. ' CO PVT L',
  13. ' CO PRIVATE L',
  14. ' CO I LTD',
  15. ' I LTD',
  16. ' I LIMITED',
  17. ' I PVT L',
  18. ' I PRIVATE L',
  19. ' COMPANY PRIVATE L',
  20. ' COMPANY PVT L',
  21. ' P LTD',
  22. ' PRIVATE L',
  23. ' PVT L',
  24. ' CO',
  25. ' INC',
  26. ' CO LIMITED',
  27. ' LTD',
  28. ' LIMITED',
  29. ' CO I',
  30. ' I'
  31. ]
  32. def generate_md5_hash(input_str: str):
  33. input_data = input_str.encode('utf-8')
  34. md5_hash = hashlib.md5()
  35. md5_hash.update(input_data)
  36. return md5_hash.hexdigest()
  37. def generate_tid_ind(company_name: str,
  38. business_number: str) -> str or None:
  39. if not company_name:
  40. return None
  41. if business_number:
  42. input_str = business_number + 'AAA'
  43. else:
  44. input_str = company_name + 'BBB'
  45. return 'IND' + generate_md5_hash(input_str)
  46. def clean_company_name_ind(company_name: str) -> str or None:
  47. if company_name:
  48. bak_name = company_name.upper()
  49. company_name = clean_company_name(bak_name)
  50. for head in ind_head:
  51. if company_name.startswith(head):
  52. company_name = remove_prefix(company_name, head)
  53. break
  54. truncated_name = india_truncate_at_suffix(company_name, india_suffix_list)
  55. if (len(truncated_name.strip()) < 8):
  56. return clean_company_name(bak_name)
  57. else:
  58. return truncated_name.strip()
  59. return None
  60. def india_truncate_at_suffix(text, suffix_list):
  61. for suffix in suffix_list:
  62. if suffix in text:
  63. if (
  64. suffix != ' CO' and suffix != ' INC' and suffix != ' CO LIMITED' and suffix != ' LTD'
  65. and suffix != ' LIMITED' and suffix != ' CO I' and suffix != ' I'
  66. ):
  67. return split_last(text, suffix)
  68. elif suffix == ' CO' and text.endswith(' CO'):
  69. return split_last(text, suffix)
  70. elif suffix == ' INC' and text.endswith(' INC'):
  71. return split_last(text, suffix)
  72. elif suffix == ' CO LIMITED' and ' AND CO LIMITED' not in text:
  73. return split_last(text, suffix)
  74. elif suffix == ' LTD' and text.endswith(' LTD'):
  75. return split_last(text, suffix)
  76. elif suffix == ' LIMITED' and text.endswith(' LIMITED'):
  77. return split_last(text, suffix)
  78. elif suffix == ' CO I' and text.endswith(' CO I'):
  79. return split_last(text, suffix)
  80. elif suffix == ' I' and text.endswith(' I'):
  81. return split_last(text, suffix)
  82. return text
  83. def split_last(text, suffix):
  84. if text:
  85. last_occurrence_index = text.rfind(suffix)
  86. if last_occurrence_index != -1:
  87. return text[:last_occurrence_index]
  88. return text
  89. return None
  90. def remove_prefix(text, prefix):
  91. if text.startswith(prefix):
  92. return text[len(prefix):]
  93. return text
  94. if __name__ == '__main__':
  95. name = 'P.T.ACEH KIAT BEUTARI JL INDONESI'
  96. print(clean_company_name_ind(name))
  97. pass