ent_offline_udf_indonesia.py 2.5 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091
  1. #!/usr/bin/env /usr/bin/python3
  2. # -*- coding:utf-8 -*-
  # Enterprise-database uniqueness adjustment: offline-data UDFs
  import hashlib
  from datetime import datetime
  from typing import Optional

  from dw_base.spark.udf.customs.common_clean import clean_company_name
  def generate_md5_hash(input_str: str):
      """Return the hexadecimal MD5 digest of ``input_str`` encoded as UTF-8."""
      return hashlib.md5(input_str.encode('utf-8')).hexdigest()
  def generate_tid_idn(company_name: str,
                       business_number: Optional[str],
                       city: Optional[str]) -> Optional[str]:
      """Build the ``'IDN'``-prefixed identifier for a company record.

      The text that gets hashed depends on which fields are present:

      * business number and city -> ``"<business_number>-<city>AAA"``
      * business number only     -> ``"<business_number>BBB"``
      * no business number       -> ``"<company_name>CCC"``

      Args:
          company_name: Company name; required.
          business_number: May be empty or None.
          city: May be empty or None; only used together with a business number.

      Returns:
          ``'IDN'`` followed by the MD5 hex digest of the chosen text, or None
          when ``company_name`` is empty or None.
      """
      if not company_name:
          return None

      if not business_number:
          seed = company_name + 'CCC'
      elif city:
          seed = f"{business_number}-{city}AAA"
      else:
          seed = business_number + 'BBB'
      return 'IDN' + generate_md5_hash(seed)
  def clean_company_name_extra(s: str) -> Optional[str]:
      """Strip legal-form markers and a trailing ``' DI ...'`` part from a name.

      Steps, in order: drop one leading ``PT.``/``PT``/``CV.``/``CV``, drop one
      trailing ``,PT``/``PT``/``,CV``/``CV``, cut everything from the first
      ``' DI '`` onwards, then strip surrounding whitespace. Matching is
      case-sensitive.

      Args:
          s: Company name; may be empty or None.

      Returns:
          The cleaned name, or None when ``s`` is empty or None.
      """
      if not s:
          return None

      # Longer alternatives come first: with 'PT' ahead of 'PT.' (or 'PT' ahead
      # of ',PT') the longer form could never match, so 'PT. ABC' was left as
      # '. ABC' and 'ABC,PT' as 'ABC,'.
      prefixes = ('PT.', 'PT', 'CV.', 'CV')
      suffixes = (',PT', 'PT', ',CV', 'CV')
      # NOTE(review): markers are matched without a word boundary, so a name
      # that merely begins or ends with these letters is trimmed too - confirm
      # this is intended for the already-cleaned input.

      # Remove at most one prefix.
      for prefix in prefixes:
          if s.startswith(prefix):
              s = s[len(prefix):]
              break

      # Remove at most one suffix.
      for suffix in suffixes:
          if s.endswith(suffix):
              s = s[:-len(suffix)]
              break

      # ' DI ' (a space on both sides) and everything after it is dropped.
      head, _, _ = s.partition(' DI ')
      return head.strip()
  def clean_company_name_idn(company_name: str) -> Optional[str]:
      """Clean a company name with the common cleaner plus the extra rules.

      The imported ``clean_company_name`` runs first; when it yields a non-empty
      value, ``clean_company_name_extra`` is applied to that value.

      Args:
          company_name: Raw company name; may be empty or None.

      Returns:
          The cleaned name. None when ``company_name`` is empty or None; a falsy
          result from ``clean_company_name`` is returned unchanged.
      """
      if not company_name:
          return None
      name = clean_company_name(company_name)
      return clean_company_name_extra(name) if name else name
  def get_standard_company_name(s: str) -> Optional[str]:
      """Normalise the legal-form marker of a company name.

      A leading ``PT``/``P.T.``/``P T`` becomes ``PT.`` and a leading ``CV``
      becomes ``CV.`` (both case-insensitive); a name with neither gets ``PT.``
      prepended. One trailing ``PT``/``CV`` marker is then removed.

      Args:
          s: Raw company name.

      Returns:
          The standardised name. None is returned for None, and an empty string
          for blank input, instead of raising or producing a bare ``'PT.'``.
      """
      if s is None:
          return None
      s = s.strip()
      if not s:
          # Nothing to standardise; do not fabricate a name out of the prefix.
          return s

      upper = s.upper()
      if upper.startswith(('PT ', 'PT.')):
          s = 'PT.' + s[3:].strip()
      elif upper.startswith(('P.T.', 'P T ')):
          s = 'PT.' + s[4:].strip()
      elif upper.startswith(('CV ', 'CV.')):
          s = 'CV.' + s[3:].strip()
      else:
          s = 'PT.' + s

      # Check for and remove one trailing marker, longest form first. The CV
      # list mirrors the PT one so 'NAME, CV' does not keep a dangling comma.
      suffixes_to_remove = ('., PT', '., CV', ', PT', ', CV',
                            ',PT', ',CV', 'PT', 'CV')
      upper = s.upper()
      for suffix in suffixes_to_remove:
          if not upper.endswith(suffix):
              continue
          head = s[:-len(suffix)]
          # A bare 'PT'/'CV' glued to a letter or digit is the tail of a word
          # (e.g. 'CONCEPT'), not a legal-form marker, so it is kept.
          glued_to_word = suffix[0].isalnum() and head[-1:].isalnum()
          if not glued_to_word:
              s = head.strip()
          break
      return s
  if __name__ == '__main__':
      # Manual smoke check: print the standardised form of one sample name.
      sample = 'P.T.ACEH KIAT BEUTARI JL INDONESI'
      print(get_standard_company_name(sample))