ent_offline_udf_turkey.py 2.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990
  1. #!/usr/bin/env /usr/bin/python3
  2. # -*- coding:utf-8 -*-
import hashlib
import re
# Enterprise-database uniqueness adjustment: UDFs for offline data
from datetime import datetime
from typing import Optional

from dw_base.spark.udf.customs.common_clean import clean_company_name
  8. turkey_replace_dict = {
  9. 'ç': 'c', 'Ç': 'C',
  10. 'ğ': 'g', 'Ğ': 'G',
  11. 'ı': 'i', 'İ': 'I',
  12. 'ö': 'o', 'Ö': 'O',
  13. 'ş': 's', 'Ş': 'S',
  14. 'ü': 'u', 'Ü': 'U'
  15. }
  16. def replace_str_english(text: str) -> str or None:
  17. if text:
  18. return text.translate(str.maketrans(turkey_replace_dict))
  19. return None
  20. def generate_md5_hash(input_str: str):
  21. input_data = input_str.encode('utf-8')
  22. md5_hash = hashlib.md5()
  23. md5_hash.update(input_data)
  24. return md5_hash.hexdigest()
  25. def generate_tid_tur(company_name: str,
  26. business_number: str) -> str or None:
  27. if not company_name:
  28. return None
  29. if business_number:
  30. input_str = business_number + 'AAA'
  31. else:
  32. input_str = company_name + 'BBB'
  33. return 'TUR' + generate_md5_hash(input_str)
  34. def clean_company_name_extra(s: str) -> str or None:
  35. if s:
  36. suffixes = [
  37. "Tahmini Anonim Sirket",
  38. "Anonim Sirket",
  39. "Halka Acık Sirket",
  40. "Komandit Sirket",
  41. "Limited Sirket",
  42. "Baslangıc Sirket",
  43. "Tahmini Anonim Sirketi",
  44. "Anonim Sirketi",
  45. "Halka Acık Sirketi",
  46. "Komandit Sirketi",
  47. "Limited Sirketi",
  48. "Baslangıc Sirketi",
  49. " LTD Sti",
  50. " T A S",
  51. " A S",
  52. " H S",
  53. " K S",
  54. " B S"]
  55. # 去除后缀
  56. for suffix in suffixes:
  57. if s.endswith(suffix.upper()):
  58. s = s[:-len(suffix)]
  59. break
  60. # 去除字符串前后的空格
  61. s = s.strip()
  62. return s
  63. def clean_company_name_tur(company_name: str) -> str or None:
  64. if company_name:
  65. company_name = replace_str_english(company_name)
  66. name = clean_company_name(company_name)
  67. if name:
  68. name = clean_company_name_extra(name)
  69. return name
  70. return None
  71. if __name__ == '__main__':
  72. name = 'KBR İNŞAAT METAL VE ELEKTRİK SANAYİ TİCARET LİMİTED ŞİRKETİ'
  73. print(clean_company_name_tur(name))
  74. pass