ent_offline_udf_america.py 5.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180
  1. #!/usr/bin/env /usr/bin/python3
  2. # -*- coding:utf-8 -*-
import hashlib
import re
# Offline-data UDFs for the enterprise-library uniqueness adjustment.
from datetime import datetime
from typing import Optional

from dw_base.spark.udf.customs.common_clean import clean_company_name
  8. def generate_md5_hash(input_str: str):
  9. input_data = input_str.encode('utf-8')
  10. md5_hash = hashlib.md5()
  11. md5_hash.update(input_data)
  12. return md5_hash.hexdigest()
  13. def generate_tid_usa(company_name: str,
  14. business_number: str,
  15. state: str) -> str or None:
  16. if not company_name:
  17. return None
  18. if business_number:
  19. input_str = business_number + 'AAA'
  20. else:
  21. if state:
  22. input_str = f"{company_name}-{state}BBB"
  23. else:
  24. input_str = company_name + 'CCC'
  25. return 'USA' + generate_md5_hash(input_str)
  26. def clean_company_name_extra(s: str) -> str or None:
  27. if s:
  28. suffixes = ["INCORPORATED",
  29. "LIMITED LIABILITY COMPANY",
  30. "PUBLIC LIMITED COMPANY",
  31. "LIMITED LIABILITY PARTNERSHIP",
  32. "LIMITED PARTNERSHIP",
  33. "GENERAL PARTNERSHIP",
  34. "PROFESSIONAL CORPORATION",
  35. "NON PROFIT ORGANIZATION",
  36. "S CORPORATION",
  37. "BENEFIT CORPORATION",
  38. "DOING BUSINESS AS",
  39. "COMPANY LIMITE",
  40. "CORPORATION",
  41. "COMPANY",
  42. "LIMITED",
  43. "S CORP",
  44. "B CORP",
  45. "CO LTD",
  46. "INC",
  47. "LLC",
  48. "CORP",
  49. "CO",
  50. "LTD",
  51. "PLC",
  52. "LLP",
  53. "LP",
  54. "GP",
  55. "PC",
  56. "NPO",
  57. "DBA"]
  58. # 去除后缀
  59. for suffix in suffixes:
  60. if s.endswith(suffix):
  61. s = s[:-len(suffix)]
  62. break
  63. # 去除字符串前后的空格
  64. s = s.strip()
  65. return s
  66. def clean_company_name_usa(company_name: str) -> str or None:
  67. if company_name:
  68. name = clean_company_name(company_name)
  69. if name:
  70. name = clean_company_name_extra(name)
  71. return name
  72. return None
  73. state_abbr_to_full = {
  74. "FL": "Florida", "Fla.": "Florida",
  75. "GA": "Georgia", "Ga.": "Georgia",
  76. "HI": "Hawaii",
  77. "ID": "Idaho",
  78. "IL": "Illinois", "Ill.": "Illinois",
  79. "IN": "Indiana", "Ind.": "Indiana",
  80. "IA": "Iowa",
  81. "KS": "Kansas", "Kan.": "Kansas",
  82. "KY": "Kentucky", "Ky.": "Kentucky",
  83. "LA": "Louisiana", "La.": "Louisiana",
  84. "ME": "Maine",
  85. "MD": "Maryland", "Md.": "Maryland",
  86. "MA": "Massachusetts", "Mass.": "Massachusetts",
  87. "MI": "Michigan", "Mich.": "Michigan",
  88. "MN": "Minnesota", "Minn.": "Minnesota",
  89. "MS": "Mississippi", "Miss.": "Mississippi",
  90. "MO": "Missouri", "Mo.": "Missouri",
  91. "MT": "Montana", "Mont.": "Montana",
  92. "NE": "Nebraska", "Neb.": "Nebraska",
  93. "NV": "Nevada", "Nev.": "Nevada",
  94. "NH": "New Hampshire", "N.H.": "New Hampshire",
  95. "NJ": "New Jersey", "N.J.": "New Jersey",
  96. "NM": "New Mexico", "N.M.": "New Mexico",
  97. "NY": "New York", "N.Y.": "New York",
  98. "NC": "North Carolina", "N.C.": "North Carolina",
  99. "ND": "North Dakota", "N.D.": "North Dakota",
  100. "OH": "Ohio",
  101. "OK": "Oklahoma", "Okla.": "Oklahoma",
  102. "OR": "Oregon", "Ore.": "Oregon",
  103. "PA": "Pennsylvania", "Pa.": "Pennsylvania",
  104. "RI": "Rhode Island", "R.I.": "Rhode Island",
  105. "SC": "South Carolina", "S.C.": "South Carolina",
  106. "SD": "South Dakota", "S.D.": "South Dakota",
  107. "TN": "Tennessee", "Tenn.": "Tennessee",
  108. "TX": "Texas", "Tex.": "Texas",
  109. "UT": "Utah",
  110. "VT": "Vermont", "Vt.": "Vermont",
  111. "VA": "Virginia", "Va.": "Virginia",
  112. "WA": "Washington", "Wash.": "Washington",
  113. "WV": "West Virginia", "W.Va.": "West Virginia",
  114. "WI": "Wisconsin", "Wis.": "Wisconsin",
  115. "WY": "Wyoming", "Wyo.": "Wyoming"
  116. }
  117. search_terms = list(state_abbr_to_full.keys()) + list(state_abbr_to_full.values())
  118. # def get_country_state(address: str) -> str or None:
  119. # if address:
  120. # # for term in search_terms:
  121. # # if term.upper() in address:
  122. # # return state_abbr_to_full.get(term, term)
  123. # # return None
  124. # address_upper = address.upper()
  125. # for abbr, full_name in state_abbr_to_full.items():
  126. # abbr_upper = abbr.upper()
  127. # full_name_upper = full_name.upper()
  128. # abbr_pattern = r'(?:\s|^)' + re.escape(abbr_upper) + r'(?:\s|\.|,|;|$|$)'
  129. # full_name_pattern = r'(?:\s|^)' + re.escape(full_name_upper) + r'(?:\s|\.|,|;|$|$)'
  130. #
  131. # if re.search(full_name_pattern, address_upper):
  132. # return full_name
  133. # elif re.search(abbr_pattern, address_upper):
  134. # return full_name
  135. # return None
  136. # else:
  137. # return None
  138. def get_country_state(address: str) -> str or None:
  139. if not address:
  140. return None
  141. address_upper = address.upper()
  142. patterns = {}
  143. for abbr, full_name in state_abbr_to_full.items():
  144. abbr_upper = abbr.upper()
  145. full_name_upper = full_name.upper()
  146. patterns[abbr_upper] = re.compile(r'(?:\s|^)' + re.escape(abbr_upper) + r'(?:\s|\.|,|;|$|$)')
  147. patterns[full_name_upper] = re.compile(r'(?:\s|^)' + re.escape(full_name_upper) + r'(?:\s|\.|,|;|$|$)')
  148. for name_upper, pattern in patterns.items():
  149. if pattern.search(address_upper):
  150. for abbr, full_name in state_abbr_to_full.items():
  151. if name_upper == abbr.upper() or name_upper == full_name.upper():
  152. return full_name
  153. return None
  154. if __name__ == '__main__':
  155. name = ' 326 GRAND ST9735535523PATERSON NJ 07505'
  156. print(get_country_state(name))
  157. pass