| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180 |
- #!/usr/bin/env /usr/bin/python3
- # -*- coding:utf-8 -*-
- import hashlib
- import re
# Company-registry uniqueness adjustments: UDFs for offline data
- from datetime import datetime
- from dw_base.spark.udf.customs.common_clean import clean_company_name
def generate_md5_hash(input_str: str):
    """Return the hexadecimal MD5 digest of a string.

    :param input_str: text to hash; it is encoded as UTF-8 before hashing.
    :return: the 32-character lowercase hex digest.
    """
    return hashlib.md5(input_str.encode('utf-8')).hexdigest()
def generate_tid_usa(company_name: str,
                     business_number: str,
                     state: str) -> str or None:
    """Build the ``USA``-prefixed identifier for a company record.

    The hashed seed is chosen by the most specific data available:

    * ``business_number`` present: ``business_number + 'AAA'``
    * otherwise ``state`` present: ``'<company_name>-<state>BBB'``
    * otherwise: ``company_name + 'CCC'``

    :param company_name: company name; a falsy value yields ``None`` even
        when a business number is supplied.
    :param business_number: registration number, may be falsy.
    :param state: state name, may be falsy.
    :return: ``'USA'`` followed by the MD5 hex digest of the seed, or ``None``.
    """
    if not company_name:
        return None

    if business_number:
        seed = business_number + 'AAA'
    elif state:
        seed = f"{company_name}-{state}BBB"
    else:
        seed = company_name + 'CCC'

    return 'USA' + generate_md5_hash(seed)
# Legal-form suffixes stripped from the end of a company name.  Order matters:
# the first entry that matches wins, so longer forms are listed before the
# shorter forms they contain (e.g. "S CORPORATION" before "CORPORATION").
_COMPANY_NAME_SUFFIXES = (
    "INCORPORATED",
    "LIMITED LIABILITY COMPANY",
    "PUBLIC LIMITED COMPANY",
    "LIMITED LIABILITY PARTNERSHIP",
    "LIMITED PARTNERSHIP",
    "GENERAL PARTNERSHIP",
    "PROFESSIONAL CORPORATION",
    "NON PROFIT ORGANIZATION",
    "S CORPORATION",
    "BENEFIT CORPORATION",
    "DOING BUSINESS AS",
    "COMPANY LIMITE",
    "CORPORATION",
    "COMPANY",
    "LIMITED",
    "S CORP",
    "B CORP",
    "CO LTD",
    "INC",
    "LLC",
    "CORP",
    "CO",
    "LTD",
    "PLC",
    "LLP",
    "LP",
    "GP",
    "PC",
    "NPO",
    "DBA",
)


def clean_company_name_extra(s: str) -> str or None:
    """Strip one trailing legal-form suffix from a company name.

    The comparison is case-sensitive against upper-case suffixes, so the
    input is presumably already upper-cased by the caller (verify against
    ``clean_company_name``).

    A suffix is removed only when it is a separate word, i.e. it is the whole
    string or the character before it is not alphanumeric.  This keeps names
    such as ``"TEXACO"`` or ``"MEXICO"`` intact and stops
    ``"JONES CORPORATION"`` from losing its final ``S`` to ``"S CORPORATION"``.

    :param s: company name; falsy values (``None``, ``""``) are returned as is.
    :return: the name without its suffix, stripped of surrounding whitespace.
    """
    if not s:
        return s

    # NOTE(review): glued suffixes ("TEXACO") are no longer truncated; if the
    # cleaned name feeds persisted identifiers, confirm whether affected rows
    # need to be regenerated.
    for suffix in _COMPANY_NAME_SUFFIXES:
        if not s.endswith(suffix):
            continue
        head = s[:-len(suffix)]
        if head and head[-1].isalnum():
            # Suffix is merely the tail of a longer word; try the next one.
            continue
        s = head
        break  # remove at most one suffix

    return s.strip()
def clean_company_name_usa(company_name: str) -> str or None:
    """Normalise a US company name for matching.

    Runs the shared ``clean_company_name`` step and, when that leaves a
    non-empty result, strips a trailing legal-form suffix with
    ``clean_company_name_extra``.

    :param company_name: raw company name; falsy input yields ``None``.
    :return: the cleaned name, or whatever falsy value ``clean_company_name``
        produced, or ``None`` for falsy input.
    """
    if not company_name:
        return None

    cleaned = clean_company_name(company_name)
    return clean_company_name_extra(cleaned) if cleaned else cleaned
# Postal and traditional abbreviations mapped to the full state name.
# Insertion order is significant: get_country_state() tries the entries in
# this order and returns the first state that matches.
# NOTE(review): the table starts at Florida; Alabama through Delaware are
# absent, so addresses in those states are never recognised. Confirm whether
# that is intentional before extending it.
state_abbr_to_full = {
    "FL": "Florida", "Fla.": "Florida",
    "GA": "Georgia", "Ga.": "Georgia",
    "HI": "Hawaii",
    "ID": "Idaho",
    "IL": "Illinois", "Ill.": "Illinois",
    "IN": "Indiana", "Ind.": "Indiana",
    "IA": "Iowa",
    "KS": "Kansas", "Kan.": "Kansas",
    "KY": "Kentucky", "Ky.": "Kentucky",
    "LA": "Louisiana", "La.": "Louisiana",
    "ME": "Maine",
    "MD": "Maryland", "Md.": "Maryland",
    "MA": "Massachusetts", "Mass.": "Massachusetts",
    "MI": "Michigan", "Mich.": "Michigan",
    "MN": "Minnesota", "Minn.": "Minnesota",
    "MS": "Mississippi", "Miss.": "Mississippi",
    "MO": "Missouri", "Mo.": "Missouri",
    "MT": "Montana", "Mont.": "Montana",
    "NE": "Nebraska", "Neb.": "Nebraska",
    "NV": "Nevada", "Nev.": "Nevada",
    "NH": "New Hampshire", "N.H.": "New Hampshire",
    "NJ": "New Jersey", "N.J.": "New Jersey",
    "NM": "New Mexico", "N.M.": "New Mexico",
    "NY": "New York", "N.Y.": "New York",
    "NC": "North Carolina", "N.C.": "North Carolina",
    "ND": "North Dakota", "N.D.": "North Dakota",
    "OH": "Ohio",
    "OK": "Oklahoma", "Okla.": "Oklahoma",
    "OR": "Oregon", "Ore.": "Oregon",
    "PA": "Pennsylvania", "Pa.": "Pennsylvania",
    "RI": "Rhode Island", "R.I.": "Rhode Island",
    "SC": "South Carolina", "S.C.": "South Carolina",
    "SD": "South Dakota", "S.D.": "South Dakota",
    "TN": "Tennessee", "Tenn.": "Tennessee",
    "TX": "Texas", "Tex.": "Texas",
    "UT": "Utah",
    "VT": "Vermont", "Vt.": "Vermont",
    "VA": "Virginia", "Va.": "Virginia",
    "WA": "Washington", "Wash.": "Washington",
    "WV": "West Virginia", "W.Va.": "West Virginia",
    "WI": "Wisconsin", "Wis.": "Wisconsin",
    "WY": "Wyoming", "Wyo.": "Wyoming"
}

# Every abbreviation followed by every full name (full names repeat once per
# abbreviation that maps to them).
search_terms = list(state_abbr_to_full.keys()) + list(state_abbr_to_full.values())

# Upper-cased full names that are also the trailing word(s) of a longer state
# name, mapped to a look-behind that rejects the longer name. Without it
# "WEST VIRGINIA" is reported as "Virginia", because VIRGINIA is tried first.
_FULL_NAME_GUARDS = {"VIRGINIA": r"(?<!WEST)"}


def _build_state_patterns():
    """Compile one ``(pattern, full_name)`` pair per search term, in table order."""
    compiled = {}
    for abbr, full_name in state_abbr_to_full.items():
        for term in (abbr.upper(), full_name.upper()):
            if term in compiled:
                # A full name shared by two abbreviations keeps its first slot.
                continue
            guard = _FULL_NAME_GUARDS.get(term, "")
            # The term must start at the beginning or after whitespace, and be
            # followed by whitespace, '.', ',', ';' or the end of the string.
            regex = guard + r"(?:\s|^)" + re.escape(term) + r"(?:[\s.,;]|$)"
            compiled[term] = (re.compile(regex), full_name)
    return tuple(compiled.values())


# Built once at import time; get_country_state() may be called once per row.
_STATE_PATTERNS = _build_state_patterns()


def get_country_state(address: str) -> str or None:
    """Return the full name of the first US state recognised in an address.

    The address is upper-cased and checked against each abbreviation and full
    name in ``state_abbr_to_full``, in table order. The first term that occurs
    as a separate token decides the result, so table order, not position in
    the address, breaks ties when several states appear.

    :param address: free-form address text; falsy input yields ``None``.
    :return: the full state name (e.g. ``"New Jersey"``), or ``None`` when no
        state in the table is found.
    """
    if not address:
        return None

    address_upper = address.upper()
    for pattern, full_name in _STATE_PATTERNS:
        if pattern.search(address_upper):
            return full_name
    return None
if __name__ == '__main__':
    # Manual smoke check: print the state detected in a sample address.
    sample_address = ' 326 GRAND ST9735535523PATERSON NJ 07505'
    print(get_country_state(sample_address))
|