#!/usr/bin/env /usr/bin/python3
# -*- coding:utf-8 -*-
"""Enterprise-library uniqueness adjustment: UDF helpers for offline data."""
import hashlib
import re
from datetime import datetime
from typing import Optional

from dw_base.spark.udf.customs.common_clean import clean_company_name

# Legal-form suffixes removed by clean_company_name_extra. They are tried in
# this order and only the first hit is removed, so longer forms must stay
# ahead of the shorter forms they contain (e.g. "LLP" before "LP").
_COMPANY_SUFFIXES = (
    "INCORPORATED", "LIMITED LIABILITY COMPANY", "PUBLIC LIMITED COMPANY",
    "LIMITED LIABILITY PARTNERSHIP", "LIMITED PARTNERSHIP",
    "GENERAL PARTNERSHIP", "PROFESSIONAL CORPORATION",
    "NON PROFIT ORGANIZATION", "S CORPORATION", "BENEFIT CORPORATION",
    "DOING BUSINESS AS", "COMPANY LIMITE", "CORPORATION", "COMPANY",
    "LIMITED", "S CORP", "B CORP", "CO LTD", "INC", "LLC", "CORP", "CO",
    "LTD", "PLC", "LLP", "LP", "GP", "PC", "NPO", "DBA",
)


def generate_md5_hash(input_str: str) -> str:
    """Return the hexadecimal MD5 digest of ``input_str`` encoded as UTF-8."""
    return hashlib.md5(input_str.encode('utf-8')).hexdigest()


def generate_tid_usa(company_name: str, business_number: str, state: str) -> Optional[str]:
    """Build the id string for a USA company record.

    The hashed key is chosen by priority, and each variant carries its own
    marker so that keys built from different fields cannot collide:

    1. ``business_number`` + ``'AAA'`` when a business number is present;
    2. ``company_name-state`` + ``'BBB'`` when only the state is present;
    3. ``company_name`` + ``'CCC'`` otherwise.

    :param company_name: company name; an empty/None name yields no id.
    :param business_number: registration number, may be empty/None.
    :param state: state value, may be empty/None.
    :return: ``'USA'`` followed by the MD5 hex digest of the key, or None
        when ``company_name`` is empty.
    """
    if not company_name:
        return None
    if business_number:
        input_str = business_number + 'AAA'
    elif state:
        input_str = f"{company_name}-{state}BBB"
    else:
        input_str = company_name + 'CCC'
    return 'USA' + generate_md5_hash(input_str)


def clean_company_name_extra(s: str) -> Optional[str]:
    """Remove one trailing legal-form suffix from ``s`` and trim whitespace.

    Only the first entry of ``_COMPANY_SUFFIXES`` that ``s`` ends with is
    removed. Empty or None input is returned unchanged.

    NOTE(review): the check is a plain, case-sensitive ``str.endswith`` with
    no word boundary, so a name such as "TACO" loses its trailing "CO".
    Presumably the input is already upper-cased by ``clean_company_name``;
    confirm before relying on it. Behaviour is intentionally left as-is
    because the cleaned name may feed previously generated ids.

    :param s: company name to clean.
    :return: the name without the matched suffix, stripped of surrounding
        whitespace; ``s`` itself when it is empty or None.
    """
    if not s:
        return s
    for suffix in _COMPANY_SUFFIXES:
        if s.endswith(suffix):
            s = s[:-len(suffix)]
            break
    # Strip surrounding whitespace, including the gap left by the suffix.
    return s.strip()


def clean_company_name_usa(company_name: str) -> Optional[str]:
    """Clean a USA company name.

    Applies the shared ``clean_company_name`` first and, when that leaves a
    non-empty value, removes a trailing legal-form suffix as well.

    :param company_name: raw company name.
    :return: the cleaned name; None when ``company_name`` is empty/None;
        otherwise whatever falsy value ``clean_company_name`` produced.
    """
    if not company_name:
        return None
    name = clean_company_name(company_name)
    if name:
        name = clean_company_name_extra(name)
    return name


# State abbreviations (postal codes and dotted traditional forms) mapped to
# the full state name.
# NOTE(review): there are no entries for states alphabetically before Florida
# (Alabama ... Delaware), so addresses in e.g. CA or AZ resolve to None.
# Confirm whether that is intended before extending the table, since the
# resolved state may feed generate_tid_usa.
state_abbr_to_full = {
    "FL": "Florida", "Fla.": "Florida",
    "GA": "Georgia", "Ga.": "Georgia",
    "HI": "Hawaii",
    "ID": "Idaho",
    "IL": "Illinois", "Ill.": "Illinois",
    "IN": "Indiana", "Ind.": "Indiana",
    "IA": "Iowa",
    "KS": "Kansas", "Kan.": "Kansas",
    "KY": "Kentucky", "Ky.": "Kentucky",
    "LA": "Louisiana", "La.": "Louisiana",
    "ME": "Maine",
    "MD": "Maryland", "Md.": "Maryland",
    "MA": "Massachusetts", "Mass.": "Massachusetts",
    "MI": "Michigan", "Mich.": "Michigan",
    "MN": "Minnesota", "Minn.": "Minnesota",
    "MS": "Mississippi", "Miss.": "Mississippi",
    "MO": "Missouri", "Mo.": "Missouri",
    "MT": "Montana", "Mont.": "Montana",
    "NE": "Nebraska", "Neb.": "Nebraska",
    "NV": "Nevada", "Nev.": "Nevada",
    "NH": "New Hampshire", "N.H.": "New Hampshire",
    "NJ": "New Jersey", "N.J.": "New Jersey",
    "NM": "New Mexico", "N.M.": "New Mexico",
    "NY": "New York", "N.Y.": "New York",
    "NC": "North Carolina", "N.C.": "North Carolina",
    "ND": "North Dakota", "N.D.": "North Dakota",
    "OH": "Ohio",
    "OK": "Oklahoma", "Okla.": "Oklahoma",
    "OR": "Oregon", "Ore.": "Oregon",
    "PA": "Pennsylvania", "Pa.": "Pennsylvania",
    "RI": "Rhode Island", "R.I.": "Rhode Island",
    "SC": "South Carolina", "S.C.": "South Carolina",
    "SD": "South Dakota", "S.D.": "South Dakota",
    "TN": "Tennessee", "Tenn.": "Tennessee",
    "TX": "Texas", "Tex.": "Texas",
    "UT": "Utah",
    "VT": "Vermont", "Vt.": "Vermont",
    "VA": "Virginia", "Va.": "Virginia",
    "WA": "Washington", "Wash.": "Washington",
    "WV": "West Virginia", "W.Va.": "West Virginia",
    "WI": "Wisconsin", "Wis.": "Wisconsin",
    "WY": "Wyoming", "Wyo.": "Wyoming"
}

search_terms = list(state_abbr_to_full.keys()) + list(state_abbr_to_full.values())


def _compile_state_patterns():
    """Compile one ``(pattern, full_name)`` pair per distinct search term.

    Terms are the upper-cased abbreviations and full names, kept in the
    order they first appear while walking ``state_abbr_to_full`` (each
    abbreviation immediately followed by its full name). That order decides
    which state wins when an address matches several terms.
    """
    compiled = {}
    for abbr, full_name in state_abbr_to_full.items():
        for term in (abbr.upper(), full_name.upper()):
            if term not in compiled:
                # The term must start the text or follow whitespace, and be
                # followed by whitespace, '.', ',', ';' or the end of text.
                regex = re.compile(
                    r'(?:\s|^)' + re.escape(term) + r'(?:\s|\.|,|;|$)'
                )
                compiled[term] = (regex, full_name)
    return tuple(compiled.values())


# Built once at import time; get_country_state runs per row, so compiling
# the patterns inside it would repeat the same work for every address.
_STATE_PATTERNS = _compile_state_patterns()


def get_country_state(address: str) -> Optional[str]:
    """Return the full name of the first known US state found in ``address``.

    The address is upper-cased and tested against every abbreviation and
    full state name as a delimited token; the first term that matches, in
    table order, determines the result.

    NOTE(review): two-letter codes that are also ordinary words ("IN",
    "OR", "ME", "OK", "HI", ...) match wherever they appear as a standalone
    token, and table order rather than position in the address decides
    between several hits.

    :param address: free-text address.
    :return: the full state name, or None when ``address`` is empty/None or
        contains no known term.
    """
    if not address:
        return None
    address_upper = address.upper()
    for pattern, full_name in _STATE_PATTERNS:
        if pattern.search(address_upper):
            return full_name
    return None


if __name__ == '__main__':
    name = ' 326 GRAND ST9735535523PATERSON NJ 07505'
    print(get_country_state(name))