#!/usr/bin/env /usr/bin/python3
# -*- coding:utf-8 -*-
"""Helpers for normalising English / Brazilian company names.

The module provides plain string helpers (punctuation stripping, legal-form
suffix removal) plus one Spark UDF, ``filter_china_ent``, which flags names
containing a Chinese city label.
"""
import json
import re
from typing import List

from pyspark.sql.functions import udf
from pyspark.sql.types import *

# Characters deleted from names by get_clean_eng_ent_name / remove_punctuation.
# NOTE(review): most entries here are plain ASCII even though the list is
# called "full width" -- presumably the original full-width forms were
# normalised away at some point; confirm against the upstream file.
full_width_character: List[str] = [
    '.', ',', '-', '(', ')', '@', '?', '‘', '’', '“', '”', '`', '#', '+',
    '!', '$', '|', ':', '/', ';', '*', '《', '》', '<', '>', '`', '#', '+',
    '!', '$', '|', ':', '/', ';', '*', '《', '》', '<', '>', '%', '^', '&',
    '_', '[', ']', '{', '}', '\\', '~', '=', "'", '±', '°', '«', '»', 'µ',
    '¶', '·', '€', '£', '¥', '¢', '×', '÷', '±', '¬', '…', '→', '←', '↑',
    '↓', '↔', '⇒', '⇐', '≈', '≠', '≤', '≥',
]

# Second deletion list, applied after full_width_character.
# The apostrophe and backslash entries were previously written as ''' and
# '\' (both invalid string literals); they are spelled "'" and '\\' here.
half_width_character: List[str] = [
    '.', ',', '-', '(', ')', '@', '?', "'", "'", '"', '"', "'", '#', '+',
    '!', '$', '|', ':', '/', ';', '*', '<', '>', "'", '#', '+', '!', '$',
    '|', ':', '/', ';', '*', '<', '>', '%', '^', '&', '_', '[', ']', '{',
    '}', '\\', '~', '=', "'", '±', '°', '«', '»', 'µ', '¶', '·', '€', '£',
    '¥', '¢', '×', '÷', '±', '¬', '…', '→', '←', '↑', '↓', '↔', '⇒', '⇐',
    '≈', '≠', '≤', '≥',
]

# Legal-form suffixes stripped by remove_tail_char. The list is ordered from
# longest to shortest, so the first match is also the longest one.
tail_character: List[str] = [
    'groupcompanylimited', 'limitedpartnership', 'corporationlimited',
    'researchinstitute', 'liabilitycompany', 'limitedcompany',
    'companylimited', 'youxiangongsi', 'incorporated', 'shanghaiinc',
    'corporation', 'groupcoltd', 'companyltd', 'shlimited', 'colimited',
    'groupltd', 'chinaltd', 'chinainc', 'factory', 'corpltd', 'company',
    'ptyltd', 'agency', 'office', 'center', 'coltd', 'coinc', 'c0ltd',
    'colt', 'corp', 'llc', 'ltd', 'co',
]

# City labels used by filter_china_ent (substring match).
# NOTE(review): several entries contain characters ('´', ' ') that the
# cleaning helpers in this module do not produce or would strip; confirm
# what form ``name_abb`` has when it reaches filter_china_ent.
chian_ent_label: List[str] = [
    'shanghai', 'peking', 'chongqing', 'tianjin', 'wuhan', 'harbin',
    'shenyang', 'guangzhou', 'chengdu', 'nanjing', 'changchun', 'xian',
    'dalian', 'qingdao', 'jinan', 'hangzhou', 'zhengzhou', 'shijiazhuang',
    'taiyuan', 'kunming', 'changsha', 'nanchang', 'fuzhou', 'lanzhou',
    'guiyang', 'ningbo', 'hefei', 'anshan', 'fushun', 'nanning', 'zibo',
    'qiqihar', 'jilin', 'tangshan', 'baotou', 'shenzhen', 'hohhot',
    'handan', 'wuxi', 'xuzhou', 'datong', 'yichun', 'benxi', 'luoyang',
    'suzhou', 'xining', 'huainan', 'jixi', 'daqing', 'fuxin', 'xiamen',
    'liuzhou', 'shantou', 'jinzhou', 'mudanjiang', 'yinchuan', 'changzhou',
    'zhangjiakou', 'dandong', 'hegang', 'kaifeng', 'jiamusi', 'liaoyang',
    'hengyang', 'baoding', 'hunjiang', 'xinxiang', 'huangshi', 'haikou',
    'yantai', 'bengbu', 'xiangtan', 'weifang', 'wuhu', 'pingxiang',
    'yingkou', 'anyang', 'panzhihua', 'pingdingshan', 'xiangfan', 'zhuzhou',
    'jiaozuo', 'wenzhou', 'zhangjiang', 'zigong', 'shuangyashan',
    'zaozhuang', 'yakeshi', 'yichang', 'zhenjiang', 'huaibei',
    'qinhuangdao', 'guilin', 'liupanshui', 'panjin', 'yangquan', 'jinxi',
    'liaoyuan', 'lianyungang', 'xianyang', 'tai´an', 'chifeng', 'shaoguan',
    'nantong', 'leshan', 'baoji', 'linyi', 'tonghua', 'siping', 'changzhi',
    'tengzhou', 'chaozhou', 'yangzhou', 'dongwan', 'ma´anshan', 'foshan',
    'yueyang', 'xingtai', 'changde', 'shihezi', 'yancheng', 'jiujiang',
    'dongying', 'shashi', 'xintai', 'jingdezhen', 'tongchuan', 'zhongshan',
    'shiyan', 'tieli', 'jining', 'wuhai', 'mianyang', 'luzhou', 'zunyi',
    'shizuishan', 'neijiang', 'tongliao', 'tieling', 'wafangdian', 'anqing',
    'shaoyang', 'laiwu', 'chengde', 'tianshui', 'nanyang', 'cangzhou',
    'yibin', 'huaiyin', 'dunhua', 'yanji', 'jiangmen', 'tongling', 'suihua',
    'gongziling', 'xiantao', 'chaoyang', 'ganzhou', 'huzhou', 'baicheng',
    'shangzi', 'yangjiang', 'qitaihe', 'gejiu', 'jiangyin', 'hebi',
    'jiaxing', 'wuzhou', 'meihekou', 'xuchang', 'liaocheng', 'haicheng',
    'qianjiang', 'baiyin', 'bei´an', 'yixing', 'laizhou', 'qaramay',
    'acheng', 'dezhou', 'nanping', 'zhaoqing', 'beipiao', 'fengcheng',
    'fuyu', 'xinyang', 'dongtai', 'yuci', 'honghu', 'ezhou', 'heze',
    'daxian', 'linfen', 'tianmen', 'yiyang', 'quanzhou', 'rizhao', 'deyang',
    'guangyuan', 'changshu', 'zhangzhou', 'hailar', 'nanchong', 'jiutai',
    'zhaodong', 'shaoxing', 'fuyang', 'maoming', 'qujing', 'ghulja',
    'jiaohe', 'puyang', 'huadian', 'jiangyou', 'qashqar', 'anshun',
    'fuling', 'xinyu', 'hanzhong', 'danyang', 'chenzhou', 'xiaogan',
    'shangqiu', 'zhuhai', 'qingyuan', 'aqsu', 'xiaoshan', 'zaoyang',
    'xinghua', 'hami',
    'huizhou', 'jinmen', 'sanming', 'ulanhot', 'korla', 'wanxian', 'ruian',
    'zhoushan', 'liangcheng', 'jiaozhou', 'taizhou', 'taonan', 'pingdu',
    'ji´an', 'longkou', 'langfang', 'zhoukou', 'suining', 'yulin', 'jinhua',
    'liu´an', 'shuangcheng', 'suizhou', 'ankang', 'weinan', 'longjing',
    'daan', 'lengshuijiang', 'laiyang', 'xianning', 'dali', 'anda',
    'jincheng', 'longyan', 'xichang', 'wendeng', 'hailun', 'binzhou',
    'linhe', 'wuwei', 'duyun', 'mishan', 'shangrao', 'changji', 'meixian',
    'yushu', 'tiefa', 'huai´an', 'leiyang', 'zalantun', 'weihai', 'loudi',
    'qingzhou', 'qidong', 'huaihua', 'luohe', 'chuzhou', 'kaiyuan',
    'linqing', 'chaohu', 'laohekou', 'dujiangyan', 'zhumadian', 'linchuan',
    'jiaonan', 'sanmenxia', 'heyuan', 'manzhouli', 'lhasa', 'lianyuan',
    'kuytun', 'puqi', 'hongjiang', 'qinzhou', 'renqiu', 'yuyao', 'guigang',
    'kaili', 'yan´an', 'beihai', 'xuangzhou', 'quzhou', 'yong´an', 'zixing',
    'liyang', 'yizheng', 'yumen', 'liling', 'yuncheng', 'shanwei', 'cixi',
    'yuanjiang', 'bozhou', 'jinchang', 'fuan', 'suqian', 'shishou',
    'hengshui', 'danjiangkou', 'fujin', 'sanya', 'guangshui', 'huangshan',
    'xingcheng', 'zhucheng', 'kunshan', 'haining', 'pingliang', 'fuqing',
    'xinzhou', 'jieyang', 'zhangjiagang', 'tong xian', 'yaan', 'emeishan',
    'enshi', 'bose', 'yuzhou', 'tumen', 'putian', 'linhai', 'shaowu',
    'junan', 'huaying', 'pingyi', 'huangyan',
]

# Markers searched (case-insensitively, in this order) by
# cut_tail_char_brazil; everything from the first hit onward is cut off.
brazil_tail_character_cut: List[str] = [
    'industriais ltda',
    'brasil indstria',
    'e comercializacao',
    'brasil ltda',
    'industria',
    'eireli',
    'cia ltda',
    'ind e com',
    'brasil ltda epp',
    'importacao',
    'e comercio',
    'comercio',
    # 'sa',  # disabled here; 'sa' is listed in brazil_tail_character_remove
    'do brasi',
    'brasil sa',
    'limitada',
    'ltda me',
    'ltda epp',
    'ltda',
]

# Suffixes stripped from the very end of a name by remove_tail_char_brazil.
brazil_tail_character_remove: List[str] = [
    'sa',
    'ltda',
    'casa',
]

# One-pass deletion table covering every character in both punctuation lists.
# All list entries are single characters, so joining them is loss-free.
_PUNCTUATION_TABLE = str.maketrans(
    '', '', ''.join(full_width_character + half_width_character)
)

# Compiled once at import time. re.escape keeps each marker literal; the
# markers are matched anywhere in the name, ignoring case.
_BRAZIL_CUT_PATTERNS = tuple(
    re.compile(re.escape(tail), flags=re.IGNORECASE)
    for tail in brazil_tail_character_cut
)


def get_clean_eng_ent_name(eng_name: str) -> str:
    """Return *eng_name* lower-cased with spaces and punctuation removed.

    :param eng_name: raw company name; may be ``None`` or empty.
    :return: the cleaned name, or ``''`` when *eng_name* is falsy.
    """
    if not eng_name:
        return ''
    return eng_name.lower().replace(' ', '').translate(_PUNCTUATION_TABLE)


def remove_tail_char(eng_name: str) -> str:
    """Strip the first matching ``tail_character`` suffix from *eng_name*.

    At most one suffix is removed. A name that consists solely of a suffix
    becomes ``''``.

    :param eng_name: cleaned (lower-case, space-free) company name.
    :return: the name without its suffix, or ``''`` when *eng_name* is falsy.
    """
    if not eng_name:
        return ''
    for suffix in tail_character:
        if eng_name.endswith(suffix):
            return eng_name[:-len(suffix)]
    return eng_name


@udf(returnType=BooleanType())
def filter_china_ent(name_abb: str) -> bool:
    """Spark UDF: tell whether *name_abb* contains any Chinese city label.

    :param name_abb: company name to inspect; may be ``None`` or empty.
    :return: ``True`` if any entry of ``chian_ent_label`` is a substring of
        *name_abb*, otherwise ``False`` (including for falsy input).
    """
    if not name_abb:
        return False
    return any(label in name_abb for label in chian_ent_label)


def cut_tail_char_brazil(eng_name: str) -> str:
    """Cut *eng_name* at the first ``brazil_tail_character_cut`` marker found.

    Markers are tried in list order; only the first marker that occurs in
    the name is considered. The text before it (stripped) is returned when
    it is longer than 5 characters, otherwise the name is returned unchanged.

    :param eng_name: company name; may be ``None`` or empty.
    :return: the shortened name, the original name, or ``''`` for falsy input.
    """
    if not eng_name:
        return ''
    for pattern in _BRAZIL_CUT_PATTERNS:
        match = pattern.search(eng_name)
        if match:
            ent_name_cut = eng_name[:match.start()].strip()
            # Too-short remainders are treated as over-cutting: keep the
            # full name instead.
            return ent_name_cut if len(ent_name_cut) > 5 else eng_name
    return eng_name


def remove_punctuation(eng_name: str) -> str:
    """Return *eng_name* lower-cased with punctuation removed (spaces kept).

    :param eng_name: raw company name; may be ``None`` or empty.
    :return: the cleaned name, or ``''`` when *eng_name* is falsy.
    """
    if not eng_name:
        return ''
    return eng_name.lower().translate(_PUNCTUATION_TABLE)


def remove_tail_char_brazil(eng_name: str) -> str:
    """Strip one ``brazil_tail_character_remove`` suffix and drop all spaces.

    Suffixes are tried longest first so that ``'casa'`` is not shadowed by
    the shorter ``'sa'`` (every name ending in ``'casa'`` also ends in
    ``'sa'``).

    :param eng_name: company name; may be ``None`` or empty.
    :return: the name without its suffix and without spaces, or ``''`` when
        *eng_name* is falsy.
    """
    if not eng_name:
        return ''
    for suffix in sorted(brazil_tail_character_remove, key=len, reverse=True):
        if eng_name.endswith(suffix):
            return eng_name[:-len(suffix)].replace(' ', '')
    return eng_name.replace(' ', '')


if __name__ == '__main__':
    a = 'ABC ltda epp industriais ltdaltda me'
    print(remove_tail_char_brazil(a))