| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666 |
- #!/usr/bin/env /usr/bin/python3
- # -*- coding:utf-8 -*-
- import json
- import re
- from typing import List
- from pyspark.sql.functions import udf
- from pyspark.sql.types import *
# Characters stripped from entity names by get_clean_eng_ent_name() and
# remove_punctuation().  Each fragment below contributes its characters one
# by one, in order.  Several symbols appear twice; the repetition is harmless
# because the list is only used to delete characters.
# NOTE(review): despite the name, most entries here are ordinary ASCII /
# Latin-1 symbols -- confirm whether genuine full-width forms were intended.
full_width_character = list(
    ".,-()@?"       # basic punctuation
    "‘’“”"          # curly quotation marks
    "`#+!$|:/;*"    # first run of symbols
    "《》<>"         # angle-style brackets
    "`#+!$|:/;*"    # same run repeated
    "《》<>"
    "%^&_[]{}"
    "\\"            # a single backslash
    "~='"
    "±°«»µ¶·"
    "€£¥¢"          # currency signs
    "×÷±¬…"
    "→←↑↓↔⇒⇐"       # arrows
    "≈≠≤≥"          # comparison signs
)
# Second set of characters stripped from entity names by
# get_clean_eng_ent_name() and remove_punctuation().  Order and duplicates
# are kept as they were; the list is only used to delete characters, so
# repeated entries are harmless.
half_width_character = [
    '.', ',', '-', '(', ')', '@', '?',
    # The fifth entry of this row used to be an unterminated ''' literal;
    # it is restored as a plain apostrophe.
    "'", "'", '"', '"', "'",
    '#', '+', '!', '$', '|', ':', '/', ';', '*',
    '<', '>',
    "'",
    '#', '+', '!', '$', '|', ':', '/', ';', '*',
    '<', '>',
    '%', '^', '&', '_', '[', ']', '{', '}',
    # A lone backslash must be written as an escaped backslash; the previous
    # literal swallowed its own closing quote.
    '\\',
    '~', '=', "'",
    '±', '°', '«', '»', 'µ', '¶', '·',
    '€', '£', '¥', '¢',
    '×', '÷', '±', '¬', '…',
    '→', '←', '↑', '↓', '↔', '⇒', '⇐',
    '≈', '≠', '≤', '≥',
]
# Company-form suffixes removed by remove_tail_char().  The entries are listed
# from longest to shortest, so a long suffix such as 'groupcoltd' is tried
# before any shorter suffix it ends with ('coltd', 'ltd').  Keep that ordering
# when adding entries.
tail_character = """
    groupcompanylimited limitedpartnership corporationlimited
    researchinstitute liabilitycompany limitedcompany companylimited
    youxiangongsi incorporated shanghaiinc corporation
    groupcoltd companyltd shlimited colimited
    groupltd chinaltd chinainc
    factory corpltd company
    ptyltd agency office center
    coltd coinc c0ltd
    colt corp
    llc ltd
    co
""".split()
# Lower-case romanised place names that filter_china_ent() looks for as
# substrings of a name.  The identifier keeps its historical spelling
# ("chian") because filter_china_ent() refers to it by that name.
#
# Fix: the entry 'nanjing]' carried a stray bracket and therefore could never
# match a name containing "nanjing"; it is now 'nanjing'.
#
# NOTE(review): a few entries contain '´' (e.g. 'tai´an') or a space
# ('tong xian'); they only match input that still contains those characters
# -- confirm against how the names are normalised before filtering.
chian_ent_label = [
    'shanghai', 'peking', 'chongqing', 'tianjin', 'wuhan', 'harbin',
    'shenyang', 'guangzhou', 'chengdu', 'nanjing', 'changchun', 'xian',
    'dalian', 'qingdao', 'jinan', 'hangzhou', 'zhengzhou', 'shijiazhuang',
    'taiyuan', 'kunming', 'changsha', 'nanchang', 'fuzhou', 'lanzhou',
    'guiyang', 'ningbo', 'hefei', 'anshan', 'fushun', 'nanning',
    'zibo', 'qiqihar', 'jilin', 'tangshan', 'baotou', 'shenzhen',
    'hohhot', 'handan', 'wuxi', 'xuzhou', 'datong', 'yichun',
    'benxi', 'luoyang', 'suzhou', 'xining', 'huainan', 'jixi',
    'daqing', 'fuxin', 'xiamen', 'liuzhou', 'shantou', 'jinzhou',
    'mudanjiang', 'yinchuan', 'changzhou', 'zhangjiakou', 'dandong', 'hegang',
    'kaifeng', 'jiamusi', 'liaoyang', 'hengyang', 'baoding', 'hunjiang',
    'xinxiang', 'huangshi', 'haikou', 'yantai', 'bengbu', 'xiangtan',
    'weifang', 'wuhu', 'pingxiang', 'yingkou', 'anyang', 'panzhihua',
    'pingdingshan', 'xiangfan', 'zhuzhou', 'jiaozuo', 'wenzhou', 'zhangjiang',
    'zigong', 'shuangyashan', 'zaozhuang', 'yakeshi', 'yichang', 'zhenjiang',
    'huaibei', 'qinhuangdao', 'guilin', 'liupanshui', 'panjin', 'yangquan',
    'jinxi', 'liaoyuan', 'lianyungang', 'xianyang', 'tai´an', 'chifeng',
    'shaoguan', 'nantong', 'leshan', 'baoji', 'linyi', 'tonghua',
    'siping', 'changzhi', 'tengzhou', 'chaozhou', 'yangzhou', 'dongwan',
    'ma´anshan', 'foshan', 'yueyang', 'xingtai', 'changde', 'shihezi',
    'yancheng', 'jiujiang', 'dongying', 'shashi', 'xintai', 'jingdezhen',
    'tongchuan', 'zhongshan', 'shiyan', 'tieli', 'jining', 'wuhai',
    'mianyang', 'luzhou', 'zunyi', 'shizuishan', 'neijiang', 'tongliao',
    'tieling', 'wafangdian', 'anqing', 'shaoyang', 'laiwu', 'chengde',
    'tianshui', 'nanyang', 'cangzhou', 'yibin', 'huaiyin', 'dunhua',
    'yanji', 'jiangmen', 'tongling', 'suihua', 'gongziling', 'xiantao',
    'chaoyang', 'ganzhou', 'huzhou', 'baicheng', 'shangzi', 'yangjiang',
    'qitaihe', 'gejiu', 'jiangyin', 'hebi', 'jiaxing', 'wuzhou',
    'meihekou', 'xuchang', 'liaocheng', 'haicheng', 'qianjiang', 'baiyin',
    'bei´an', 'yixing', 'laizhou', 'qaramay', 'acheng', 'dezhou',
    'nanping', 'zhaoqing', 'beipiao', 'fengcheng', 'fuyu', 'xinyang',
    'dongtai', 'yuci', 'honghu', 'ezhou', 'heze', 'daxian',
    'linfen', 'tianmen', 'yiyang', 'quanzhou', 'rizhao', 'deyang',
    'guangyuan', 'changshu', 'zhangzhou', 'hailar', 'nanchong', 'jiutai',
    'zhaodong', 'shaoxing', 'fuyang', 'maoming', 'qujing', 'ghulja',
    'jiaohe', 'puyang', 'huadian', 'jiangyou', 'qashqar', 'anshun',
    'fuling', 'xinyu', 'hanzhong', 'danyang', 'chenzhou', 'xiaogan',
    'shangqiu', 'zhuhai', 'qingyuan', 'aqsu', 'xiaoshan', 'zaoyang',
    'xinghua', 'hami', 'huizhou', 'jinmen', 'sanming', 'ulanhot',
    'korla', 'wanxian', 'ruian', 'zhoushan', 'liangcheng', 'jiaozhou',
    'taizhou', 'taonan', 'pingdu', 'ji´an', 'longkou', 'langfang',
    'zhoukou', 'suining', 'yulin', 'jinhua', 'liu´an', 'shuangcheng',
    'suizhou', 'ankang', 'weinan', 'longjing', 'daan', 'lengshuijiang',
    'laiyang', 'xianning', 'dali', 'anda', 'jincheng', 'longyan',
    'xichang', 'wendeng', 'hailun', 'binzhou', 'linhe', 'wuwei',
    'duyun', 'mishan', 'shangrao', 'changji', 'meixian', 'yushu',
    'tiefa', 'huai´an', 'leiyang', 'zalantun', 'weihai', 'loudi',
    'qingzhou', 'qidong', 'huaihua', 'luohe', 'chuzhou', 'kaiyuan',
    'linqing', 'chaohu', 'laohekou', 'dujiangyan', 'zhumadian', 'linchuan',
    'jiaonan', 'sanmenxia', 'heyuan', 'manzhouli', 'lhasa', 'lianyuan',
    'kuytun', 'puqi', 'hongjiang', 'qinzhou', 'renqiu', 'yuyao',
    'guigang', 'kaili', 'yan´an', 'beihai', 'xuangzhou', 'quzhou',
    'yong´an', 'zixing', 'liyang', 'yizheng', 'yumen', 'liling',
    'yuncheng', 'shanwei', 'cixi', 'yuanjiang', 'bozhou', 'jinchang',
    'fuan', 'suqian', 'shishou', 'hengshui', 'danjiangkou', 'fujin',
    'sanya', 'guangshui', 'huangshan', 'xingcheng', 'zhucheng', 'kunshan',
    'haining', 'pingliang', 'fuqing', 'xinzhou', 'jieyang', 'zhangjiagang',
    'tong xian', 'yaan', 'emeishan', 'enshi', 'bose', 'yuzhou',
    'tumen', 'putian', 'linhai', 'shaowu', 'junan', 'huaying',
    'pingyi', 'huangyan',
]
# Phrases searched for by cut_tail_char_brazil(); the name is cut at the
# first occurrence of the first listed phrase that is found, so the order of
# this list matters.
brazil_tail_character_cut = [
    "industriais ltda", "brasil indstria", "e comercializacao",
    "brasil ltda", "industria", "eireli",
    "cia ltda", "ind e com", "brasil ltda epp",
    "importacao", "e comercio", "comercio",
    # 'sa' is disabled in this list.
    "do brasi", "brasil sa", "limitada",
    "ltda me", "ltda epp", "ltda",
]
# Trailing tokens that remove_tail_char_brazil() strips from the end of a name.
brazil_tail_character_remove = ["sa", "ltda", "casa"]
def get_clean_eng_ent_name(eng_name: str) -> str:
    """Return ``eng_name`` lower-cased with spaces and listed symbols removed.

    The name is lower-cased, every space character is dropped, and then each
    entry of ``full_width_character`` and ``half_width_character`` is deleted
    wherever it occurs.

    Args:
        eng_name: Raw entity name. ``None`` and ``''`` are accepted.

    Returns:
        The normalised name, or ``''`` when ``eng_name`` is falsy. The
        function never returns ``None``.
    """
    if not eng_name:
        return ''
    cleaned = eng_name.lower().replace(' ', '')
    # The entries are literal characters, so plain str.replace is enough;
    # no regular expression is needed.
    for symbol in (*full_width_character, *half_width_character):
        cleaned = cleaned.replace(symbol, '')
    return cleaned
def remove_tail_char(eng_name: str) -> str:
    """Strip one trailing company-form suffix from ``eng_name``.

    The entries of ``tail_character`` are tried in list order and only the
    first one that ``eng_name`` ends with is removed, so the order of that
    list decides which suffix wins when several would match.

    Args:
        eng_name: Entity name to trim. ``None`` and ``''`` are accepted.

    Returns:
        ``eng_name`` without the matched suffix, ``eng_name`` unchanged when
        no suffix matches, or ``''`` when ``eng_name`` is falsy. The function
        never returns ``None``.
    """
    if not eng_name:
        return ''
    for suffix in tail_character:
        if eng_name.endswith(suffix):
            return eng_name[:-len(suffix)]
    return eng_name
@udf(returnType=BooleanType())
def filter_china_ent(name_abb: str) -> bool:
    """Tell whether ``name_abb`` contains any entry of ``chian_ent_label``.

    The check is a plain substring test, so a label matches anywhere inside
    the name.

    Args:
        name_abb: Name to inspect; a falsy value (``None`` or ``''``) yields
            ``False``.

    Returns:
        ``True`` if at least one label occurs in ``name_abb``, else ``False``.
    """
    if not name_abb:
        return False
    return any(label in name_abb for label in chian_ent_label)
def cut_tail_char_brazil(eng_name: str) -> str:
    """Cut ``eng_name`` at the first known Brazilian company phrase.

    The entries of ``brazil_tail_character_cut`` are searched for in list
    order, case-insensitively and anywhere in the name. The first entry that
    is found decides the result: the text before its first occurrence is
    kept, provided that text is longer than five characters after stripping
    whitespace. Otherwise the name is returned unchanged. Later entries are
    not tried once one entry has matched.

    Args:
        eng_name: Entity name to shorten. ``None`` and ``''`` are accepted.

    Returns:
        The shortened name, the original name when nothing matches or the
        remainder would be too short, or ``''`` when ``eng_name`` is falsy.
    """
    if not eng_name:
        return ''
    for tail in brazil_tail_character_cut:
        # Raw f-string: "\s" in a normal string literal is an invalid escape
        # sequence.  re.escape makes sure each entry is matched literally.
        match = re.search(rf'{re.escape(tail)}\s*', eng_name,
                          flags=re.IGNORECASE)
        if match is None:
            continue
        ent_name_cut = eng_name[:match.start()].strip()
        # Refuse cuts that would leave five characters or fewer.
        return ent_name_cut if len(ent_name_cut) > 5 else eng_name
    return eng_name
def remove_punctuation(eng_name: str) -> str:
    """Return ``eng_name`` lower-cased with the listed symbols removed.

    Every entry of ``full_width_character`` and ``half_width_character`` is
    deleted wherever it occurs. Spaces are kept.

    Args:
        eng_name: Raw entity name. ``None`` and ``''`` are accepted.

    Returns:
        The cleaned name, or ``''`` when ``eng_name`` is falsy. The function
        never returns ``None``.
    """
    if not eng_name:
        return ''
    cleaned = eng_name.lower()
    # The entries are literal characters, so plain str.replace is enough;
    # no regular expression is needed.
    for symbol in (*full_width_character, *half_width_character):
        cleaned = cleaned.replace(symbol, '')
    return cleaned
def remove_tail_char_brazil(eng_name: str) -> str:
    """Strip one trailing Brazilian company token and remove all spaces.

    At most one entry of ``brazil_tail_character_remove`` is removed from the
    end of the name. Candidates are tried longest first, so an entry such as
    ``'casa'`` is not shadowed by a shorter entry (``'sa'``) that is itself a
    suffix of it. Spaces are removed from the result afterwards.

    Args:
        eng_name: Entity name to trim. ``None`` and ``''`` are accepted.

    Returns:
        The trimmed, space-free name, or ``''`` when ``eng_name`` is falsy.
    """
    if not eng_name:
        return ''
    # sorted() is stable, so entries of equal length keep their list order.
    for suffix in sorted(brazil_tail_character_remove, key=len, reverse=True):
        if eng_name.endswith(suffix):
            eng_name = eng_name[:-len(suffix)]
            break
    return eng_name.replace(' ', '')
if __name__ == '__main__':
    # Manual smoke check: print the result of remove_tail_char_brazil() for
    # one sample name.
    sample_name = 'ABC ltda epp industriais ltdaltda me'
    print(remove_tail_char_brazil(sample_name))
|