spark_eng_ent_name_clean.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666
  1. #!/usr/bin/env /usr/bin/python3
  2. # -*- coding:utf-8 -*-
  3. import json
  4. import re
  5. from typing import List
  6. from pyspark.sql.functions import udf
  7. from pyspark.sql.types import *
  8. full_width_character = ['.',
  9. ',',
  10. '-',
  11. '(',
  12. ')',
  13. '@',
  14. '?',
  15. '‘',
  16. '’',
  17. '“',
  18. '”',
  19. '`',
  20. '#',
  21. '+',
  22. '!',
  23. '$',
  24. '|',
  25. ':',
  26. '/',
  27. ';',
  28. '*',
  29. '《',
  30. '》',
  31. '<',
  32. '>',
  33. '`',
  34. '#',
  35. '+',
  36. '!',
  37. '$',
  38. '|',
  39. ':',
  40. '/',
  41. ';',
  42. '*',
  43. '《',
  44. '》',
  45. '<',
  46. '>',
  47. '%',
  48. '^',
  49. '&',
  50. '_',
  51. '[',
  52. ']',
  53. '{',
  54. '}',
  55. '\\',
  56. '~',
  57. '=',
  58. "'",
  59. '±',
  60. '°',
  61. '«',
  62. '»',
  63. 'µ',
  64. '¶',
  65. '·',
  66. '€',
  67. '£',
  68. '¥',
  69. '¢',
  70. '×',
  71. '÷',
  72. '±',
  73. '¬',
  74. '…',
  75. '→',
  76. '←',
  77. '↑',
  78. '↓',
  79. '↔',
  80. '⇒',
  81. '⇐',
  82. '≈',
  83. '≠',
  84. '≤',
  85. '≥'
  86. ]
  87. half_width_character = [
  88. '.',
  89. ',',
  90. '-',
  91. '(',
  92. ')',
  93. '@',
  94. '?',
  95. "'",
  96. "'",
  97. '"',
  98. '"',
  99. ''',
  100. '#',
  101. '+',
  102. '!',
  103. '$',
  104. '|',
  105. ':',
  106. '/',
  107. ';',
  108. '*',
  109. '<',
  110. '>',
  111. "'",
  112. '#',
  113. '+',
  114. '!',
  115. '$',
  116. '|',
  117. ':',
  118. '/',
  119. ';',
  120. '*',
  121. '<',
  122. '>',
  123. '%',
  124. '^',
  125. '&',
  126. '_',
  127. '[',
  128. ']',
  129. '{',
  130. '}',
  131. '\',
  132. '~',
  133. '=',
  134. "'",
  135. '±',
  136. '°',
  137. '«',
  138. '»',
  139. 'µ',
  140. '¶',
  141. '·',
  142. '€',
  143. '£',
  144. '¥',
  145. '¢',
  146. '×',
  147. '÷',
  148. '±',
  149. '¬',
  150. '…',
  151. '→',
  152. '←',
  153. '↑',
  154. '↓',
  155. '↔',
  156. '⇒',
  157. '⇐',
  158. '≈',
  159. '≠',
  160. '≤',
  161. '≥'
  162. ]
  163. tail_character = ['groupcompanylimited',
  164. 'limitedpartnership',
  165. 'corporationlimited',
  166. 'researchinstitute',
  167. 'liabilitycompany',
  168. 'limitedcompany',
  169. 'companylimited',
  170. 'youxiangongsi',
  171. 'incorporated',
  172. 'shanghaiinc',
  173. 'corporation',
  174. 'groupcoltd',
  175. 'companyltd',
  176. 'shlimited',
  177. 'colimited',
  178. 'groupltd',
  179. 'chinaltd',
  180. 'chinainc',
  181. 'factory',
  182. 'corpltd',
  183. 'company',
  184. 'ptyltd',
  185. 'agency',
  186. 'office',
  187. 'center',
  188. 'coltd',
  189. 'coinc',
  190. 'c0ltd',
  191. 'colt',
  192. 'corp',
  193. 'llc',
  194. 'ltd',
  195. 'co',
  196. ]
  197. chian_ent_label = [
  198. 'shanghai',
  199. 'peking',
  200. 'chongqing',
  201. 'tianjin',
  202. 'wuhan',
  203. 'harbin',
  204. 'shenyang',
  205. 'guangzhou',
  206. 'chengdu',
  207. 'nanjing]',
  208. 'changchun',
  209. 'xian',
  210. 'dalian',
  211. 'qingdao',
  212. 'jinan',
  213. 'hangzhou',
  214. 'zhengzhou',
  215. 'shijiazhuang',
  216. 'taiyuan',
  217. 'kunming',
  218. 'changsha',
  219. 'nanchang',
  220. 'fuzhou',
  221. 'lanzhou',
  222. 'guiyang',
  223. 'ningbo',
  224. 'hefei',
  225. 'anshan',
  226. 'fushun',
  227. 'nanning',
  228. 'zibo',
  229. 'qiqihar',
  230. 'jilin',
  231. 'tangshan',
  232. 'baotou',
  233. 'shenzhen',
  234. 'hohhot',
  235. 'handan',
  236. 'wuxi',
  237. 'xuzhou',
  238. 'datong',
  239. 'yichun',
  240. 'benxi',
  241. 'luoyang',
  242. 'suzhou',
  243. 'xining',
  244. 'huainan',
  245. 'jixi',
  246. 'daqing',
  247. 'fuxin',
  248. 'xiamen',
  249. 'liuzhou',
  250. 'shantou',
  251. 'jinzhou',
  252. 'mudanjiang',
  253. 'yinchuan',
  254. 'changzhou',
  255. 'zhangjiakou',
  256. 'dandong',
  257. 'hegang',
  258. 'kaifeng',
  259. 'jiamusi',
  260. 'liaoyang',
  261. 'hengyang',
  262. 'baoding',
  263. 'hunjiang',
  264. 'xinxiang',
  265. 'huangshi',
  266. 'haikou',
  267. 'yantai',
  268. 'bengbu',
  269. 'xiangtan',
  270. 'weifang',
  271. 'wuhu',
  272. 'pingxiang',
  273. 'yingkou',
  274. 'anyang',
  275. 'panzhihua',
  276. 'pingdingshan',
  277. 'xiangfan',
  278. 'zhuzhou',
  279. 'jiaozuo',
  280. 'wenzhou',
  281. 'zhangjiang',
  282. 'zigong',
  283. 'shuangyashan',
  284. 'zaozhuang',
  285. 'yakeshi',
  286. 'yichang',
  287. 'zhenjiang',
  288. 'huaibei',
  289. 'qinhuangdao',
  290. 'guilin',
  291. 'liupanshui',
  292. 'panjin',
  293. 'yangquan',
  294. 'jinxi',
  295. 'liaoyuan',
  296. 'lianyungang',
  297. 'xianyang',
  298. 'tai´an',
  299. 'chifeng',
  300. 'shaoguan',
  301. 'nantong',
  302. 'leshan',
  303. 'baoji',
  304. 'linyi',
  305. 'tonghua',
  306. 'siping',
  307. 'changzhi',
  308. 'tengzhou',
  309. 'chaozhou',
  310. 'yangzhou',
  311. 'dongwan',
  312. 'ma´anshan',
  313. 'foshan',
  314. 'yueyang',
  315. 'xingtai',
  316. 'changde',
  317. 'shihezi',
  318. 'yancheng',
  319. 'jiujiang',
  320. 'dongying',
  321. 'shashi',
  322. 'xintai',
  323. 'jingdezhen',
  324. 'tongchuan',
  325. 'zhongshan',
  326. 'shiyan',
  327. 'tieli',
  328. 'jining',
  329. 'wuhai',
  330. 'mianyang',
  331. 'luzhou',
  332. 'zunyi',
  333. 'shizuishan',
  334. 'neijiang',
  335. 'tongliao',
  336. 'tieling',
  337. 'wafangdian',
  338. 'anqing',
  339. 'shaoyang',
  340. 'laiwu',
  341. 'chengde',
  342. 'tianshui',
  343. 'nanyang',
  344. 'cangzhou',
  345. 'yibin',
  346. 'huaiyin',
  347. 'dunhua',
  348. 'yanji',
  349. 'jiangmen',
  350. 'tongling',
  351. 'suihua',
  352. 'gongziling',
  353. 'xiantao',
  354. 'chaoyang',
  355. 'ganzhou',
  356. 'huzhou',
  357. 'baicheng',
  358. 'shangzi',
  359. 'yangjiang',
  360. 'qitaihe',
  361. 'gejiu',
  362. 'jiangyin',
  363. 'hebi',
  364. 'jiaxing',
  365. 'wuzhou',
  366. 'meihekou',
  367. 'xuchang',
  368. 'liaocheng',
  369. 'haicheng',
  370. 'qianjiang',
  371. 'baiyin',
  372. 'bei´an',
  373. 'yixing',
  374. 'laizhou',
  375. 'qaramay',
  376. 'acheng',
  377. 'dezhou',
  378. 'nanping',
  379. 'zhaoqing',
  380. 'beipiao',
  381. 'fengcheng',
  382. 'fuyu',
  383. 'xinyang',
  384. 'dongtai',
  385. 'yuci',
  386. 'honghu',
  387. 'ezhou',
  388. 'heze',
  389. 'daxian',
  390. 'linfen',
  391. 'tianmen',
  392. 'yiyang',
  393. 'quanzhou',
  394. 'rizhao',
  395. 'deyang',
  396. 'guangyuan',
  397. 'changshu',
  398. 'zhangzhou',
  399. 'hailar',
  400. 'nanchong',
  401. 'jiutai',
  402. 'zhaodong',
  403. 'shaoxing',
  404. 'fuyang',
  405. 'maoming',
  406. 'qujing',
  407. 'ghulja',
  408. 'jiaohe',
  409. 'puyang',
  410. 'huadian',
  411. 'jiangyou',
  412. 'qashqar',
  413. 'anshun',
  414. 'fuling',
  415. 'xinyu',
  416. 'hanzhong',
  417. 'danyang',
  418. 'chenzhou',
  419. 'xiaogan',
  420. 'shangqiu',
  421. 'zhuhai',
  422. 'qingyuan',
  423. 'aqsu',
  424. 'xiaoshan',
  425. 'zaoyang',
  426. 'xinghua',
  427. 'hami',
  428. 'huizhou',
  429. 'jinmen',
  430. 'sanming',
  431. 'ulanhot',
  432. 'korla',
  433. 'wanxian',
  434. 'ruian',
  435. 'zhoushan',
  436. 'liangcheng',
  437. 'jiaozhou',
  438. 'taizhou',
  439. 'taonan',
  440. 'pingdu',
  441. 'ji´an',
  442. 'longkou',
  443. 'langfang',
  444. 'zhoukou',
  445. 'suining',
  446. 'yulin',
  447. 'jinhua',
  448. 'liu´an',
  449. 'shuangcheng',
  450. 'suizhou',
  451. 'ankang',
  452. 'weinan',
  453. 'longjing',
  454. 'daan',
  455. 'lengshuijiang',
  456. 'laiyang',
  457. 'xianning',
  458. 'dali',
  459. 'anda',
  460. 'jincheng',
  461. 'longyan',
  462. 'xichang',
  463. 'wendeng',
  464. 'hailun',
  465. 'binzhou',
  466. 'linhe',
  467. 'wuwei',
  468. 'duyun',
  469. 'mishan',
  470. 'shangrao',
  471. 'changji',
  472. 'meixian',
  473. 'yushu',
  474. 'tiefa',
  475. 'huai´an',
  476. 'leiyang',
  477. 'zalantun',
  478. 'weihai',
  479. 'loudi',
  480. 'qingzhou',
  481. 'qidong',
  482. 'huaihua',
  483. 'luohe',
  484. 'chuzhou',
  485. 'kaiyuan',
  486. 'linqing',
  487. 'chaohu',
  488. 'laohekou',
  489. 'dujiangyan',
  490. 'zhumadian',
  491. 'linchuan',
  492. 'jiaonan',
  493. 'sanmenxia',
  494. 'heyuan',
  495. 'manzhouli',
  496. 'lhasa',
  497. 'lianyuan',
  498. 'kuytun',
  499. 'puqi',
  500. 'hongjiang',
  501. 'qinzhou',
  502. 'renqiu',
  503. 'yuyao',
  504. 'guigang',
  505. 'kaili',
  506. 'yan´an',
  507. 'beihai',
  508. 'xuangzhou',
  509. 'quzhou',
  510. 'yong´an',
  511. 'zixing',
  512. 'liyang',
  513. 'yizheng',
  514. 'yumen',
  515. 'liling',
  516. 'yuncheng',
  517. 'shanwei',
  518. 'cixi',
  519. 'yuanjiang',
  520. 'bozhou',
  521. 'jinchang',
  522. 'fuan',
  523. 'suqian',
  524. 'shishou',
  525. 'hengshui',
  526. 'danjiangkou',
  527. 'fujin',
  528. 'sanya',
  529. 'guangshui',
  530. 'huangshan',
  531. 'xingcheng',
  532. 'zhucheng',
  533. 'kunshan',
  534. 'haining',
  535. 'pingliang',
  536. 'fuqing',
  537. 'xinzhou',
  538. 'jieyang',
  539. 'zhangjiagang',
  540. 'tong xian',
  541. 'yaan',
  542. 'emeishan',
  543. 'enshi',
  544. 'bose',
  545. 'yuzhou',
  546. 'tumen',
  547. 'putian',
  548. 'linhai',
  549. 'shaowu',
  550. 'junan',
  551. 'huaying',
  552. 'pingyi',
  553. 'huangyan'
  554. ]
  555. brazil_tail_character_cut = [
  556. 'industriais ltda',
  557. 'brasil indstria',
  558. 'e comercializacao',
  559. 'brasil ltda',
  560. 'industria',
  561. 'eireli',
  562. 'cia ltda',
  563. 'ind e com',
  564. 'brasil ltda epp',
  565. 'importacao',
  566. 'e comercio',
  567. 'comercio',
  568. # 'sa',
  569. 'do brasi',
  570. 'brasil sa',
  571. 'limitada',
  572. 'ltda me',
  573. 'ltda epp',
  574. 'ltda'
  575. ]
  576. brazil_tail_character_remove = [
  577. 'sa',
  578. 'ltda',
  579. 'casa'
  580. ]
  581. def get_clean_eng_ent_name(eng_name: str) -> str or None:
  582. if eng_name:
  583. # eng_name = eng_name.lower()
  584. eng_name = eng_name.lower().replace(' ', '')
  585. for char in full_width_character:
  586. eng_name = re.sub(re.escape(char), '', eng_name)
  587. for char in half_width_character:
  588. eng_name = re.sub(re.escape(char), '', eng_name)
  589. return eng_name
  590. else:
  591. return ''
  592. def remove_tail_char(eng_name: str) -> str or None:
  593. if eng_name:
  594. for char in tail_character:
  595. if eng_name.endswith(char):
  596. return eng_name[:-len(char)]
  597. return eng_name
  598. else:
  599. return ''
  600. @udf(returnType=BooleanType())
  601. def filter_china_ent(name_abb: str) -> bool:
  602. if name_abb:
  603. for char in chian_ent_label:
  604. if char in name_abb:
  605. return True
  606. return False
  607. def cut_tail_char_brazil(eng_name: str) -> str or None:
  608. if eng_name:
  609. for tail in brazil_tail_character_cut:
  610. pattern = re.compile(f'{tail}\s*', flags=re.IGNORECASE)
  611. match = re.search(pattern, eng_name)
  612. if match:
  613. ent_name_cut = eng_name[:match.start()].strip()
  614. if len(ent_name_cut) > 5:
  615. return ent_name_cut
  616. else:
  617. return eng_name
  618. return eng_name
  619. return ''
  620. def remove_punctuation(eng_name: str) -> str or None:
  621. if eng_name:
  622. eng_name = eng_name.lower()
  623. for char in full_width_character:
  624. eng_name = re.sub(re.escape(char), '', eng_name)
  625. for char in half_width_character:
  626. eng_name = re.sub(re.escape(char), '', eng_name)
  627. return eng_name
  628. else:
  629. return ''
  630. def remove_tail_char_brazil(eng_name: str) -> str or None:
  631. if eng_name:
  632. for char in brazil_tail_character_remove:
  633. if eng_name.endswith(char):
  634. return eng_name[:-len(char)].replace(' ', '')
  635. return eng_name.replace(' ', '')
  636. else:
  637. return ''
  638. if __name__ == '__main__':
  639. a = 'ABC ltda epp industriais ltdaltda me'
  640. print(remove_tail_char_brazil(a))