# ent_clean_text.py
  1. import codecs
  2. import re
  3. import json
  4. from pyspark.sql.functions import udf
  5. from pyspark.sql.types import ArrayType, StringType
  6. from dw_base.spark.udf.customs.common_clean import clean_company_name
  7. url_bad_list = [
  8. 'www.,'
  9. , 'www.'
  10. , '/web:'
  11. , 'http:////'
  12. , 'http:///'
  13. , 'http://'
  14. , 'https://'
  15. , 'web:'
  16. , 'ww�w.'
  17. , 'www�'
  18. , 'w�'
  19. ]
  20. china_url_suff_list = [
  21. '.com.cn',
  22. '.com',
  23. '.cn',
  24. '.org.cn'
  25. '.org',
  26. '.net',
  27. '.info',
  28. ]
  29. # 俄罗斯域名后缀
  30. russia_url_suff_list = [
  31. # --通用顶级域名
  32. '.group',
  33. '.hu',
  34. '.it',
  35. '.com.cy',
  36. '.com',
  37. '.net',
  38. '.org',
  39. '.int',
  40. '.edu',
  41. '.tech',
  42. '.group',
  43. '.eco',
  44. '.eu',
  45. '.info',
  46. '.company'
  47. # --俄罗斯域名
  48. '.рф',
  49. '.ru',
  50. '.su',
  51. '.by',
  52. '.biz',
  53. '.pro',
  54. '.coop',
  55. '.aero',
  56. '.museum',
  57. '.xyz',
  58. '.online',
  59. '.site'
  60. ]
  61. # 中国工商URL清洗
  62. def clean_url_china(url):
  63. if url is not None:
  64. url = url.lower()
  65. if url in ['ltd.', '.ltd.']:
  66. return None
  67. if url.endswith(',ltd.'):
  68. return None
  69. for bad in url_bad_list:
  70. url = url.replace(bad, '')
  71. for suffix in china_url_suff_list:
  72. if suffix in url:
  73. return url[:url.index(suffix)] + suffix
  74. return url
  75. # 俄罗斯URL清洗
  76. def clean_url_russia(url):
  77. if url is not None and url != '':
  78. url = url.lower()
  79. for bad in url_bad_list:
  80. url = url.replace(bad, '')
  81. if '.' not in url:
  82. return None
  83. for suffix in russia_url_suff_list:
  84. if suffix in url:
  85. return url[:url.index(suffix)] + suffix
  86. return url
  87. # 美国工商URL清洗
  88. def clean_url_america(url):
  89. if url:
  90. url = url.lower()
  91. for bad in url_bad_list:
  92. url = url.replace(bad, '')
  93. if ':' in url:
  94. # 分割URL以获取域名部分
  95. parts = url.split(':', 1)
  96. url = parts[0] # 只保留端口号前的域名部分
  97. # 再次检查URL中是否包含斜杠,如果是,则只保留斜杠前的部分
  98. if '/' in url:
  99. parts = url.split('/', 1)
  100. url = parts[0]
  101. if re.search(r'(\d+\.\d+\.\d+\.\d+)', url):
  102. return None
  103. return url
  104. return None
  105. # 通用网址清洗规则
  106. def clean_url_common(url):
  107. if url:
  108. url = url.lower()
  109. for bad in url_bad_list:
  110. url = url.replace(bad, '')
  111. if not url:
  112. return None
  113. if '/' in url:
  114. parts = url.split('/', 1)
  115. return parts[0]
  116. else:
  117. return url
  118. return None
# URL cleaning smoke tests (kept commented out for reference)
# if __name__ == '__main__':
#     test_case_list = [
#         'https://www.ianshaw.biz/p/contact-management.php',
#         'https://charnleyfertilisers.co.uk/',
#         'https://nyulangone.org/doctors/1205925765/carol-dunetz?cid=syn_yext\u0026y_entity_id=1205925765-primary\u0026y_source=1_MjU0NTEyNzEtNDgzLWxvY2F0aW9uLndlYnNpdGU%3D',
#         'https://www.carolleviandcompany.it/',
#         'https://schrotthandel-heinen.de/',
#         'http://201.149.15.54:88/',
#         'http://190.107.176.73/~prodinwe/www2/inicio.html',
#         'https://findadoctor.atlantichealth.org/provider/Joseph+C+Lugo/1140352?unified=lugo\u0026sort=networks%2Crelevance\u0026_ga=2.142101431.428278081.1637589591-505885973.1636636554\u0026_gac=1.36491028.1637590169.EAIaIQobChMImKmH3JKs9AIVl4TICh2yrwEXEAAYASAAEgKlDvD_BwE'
#
#
#     ]
#     for url in test_case_list:
#         print(f'url: {url} ----> {clean_url_america(url)}')
  135. # 国家工商URL清洗
  136. def clean_url(country, url):
  137. if country == 'China':
  138. return clean_url_china(url)
  139. if country == 'Russia':
  140. return clean_url_russia(url)
  141. if country == 'America':
  142. return clean_url_america(url)
  143. return None
  144. # 越南电话要替换成分隔符的字符串
  145. vietnam_tel_split_list = [
  146. 'faxno'
  147. , 'fax-'
  148. , '-fax'
  149. , 'fax.'
  150. , 'fax'
  151. , 'tele'
  152. ]
  153. vietnam_tel_bad_list = [
  154. 'f'
  155. , 'awelexports@gmailcom'
  156. , 'm-'
  157. , 'axno'
  158. , 'ax'
  159. , 'no'
  160. , '(ext'
  161. , 'linhkt'
  162. , '.'
  163. , 'nhnh3'
  164. ]
  165. reverse_str_list = [
  166. '.',
  167. '/'
  168. ]
  169. # 字符串反转输出
  170. def reverse_str(str):
  171. if str:
  172. for str1 in reverse_str_list:
  173. if str1 in str:
  174. parts = str.split(str1)
  175. # 倒序排列分割后的部分
  176. reversed_parts = parts[::-1]
  177. # 使用join方法将倒序后的部分重新组合成字符串
  178. str = '-'.join(reversed_parts)
  179. return str
  180. return None
  181. # 英文和空格替换成''
  182. def replace_english_and_space(str):
  183. result = re.sub(r'[a-zA-Z\s]', '', str)
  184. return result
  185. # 数组元素去重
  186. def array_remove_duplicates(str):
  187. if str:
  188. str_array = str.split(',')
  189. unique_str = list(set(str_array))
  190. return ','.join(unique_str)
  191. return None
# Manual smoke test for array_remove_duplicates().  The '__main__1' guard is
# intentionally never true, so this block is disabled; rename the guard to
# '__main__' to run it.
if __name__ == '__main__1':
    test_case_list = [
        '[]',
        '[91220101123911541QCHN]',
        '[, 91220101123911541QCHN]'
    ]
    for arraystr in test_case_list:
        print(f'tel: {arraystr} ----> {array_remove_duplicates(arraystr)}')
# Pattern for names led by a 'dd.ddd.ddd' registry number.
company_name_pattern1 = r'(^[0-9]{2}\.[0-9]{3}\.[0-9]{3})(.*)' # 12.575.462 ALVARO PEREIRA DA SILVEIRA FILHO
# Pattern for names ending in a space plus a digit run (e.g. a CPF number).
company_name_pattern2 = r'.+( [0-9]+)$' # HEBE DE ABREU VILELA CPF 027116806149
# Company-name cleaning: strip a leading xx.xxx.xxx registry number or a long
# trailing digit run, then delegate to the shared clean_company_name() helper.
def clean_brazil_company_name(name):
    """Clean a Brazilian company name.

    - Names starting with a 'dd.ddd.ddd' code keep only the remainder.
    - Otherwise a trailing ' <digits>' group longer than 8 characters
      (i.e. 8+ digits, since group(1) includes the leading space) is
      removed.  NOTE(review): str.replace() removes the FIRST occurrence of
      that digit run, assumed to be the trailing one -- confirm.
    Returns None for empty/None input.
    """
    if name:
        namepattern1_match = re.search(company_name_pattern1, name)
        if namepattern1_match:
            # group(2) is everything after the leading registry number.
            namepattern1 = namepattern1_match.group(2)
            return clean_company_name(namepattern1)
        namepattern2_match = re.search(company_name_pattern2, name)
        if namepattern2_match:
            namepattern2 = namepattern2_match.group(1)
            if len(namepattern2) > 8:
                return clean_company_name(name.replace(namepattern2, ''))
        return clean_company_name(name)
    else:
        return None
  217. # 土耳其 ,分隔电话,如果少于10位,则置空
  218. def phone_clean_turkey(phone):
  219. if phone:
  220. # 将输入字符串分割成数组
  221. phone_arr = phone.split(',')
  222. # 过滤数组元素,长度不等10的元素置空
  223. phone_arr_new = [str for str in phone_arr if len(str) == 10]
  224. # 将过滤后的数组重新组合成字符串,如果没有元素则返回空字符串
  225. phone_str = ','.join(phone_arr_new) if phone_arr_new else None
  226. return phone_str
  227. return None
  228. # 土耳其 ,分隔传真,9开头11位,置空;0开头11位,删除0;1位和12为置空
  229. def fax_clean_turkey(fax):
  230. if fax:
  231. fax_len = len(fax)
  232. if fax_len == 10:
  233. return fax
  234. elif fax_len == 11 and fax.startswith('0'):
  235. return fax[1:]
  236. return None
# Manual smoke test for fax_clean_turkey() over representative phone and fax
# shapes.  Disabled: the '__main__1' guard never matches; rename to
# '__main__' to run it.
if __name__ == '__main__1':
    test_case_list = [
        # turkey-phone-alltype
        '4443361',
        '2164708444',
        '2122772674,4',
        '2123518966,67',
        '4449911,4441311',
        '2126944565,4444080',
        '214511936,2125037861',
        '2123225997,2123228911',
        '2165274671,2162663626,4441158',
        '2163782062,2163782649,2163787830',
        '',
        None,
        # turkey-fax-alltype
        '021648847322',
        '02164884732',
        '92164884732',
        '2164884732',
        '0'
    ]
    # NOTE(review): the loop variable shadows the builtin 'str'.
    for str in test_case_list:
        print(f'tel: {str} ----> {fax_clean_turkey(str)}')
  261. # 行业代码清洗
  262. pattern = r'\d{2}\.\d{2}\.\d{2}'
  263. def turkey_nicecode(nicecode):
  264. if nicecode:
  265. codes = re.findall(pattern, nicecode)
  266. result = ', '.join(codes)
  267. result = result.replace('.', '')
  268. return result
  269. return None
# Manual smoke test data for turkey_nicecode().  Doubly disabled: the
# '__main__1' guard never matches AND the print loop is commented out.
if __name__ == '__main__1':
    test_case_list = [
        '["10.71.01-Taze pastane ürünleri imalatı (yaş pasta, kuru pasta, poğaça, kek, börek, pay, turta, waffles vb.)"]',
        '["15.12.07-Deri, kösele, karma deri ve diğer malzemelerden bavul, el çantası, cüzdan, okul çantası, evrak çantası, deriden sigaralık, deri ayakkabı bağı, kişisel bakım, dikiş, vb. amaçlı seyahat seti, vb. ürünlerin imalatı"]',
        '["07.29.06-Krom madenciliği"]',
        '["35.12.13-Elektrik enerjisinin iletimi (elektrik üretim kaynağından dağıtım sistemine aktaran iletim sistemlerinin işletilmesi)","42.22.02-Enerji santralleri inşaatı (hidroelektrik santrali, termik santral, nükleer enerji üretim santralleri vb.)","35.11.19-Elektrik enerjisi üretimi"]'
    ]
    # for str in test_case_list:
    #     print(f'tel: {str} ----> {turkey_nicecode(str)}')
  279. email_pattern1 = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9]+@[a-zA-Z.]+' # CONTADOR@JANSENANDRE@HOTMAIL.COM
  280. email_pattern2 = r'.*@$' # XXXXXXXXX@XXXXX@ @@@@@@@@@@2 @@@@@@@@@@
  281. brazil_bad_email = [
  282. '@', '*', '-', '.', ','
  283. ]
  284. # 巴西邮箱清洗
  285. def email_clean_brazil(email):
  286. if email:
  287. email = email.lower().replace('@@', '@').replace(',.', '.').replace('.,', '.')
  288. for badstr in brazil_bad_email:
  289. if email.startswith(badstr) | email.endswith(badstr):
  290. return None
  291. if '.' not in email:
  292. return None
  293. if email == 'flr@flr.@bol.com.br':
  294. return None
  295. if email.count('@') == 1:
  296. email = email.replace(',', '.')
  297. if re.search(r'[a-zA-Z0-9]+[a-zA-Z0-9._%+-]+@[a-zA-Z0-9._%+-]+\.[a-zA-Z]*', email):
  298. return email
  299. if re.search(email_pattern1, email):
  300. return None
  301. email_pattern = re.compile(r'[a-zA-Z0-9]+[a-zA-Z0-9._%+-]+@[a-zA-Z0-9._%+-]+\.[a-zA-Z]*')
  302. emails = email_pattern.findall(email)
  303. return emails
  304. return None
# Smoke test for email_clean_brazil().  NOTE(review): unlike the other demo
# blocks this one uses the real '__main__' guard, so the prints DO run when
# the file is executed as a script (harmless under Spark, where the module
# is only imported).
if __name__ == '__main__':
    test_case_list = [
        'HUGO.SANSIL@GMAIL.COM E HUGO@SISTEMAFIEG.ORG.BR',
        "laaltenhofen@brturbo.com.br ou luialtenhofen@hotmail.com",
        "SANDRA_MMC@BOL.COM.BR NAIRMOTADIAS@HOTMAIL.COM",
        "fundesco@ig.com.br /e ou juliocesarcoelho@ig.com.br",
        "choco.mixgold@hotmail.com / ou elisangela.sena@gmail.com",
        "SALES@ZGC.COM / REPAIR@ZGC.COM / WWW.ZGC.COM",
        "veronica@beereayres.com.br veronicabeer@uol.com.br advocacia@beereayres.com.br",
        "emanoel@amazoniaim.aginaria@org.br",
        "emerson.pires@contabilidadepires@.com.br"
        , '@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@'
        , '@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@2'
        , 'XXXXXXXXXXXXX@XXXXX@'
        , '@'
        , 'lcregina@terra.com.br+phcontabil@brturb'
        , 'lcregina@terra.com.br, phcontabil@brturb'
        , 'abc.abc@abcabc@brturb'
        , 'alguizardi@hotmail.com -Terceiros.10@hotmail.com'
        , 'JURIDICO@LSCONTABILIDADE,COM.BR'
        , 'LUCIANO.KLEIMAN@B4WASTE.COM.,BR'
        , 'flr@flr.@bol.com.br'
        , 'aurenirrodrigues@ig,com.br'
        , 'kallfotosdigital@hotmailcom'
        , 'jurandireleicao2020@gmail'
        , ',,CICERO.BONFIM@HOTMAIL.COM'
        , ',japsjcampos@ig.combr'
        , 'FERNANDO@TWINFORMATICA.COM.BR<FERNANDO@TWINFORMATICA.COM.BR>'
    ]
    # NOTE(review): the loop variable shadows the builtin 'str'.
    for str in test_case_list:
        print(f'tel: {str} ----> {email_clean_brazil(str)}')
  336. def arr_str_to_str(str):
  337. # 检查输入字符串是否为空
  338. if str:
  339. str = str.replace('[]', '')
  340. if str:
  341. # 使用json.loads()解析JSON字符串,然后使用join将列表转换为字符串
  342. return ','.join(json.loads(str))
  343. # 如果输入为空,返回空字符串
  344. return None
# Manual smoke test for arr_str_to_str() (disabled via the '__main__1' guard).
if __name__ == '__main__1':
    test_case_list = [
        '[]',
        '["accounting","financial services"]',
        '["staffing & recruiting"]',
        '["management consulting","business consulting & services"]',
        '',
        None
    ]
    for arraystr in test_case_list:
        print(f'tel: {arraystr} ----> {arr_str_to_str(arraystr)}')
  356. bad_tel_part1 = r'[^0-9+]'
  357. bad_tel_part2 = re.compile(r'^(.*?)([^\d]+)$') # (r'^(.*?)([a-zA-Z\-\(\) ]+)$')
  358. def clean_tel_apollo(str):
  359. if str:
  360. # str = str.lower()
  361. # for bad_tel_str in bad_tel_list:
  362. # str= str.replace(bad_tel_str,'')
  363. clean_str1 = re.sub(bad_tel_part1, ' ', str)
  364. str = ' '.join(clean_str1.split())
  365. bad_match = bad_tel_part2.search(str)
  366. if bad_match:
  367. str = bad_match.group(1).strip()
  368. else:
  369. str = str.strip()
  370. # 判断位数
  371. cleane_str2 = re.sub(r'[^\d]', '', str)
  372. if len(cleane_str2) < 7:
  373. return None
  374. else:
  375. return str
  376. return None
# Manual smoke test for clean_tel_apollo() (disabled via '__main__1').
if __name__ == '__main__1':
    test_case_list = [
        '+1-866-344-7857 ext. 311',
        '(678)826-BUY1',
        '(844)800-BULL',
        '+ (373) 68 488 807 MDA',
        '++420 606 075 787 (Po - Pá)',
        '+1 412-281-4100 ext 212',
        '',
        None
    ]
    for str_tel in test_case_list:
        print(f'tel: {str_tel} ----> {clean_tel_apollo(str_tel)}')
  390. type_url = {
  391. "author": "tw.com/",
  392. "facebook": "facebook.com/",
  393. "google": "google.com/",
  394. "google|twcamp": "tw.com/",
  395. "instagram": "instagram.com/",
  396. "linkedin": "linkedin.com/",
  397. "pinterest": "pinterest.com/",
  398. "serp|twgr": "tw.com/",
  399. "tfw": "tw.com/",
  400. "tfw&screen_name=ferrespanola&tw_p=followbutton": "tw.com/",
  401. "twitter": "twitter.com/",
  402. "youtube": "youtube.com/",
  403. "crunchbase": "crunchbase.com/",
  404. "angellist": "angel.co/"
  405. }
  406. bad_url_list = [
  407. 'https:', 'https://www', 'www'
  408. ]
  409. def socialmedia_url(socialtype, url):
  410. if not url:
  411. return None
  412. # 检查类别是否存在于字典中
  413. if socialtype in type_url:
  414. url_split = type_url[socialtype]
  415. url = url.lower()
  416. if url_split in url:
  417. url_clean = url.split(url_split)[-1].rstrip('/|#>+-.;?@}')
  418. if url_clean in bad_url_list:
  419. return None
  420. else:
  421. return url_clean
  422. url = url.lower().rstrip('/|#>+-.;?@}')
  423. if url in bad_url_list:
  424. return None
  425. else:
  426. return url
# Manual smoke test for socialmedia_url() covering known types, unknown
# types, doubled URLs, junk tails and empty inputs (disabled via '__main__1').
if __name__ == '__main__1':
    test_case_list = [
        ("youtube", "https://youtube.com/user/BrotherCanadaEn"),
        ("facebook", "https://www.facebook.com/eastwesteng/"),
        ("google", "https://google.com/search?q=test"),
        ("author", "https://tw.com/SRAMroad?ref_src=twsrc"),
        ("tfw&screen_name=ferrespanola&tw_p=followbutton", "https://tw.com/search?q=test"),
        ("serp|twgr", "https://tw.com/search?q=test"),
        ("twitter", "https://twitter.com/#"),
        ("linkedin", "https://www.linkedin.com/in/meb-jsc/#"),
        ("instagram", "https://www.instagram.com/##############/"),
        ("facebook", "https://www.facebook.com/https://www.facebook.com/komlider38/"),
        ("pinterest", "https://www.pinterest.com/lampstore/https://www.pinterest.com/lampstore/"),
        ("linkedin",
         "https://www.linkedin.com/start/join?session_redirect=https://www.linkedin.com/company/swelect-energy-systems-ltd?trk=biz-companies-cym&source=D8E90337EA&trk=login_reg"),
        ("google", "https://twitter.com/search?q=test"),
        ("whatsapp", "919822025525"),
        ("nonexistent", "https://nonexistent.com/page"),
        ("", "919822025525"),
        ("twitter", "https://twitter.com/92342/3#4"),
        ("twitter", "https://twitter.com/@#dfw}kdn|"),
        ("twitter", "https://twitter.com/euroledwwwhttps:"),
        ("facebook", "https://facebook.com/alburoojrealestate/"),
        (None, ""),
        ("", None),
        (None, None)
    ]
    for socialtype, url in test_case_list:
        suffix = socialmedia_url(socialtype, url)
        print(f'category: {socialtype}, url: {url} ----> {suffix}')
  457. def hongkong_previous_name_clean(str):
  458. if str:
  459. if str.startswith('-- '):
  460. str = str[3:]
  461. else:
  462. str = str[12:]
  463. return str
  464. return None
# Manual smoke test for hongkong_previous_name_clean() (disabled via
# '__main__1').
if __name__ == '__main__1':
    test_case_list = [
        '-- PACIFIC PRODUCTS LIMITED AUSTRALIAN PRODUCTS LIMITED',
        # NOTE(review): a comma is missing after the next literal, so it is
        # concatenated with the following '' (harmless, likely unintended).
        '03-MAY-2013 Fuente Union Import And Export Limited 福恩特聯合進出口有限公司'
        '',
        None
    ]
    for str_tel in test_case_list:
        print(f':{str_tel}---->{hongkong_previous_name_clean(str_tel)}')
  474. # 英国爬虫匹配股份占比
  475. sharepercent_pattern = re.compile(r'\["ownership-of-shares-(.+?)-percent')
  476. def uk_sharepercent(str):
  477. if str:
  478. sharepercent_match = re.search(sharepercent_pattern, str)
  479. if sharepercent_match:
  480. sharepercent = sharepercent_match.group(1)
  481. return sharepercent
  482. else:
  483. return None
# Manual smoke test for uk_sharepercent() (disabled via '__main__1').
if __name__ == '__main__1':
    test_case_list = [
        '["ownership-of-shares-25-to-50-percent","voting-rights-25-to-50-percent"]',
        '["ownership-of-shares-more-than-25-percent-registered-overseas-entity"]',
        '',
        None
    ]
    for str_tel in test_case_list:
        print(f':{str_tel}---->{uk_sharepercent(str_tel)}')