spark_eng_ent_ctstel_clean.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386
  1. import codecs
  2. import re
  3. import json
  4. from pyspark.sql.functions import udf
  5. from pyspark.sql.types import ArrayType, StringType
  6. # 科学计数法转数字
  7. scientific_pattern = r'([0-9]*\.?[0-9]+)[eE]([-+]?[0-9]+)'
  8. def scientific_to_number(input_str):
  9. if input_str:
  10. match = re.match(scientific_pattern, input_str)
  11. if match:
  12. base_number = float(match.group(1))
  13. exponent = int(match.group(2))
  14. result = base_number * (10 ** exponent)
  15. return str(int(result))
  16. else:
  17. return input_str
  18. return None
  19. pattern_space = r'(?<!\d)\s+|\s+(?!\d)'
  20. # pattern_keep_space = r'(\d)\s+(\d)'
  21. # pattern_remove_space = r'([^\d])\s+([^\d])'
  22. # 判断电话分隔符,如果特定分隔符前后都是7位,则用@@分隔,方便后续炸开
  23. def judge_delimiter(tel_str):
  24. if tel_str:
  25. # 正则判断空格,只保留数字之前的空格
  26. tel_str=re.sub(pattern_space, '', tel_str)
  27. # 正则拆分
  28. parts = re.split(r'[/,\s@&;]+', tel_str)
  29. # 用于存储处理后的字符串
  30. new_parts = []
  31. # 遍历分割后的字符串列表
  32. for i in range(len(parts)):
  33. # 检查当前部分是否为空,如果是则跳过
  34. if not parts[i]:
  35. continue
  36. # 检查当前部分和下一个部分的长度是否都大于等于6
  37. if i < len(parts) - 1 and len(parts[i]) >= 6 and len(parts[i + 1]) >= 6:
  38. # 如果是,则将当前部分和下一个部分用@@连接
  39. new_parts.append(parts[i] + '@@')
  40. else:
  41. # 如果不是,则添加当前部分
  42. new_parts.append(parts[i]+' ')
  43. # 将处理后的字符串部分重新组合成一个字符串
  44. return ''.join(new_parts)
  45. return None
  46. if __name__ == '__main__1':
  47. test_case_list = [
  48. '666666 7777777',
  49. '262-255- 7177 // 273308256',
  50. 'abc 123 456 def' ,
  51. '',
  52. None
  53. ]
  54. for str_tel in test_case_list:
  55. print(f'tel: {str_tel} ---->{judge_delimiter(str_tel)}')
  56. # 判断电话或传真位数
  57. def judge_tel_length(str):
  58. if str:
  59. length_str = re.sub(r'[^\d]', '', str)
  60. if len(length_str) < 6:
  61. return None
  62. else:
  63. return str
  64. return None
  65. # 删除字符串首位特殊符号
  66. remove_chars = ' :/-;?@#>.,*'
  67. def clean_headtail(str):
  68. if str:
  69. remove_str = str.strip(remove_chars)
  70. str = remove_str.lstrip(')').rstrip('(')
  71. return str
  72. return None
  73. if __name__ == '__main__1':
  74. test_case_list = [
  75. '123-243',
  76. '(123345)',
  77. '(010)1(2)3345',
  78. '(010)1(2)334(5)',
  79. '(010)123345)',
  80. '(010)123345(',
  81. '(010123345',
  82. ')010123345',
  83. '472601(',
  84. '',
  85. None,
  86. '913207067724649993*'
  87. ]
  88. for str_tel in test_case_list:
  89. print(f':{str_tel}---->{clean_headtail(str_tel)}')
  90. tel_bad_list=[
  91. ':',
  92. ';',
  93. ',',
  94. '.',
  95. '?',
  96. '//',
  97. '()',
  98. '( )',
  99. '�'
  100. ]
  101. def col_tel_clean(tel_str):
  102. if tel_str:
  103. if 'e+' in tel_str.lower():
  104. tel_str = scientific_to_number(tel_str)
  105. cleaned_zero = re.sub(r'\.0+$', '', tel_str)
  106. for bad in tel_bad_list:
  107. cleaned_zero = cleaned_zero.replace(bad, ' ')
  108. clean_letter = re.sub(r'[a-zA-Z]', '', cleaned_zero)
  109. clean_headtail = clean_letter.lstrip('/-;?@#>').rstrip('/-;?@#>')
  110. clean_blank = re.sub(r'\s+', ' ', clean_headtail).strip()
  111. tel_str = judge_delimiter(clean_blank)
  112. if tel_str:
  113. # 判断位数
  114. length_str = re.sub(r'[^\d]', '', tel_str)
  115. if len(length_str) < 6:
  116. return None
  117. else:
  118. return tel_str
  119. return None
  120. return None
  121. if __name__ == '__main__1':
  122. test_case_list = [
  123. 'Fax: +1 780 468 9165',
  124. 'FAX.9545852544',
  125. 'FAX/5618446131',
  126. 'Fax : 833.338.8901',
  127. 'Fax/: 833.338.8901',
  128. '(615) 316-5100 // FAX (615) 31',
  129. 'TEL: 507-69828001',
  130. '6914 1002 TAX ID:200514854D',
  131. 'Fax No: +86 (0) 527.84495888',
  132. 'RUT:76.631.726-K',
  133. 'FAX. 41 32 392 51 07B>',
  134. 'FAX9545852544/46',
  135. 'FAXSIN FAX',
  136. 'LONGROnO 3871232',
  137. '6910500.0000',
  138. '6910500.0',
  139. '3.203177e+11',
  140. '3.19213916545e+11',
  141. '1230000',
  142. '5397-4880,5397-1333',
  143. '',
  144. None
  145. ]
  146. for str_tel in test_case_list:
  147. print(f'tel: {str_tel} ---->{col_tel_clean(str_tel)}')
  148. col_bad_email=[
  149. '@','*','-','.',','
  150. ]
  151. def col_email_clean(email):
  152. if email:
  153. email = email.lower().replace('@@', '@').replace(',.', '.').replace('.,', '.')
  154. for badstr in col_bad_email:
  155. if email.startswith(badstr)|email.endswith(badstr):
  156. email=email.replace(badstr,'')
  157. if '.' not in email:
  158. return None
  159. if email.count('@') == 1:
  160. email = email.replace(',', '.')
  161. # 标准邮箱
  162. if re.search(r'[a-zA-Z0-9]+[a-zA-Z0-9._%+-]+@[a-zA-Z0-9._%+-]+\.[a-zA-Z]*', email):
  163. return email
  164. if email.count('@') >= 2:
  165. # CONTADOR@JANSENANDRE@HOTMAIL.COM
  166. if re.search( r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9]+@[a-zA-Z.]+', email):
  167. return None
  168. # 标准邮箱
  169. email_pattern = re.compile(r'[a-zA-Z0-9]+[a-zA-Z0-9._%+-]+@[a-zA-Z0-9._%+-]+\.[a-zA-Z]{1,3}')
  170. email = email_pattern.findall(email)
  171. if email:
  172. return ','.join(email)
  173. return None
  174. if __name__ == '__main__1':
  175. test_case_list = [
  176. 'info@gcrcompact@gcrieber.com',
  177. 'HITECH@MOLDER.COM.HK, HITECH@M',
  178. 'siki.huang@byd.com / betty.qiu@b',
  179. 'sales@dayusainc.com <sales@day',
  180. 'cora@38f.net,fini@39f.net',
  181. 'italmaq.amm@gmail.com',
  182. 'info@papeleradelpacifico.com,',
  183. 'WWW,IMPERIO_CARGO@GMAIL.COM',
  184. 'SUCDEN@SUCDEN.COM. AMERICAS@SUCD',
  185. 'COMEX@TELMAC.COM.BR/ COMEX4@TE',
  186. 'abby@jinshen.cnabby@jinshenmc',
  187. '',
  188. None
  189. ]
  190. for str_tel in test_case_list:
  191. print(f'tel: {str_tel} ---->{col_email_clean(str_tel)}')
  192. #匹配含电话号的传真号码即同时含tel|ph和FAX 取出fax后面的传真号码
  193. tel_fax_pattern1 = re.compile(r'(ph|tel)(.*)[(]?fax[)]?(.*)', re.IGNORECASE)
  194. tel_fax_pattern2 = re.compile(r'[^ph|tel]tel[e]?[\s]?[&(]?fax[\s:.)]?[n]?[o]?[\s:.]?', re.IGNORECASE)
  195. #匹配只有fax的传真号码
  196. fax_pattern = re.compile(r'(fax)', re.IGNORECASE)
  197. # 印度jksdh提取fax
  198. def ind_getfax_jksdh(tel_str):
  199. if tel_str:
  200. tel_fax_match1 = re.search(tel_fax_pattern1, tel_str)
  201. tel_fax_match2 = re.search(tel_fax_pattern2, tel_str)
  202. fax_match1 = re.search(fax_pattern, tel_str)
  203. # 既有电话又有传真时或取传真
  204. if tel_fax_match1:
  205. # 如果tel和fax连在一起,视为传真把这部分替换为@@
  206. if tel_fax_match2:
  207. split_fax = re.sub(tel_fax_pattern2, '@@', tel_str)
  208. # 将其余字母替换成空
  209. split_fax_cleanletter = re.sub(r'[a-zA-Z]', '', split_fax)
  210. split_fax_enter = re.sub(r'\s+', ' ', split_fax_cleanletter)
  211. return split_fax_enter.strip(remove_chars)
  212. get_afterfax = tel_fax_match1.group(3)
  213. clean_afterfax = re.sub(r'[a-zA-Z]', '', get_afterfax)
  214. return clean_afterfax.strip(remove_chars)
  215. # 只有fax 传真
  216. if fax_match1:
  217. split_fax = re.sub(fax_pattern, '@@', tel_str)
  218. split_fax_cleanletter = re.sub(r'[a-zA-Z]', '', split_fax)
  219. return split_fax_cleanletter.strip(remove_chars)
  220. return None
  221. def ind_fax_jksdh_clean(jksdh):
  222. fax = judge_delimiter(jksdh)
  223. if fax:
  224. for bad in tel_bad_list:
  225. fax = fax.replace(bad, '')
  226. return fax
  227. return None
  228. if __name__ == '__main__1':
  229. test_case_list = [
  230. '011-23557208,telefax0129-2279612 to 615',
  231. '02-65111032/020-65111033 tel fax',
  232. '033-2358-7784, 03323587789(telefax)',
  233. '431222 EXTEN:201/431821(D)/FAX NO.091-0422-431672',
  234. 'PH 080-91133444 FAX 080-91133502',
  235. 'PH. 011-514 6164, 514 6165, 540 0984, FAX. 011- 549 2977',
  236. 'TEL:91-22-4921900-05 FAX:91-22-4939284,4950594',
  237. 'TELE FAX-91-22-27681365/27686787',
  238. 'TELE:66011640, FAX:26057125',
  239. 'TELEFAX+914428143552/+914443502454',
  240. 'TELEFAX-3432782/02223423810/02223432782',
  241. 'TELEFAX-4074329',
  242. 'Tel : (044) 823 2117 Fax (044) 823 4411',
  243. 'Tel : 080-3349348 / Fax : 080-3348607',
  244. 'Tel : 080-3349348; Fax : 080-3348607',
  245. 'Tel : 22-8731998 Fax : 022-8711911',
  246. 'Tel: 344 3644, Fax no: 342 9023',
  247. 'Telefax 4930742',
  248. '42011184,42011135,42157331/TELEFAX NO.28584954/MOBILE 9840104275',
  249. '07232-44134/44247fax45430',
  250. '011-23557208,telefax0129-2279612 to 615',
  251. '40460655 tel fax no 21021042',
  252. '022-25890222 FAX NO.022-25890411',
  253. '28271933 FAX NO. 28302531/32',
  254. '',
  255. None
  256. ]
  257. for str_tel in test_case_list:
  258. print(f'str: {str_tel} ---->{ind_getfax_jksdh(str_tel)}')
  259. # 印度jksdh提取phone
  260. def ind_gettel_jksdh(tel_str):
  261. if tel_str:
  262. tel_fax_match1 = re.search(tel_fax_pattern1, tel_str)
  263. # 既有电话又有传真 取电话
  264. if tel_fax_match1:
  265. get_aftertel = tel_fax_match1.group(2)
  266. clean_aftertel = re.sub(r'[a-zA-Z]', '', get_aftertel)
  267. return clean_aftertel.strip(remove_chars)
  268. # jksdh不含fax
  269. clean_letter = re.sub(r'[a-zA-Z]', ' ', tel_str)
  270. clean_enter = re.sub(r'\s+', ' ', clean_letter)
  271. return clean_enter.strip(remove_chars)
  272. return None
  273. def ind_tel_jksdh_clean(jksdh):
  274. tel_str = ind_gettel_jksdh(jksdh)
  275. tel = judge_delimiter(tel_str)
  276. if tel:
  277. for bad in tel_bad_list:
  278. tel = tel.replace(bad, '')
  279. return tel
  280. return None
  281. if __name__ == '__main__1':
  282. test_case_list = [
  283. '(0161) 662154, 660637 &amp; 664538',
  284. '',
  285. None
  286. ]
  287. for str_tel in test_case_list:
  288. print(f'tel: {str_tel} ---->{ind_tel_jksdh_clean(str_tel)}')
  289. if __name__ == '__main__':
  290. test_case_list = [
  291. '011-23557208,telefax0129-2279612 to 615',
  292. '431222 EXTEN:201/431821(D)/FAX NO.091-0422-431672',
  293. 'PH 080-91133444 FAX 080-91133502',
  294. 'PH. 011-514 6164, 514 6165, 540 0984, FAX. 011- 549 2977',
  295. 'TEL:91-22-4921900-05 FAX:91-22-4939284,4950594',
  296. 'TELE FAX-91-22-27681365/27686787',
  297. 'TELE:66011640, FAX:26057125',
  298. 'Tel : (044) 823 2117 Fax (044) 823 4411',
  299. 'Tel : 080-3349348 / Fax : 080-3348607',
  300. 'Tel : 080-3349348; Fax : 080-3348607',
  301. 'Tel : 22-8731998 Fax : 022-8711911',
  302. 'Tel: 344 3644, Fax no: 342 9023',
  303. '25594911 TO 916',
  304. '8012997/f-8626376',
  305. '',
  306. None
  307. ]
  308. for str_tel in test_case_list:
  309. print(f'tel: {str_tel} ---->{ind_tel_jksdh_clean(str_tel)}')
  310. def ind_fax_jkscz_clean(jksdh):
  311. if jksdh:
  312. clean_letter = re.sub(r'[a-zA-Z]', ' ', jksdh)
  313. clean_enter = re.sub(r'\s+', ' ', clean_letter)
  314. tel = judge_delimiter(clean_enter)
  315. if tel:
  316. for bad in tel_bad_list:
  317. tel = tel.replace(bad, '')
  318. return tel
  319. return None
  320. def pry_phone_clean(jksdh):
  321. if jksdh:
  322. clean_letter = re.sub(r'[a-zA-Z]', ' ', jksdh)
  323. clean_enter = re.sub(r'\s+', ' ', clean_letter)
  324. tel = judge_tel_length(clean_enter)
  325. if tel:
  326. for bad in tel_bad_list:
  327. tel = tel.replace(bad, '')
  328. return tel
  329. return None
  330. if __name__ == '__main__1':
  331. test_case_list = [
  332. '1234to2',
  333. '',
  334. None
  335. ]
  336. for str_tel in test_case_list:
  337. print(f'tel: {str_tel} ---->{pry_phone_clean(str_tel)}')
  338. month_dict = {
  339. 'JAN': '01',
  340. 'FEB': '02',
  341. 'MAR': '03',
  342. 'APR': '04',
  343. 'MAY': '05',
  344. 'JUN': '06',
  345. 'JUL': '07',
  346. 'AUG': '08',
  347. 'SEP': '09',
  348. 'OCT': '10',
  349. 'NOV': '11',
  350. 'DEC': '12'
  351. }