| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386 |
- import codecs
- import re
- import json
- from pyspark.sql.functions import udf
- from pyspark.sql.types import ArrayType, StringType
# Convert scientific notation to a plain number.
scientific_pattern = r'([0-9]*\.?[0-9]+)[eE]([-+]?[0-9]+)'
# Largest exponent that is still expanded; beyond this the value is far
# outside float range and expanding it would only build a gigantic integer.
_MAX_SCI_EXPONENT = 308


def scientific_to_number(input_str):
    """Expand a leading scientific-notation number into an integer string.

    The conversion is done with ``Decimal`` so the digits are exact; a
    float product such as ``2.3 * 10 ** 2`` evaluates to 229.999... and
    would be truncated to 229.

    Args:
        input_str: Text that may start with a number such as ``3.2e+11``.

    Returns:
        ``None`` for empty/``None`` input; the integer part of the leading
        scientific-notation number as a string when one is found (anything
        after the number is dropped); otherwise ``input_str`` unchanged.
        Inputs whose exponent exceeds ``_MAX_SCI_EXPONENT`` are also
        returned unchanged.
    """
    # Function-scope import so the file-level import list stays untouched.
    from decimal import Decimal

    if not input_str:
        return None
    match = re.match(scientific_pattern, input_str)
    if match is None:
        return input_str
    if int(match.group(2)) > _MAX_SCI_EXPONENT:
        return input_str
    # Decimal parses the matched text exactly; int() truncates toward zero,
    # which keeps the "integer part only" contract for negative exponents.
    return str(int(Decimal(match.group(0))))
# Whitespace that is not directly preceded by a digit, or not directly
# followed by one.
pattern_space = r'(?<!\d)\s+|\s+(?!\d)'
# pattern_keep_space = r'(\d)\s+(\d)'
# pattern_remove_space = r'([^\d])\s+([^\d])'


# Re-join the pieces of a phone string, marking boundaries between two long
# pieces with '@@' so they can be split apart later.
def judge_delimiter(tel_str):
    """Normalise the separators of a phone string.

    All whitespace is removed except a single whitespace character sitting
    directly between two digits. The text is then split on runs of
    ``/ , @ & ;`` and whitespace. Every non-empty piece is emitted followed
    by ``'@@'`` when both it and the next piece are at least six characters
    long, and by a single space otherwise (so a non-empty result ends with
    a space).

    Returns ``None`` for empty/``None`` input.
    """
    if not tel_str:
        return None

    compact = re.sub(pattern_space, '', tel_str)
    pieces = re.split(r'[/,\s@&;]+', compact)
    last_index = len(pieces) - 1

    joined = []
    for position, piece in enumerate(pieces):
        # Leading/trailing delimiters leave empty pieces behind; skip them.
        if not piece:
            continue
        long_pair = (
            position < last_index
            and len(piece) >= 6
            and len(pieces[position + 1]) >= 6
        )
        joined.append(piece + ('@@' if long_pair else ' '))
    return ''.join(joined)
# Manual check for judge_delimiter. The guard compares __name__ with
# '__main__1', so it does not run when the file is executed as a script.
if __name__ == '__main__1':
    for sample in (
        '666666 7777777',
        '262-255- 7177 // 273308256',
        'abc 123 456 def',
        '',
        None,
    ):
        print(f'tel: {sample} ---->{judge_delimiter(sample)}')
# Check how many digits a phone or fax value contains.
def judge_tel_length(str):
    """Return ``str`` unchanged if it holds at least six digits, else ``None``.

    Empty and ``None`` inputs also yield ``None``. The parameter name shadows
    the builtin ``str``; it is kept because it is part of the signature.
    """
    if not str:
        return None
    digits_only = re.sub(r'[^\d]', '', str)
    return str if len(digits_only) >= 6 else None
# Characters trimmed from both ends of a value.
remove_chars = ' :/-;?@#>.,*'


def clean_headtail(str):
    """Trim junk characters from the start and end of ``str``.

    First strips any of ``remove_chars`` from both ends, then drops leading
    ``')'`` characters and trailing ``'('`` characters. Returns ``None`` for
    empty/``None`` input; the result may be an empty string when nothing but
    junk was present.
    """
    if not str:
        return None
    trimmed = str.strip(remove_chars)
    trimmed = trimmed.lstrip(')')
    return trimmed.rstrip('(')
# Manual check for clean_headtail. The guard compares __name__ with
# '__main__1', so it does not run when the file is executed as a script.
if __name__ == '__main__1':
    for sample in (
        '123-243',
        '(123345)',
        '(010)1(2)3345',
        '(010)1(2)334(5)',
        '(010)123345)',
        '(010)123345(',
        '(010123345',
        ')010123345',
        '472601(',
        '',
        None,
        '913207067724649993*',
    ):
        print(f':{sample}---->{clean_headtail(sample)}')
# Substrings treated as noise inside phone/fax values. Order matters because
# the replacements are applied one after another.
tel_bad_list = [
    ':', ';', ',', '.', '?',
    '//', '()', '( )',
    '�',
]


def col_tel_clean(tel_str):
    """Clean a raw phone column value.

    Steps, in order: expand scientific notation when the text contains
    ``e+``; drop a trailing ``.0...`` suffix; replace every entry of
    ``tel_bad_list`` with a space; delete ASCII letters; strip ``/-;?@#>``
    from both ends; collapse whitespace; normalise separators with
    ``judge_delimiter``; finally reject results with fewer than six digits.

    Returns the cleaned string, or ``None`` for empty/``None`` input and for
    results that are empty or too short.
    """
    if not tel_str:
        return None

    if 'e+' in tel_str.lower():
        tel_str = scientific_to_number(tel_str)

    text = re.sub(r'\.0+$', '', tel_str)
    for bad in tel_bad_list:
        text = text.replace(bad, ' ')
    text = re.sub(r'[a-zA-Z]', '', text)
    text = text.strip('/-;?@#>')
    text = re.sub(r'\s+', ' ', text).strip()

    # judge_tel_length applies the "at least six digits" rule and maps an
    # empty/None result from judge_delimiter to None.
    return judge_tel_length(judge_delimiter(text))
# Manual check for col_tel_clean. The guard compares __name__ with
# '__main__1', so it does not run when the file is executed as a script.
if __name__ == '__main__1':
    for sample in (
        'Fax: +1 780 468 9165',
        'FAX.9545852544',
        'FAX/5618446131',
        'Fax : 833.338.8901',
        'Fax/: 833.338.8901',
        '(615) 316-5100 // FAX (615) 31',
        'TEL: 507-69828001',
        '6914 1002 TAX ID:200514854D',
        'Fax No: +86 (0) 527.84495888',
        'RUT:76.631.726-K',
        'FAX. 41 32 392 51 07B>',
        'FAX9545852544/46',
        'FAXSIN FAX',
        'LONGROnO 3871232',
        '6910500.0000',
        '6910500.0',
        '3.203177e+11',
        '3.19213916545e+11',
        '1230000',
        '5397-4880,5397-1333',
        '',
        None,
    ):
        print(f'tel: {sample} ---->{col_tel_clean(sample)}')
# Single characters that must not appear at either end of an e-mail value.
col_bad_email = [
    '@', '*', '-', '.', ','
]

# One ordinary address: at least two characters before '@', a dotted domain.
_EMAIL_SINGLE = re.compile(
    r'[a-zA-Z0-9]+[a-zA-Z0-9._%+-]+@[a-zA-Z0-9._%+-]+\.[a-zA-Z]*')
# Chained form such as CONTADOR@JANSENANDRE@HOTMAIL.COM, which is rejected.
_EMAIL_CHAINED = re.compile(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9]+@[a-zA-Z.]+')
# Address extractor used when several addresses share one value.
_EMAIL_MULTI = re.compile(
    r'[a-zA-Z0-9]+[a-zA-Z0-9._%+-]+@[a-zA-Z0-9._%+-]+\.[a-zA-Z]{1,3}')


def col_email_clean(email):
    """Clean a raw e-mail column value.

    The value is lower-cased, ``@@`` / ``,.`` / ``.,`` typos are collapsed,
    and characters listed in ``col_bad_email`` are trimmed from both ends.
    Only the ends are trimmed: removing the character everywhere (as a
    plain ``replace`` does) would delete every dot of ``a@b.com.`` and
    every hyphen of ``my-name@x-y.com-``.

    Args:
        email: Raw text, possibly holding several addresses.

    Returns:
        * exactly one ``@``: the value with commas turned into dots, if it
          contains something shaped like an address, else ``None``;
        * two or more ``@``: ``None`` for the chained ``a@b@c`` form,
          otherwise the extracted addresses joined with ``','``;
        * ``None`` for empty/``None`` input, values without a dot, and
          values in which no address is found.
    """
    if not email:
        return None

    email = email.lower().replace('@@', '@').replace(',.', '.').replace('.,', '.')
    email = email.strip(''.join(col_bad_email))
    if '.' not in email:
        return None

    at_count = email.count('@')
    if at_count == 1:
        # With a single address a comma can only be a mistyped dot.
        email = email.replace(',', '.')
        return email if _EMAIL_SINGLE.search(email) else None

    if at_count >= 2:
        if _EMAIL_CHAINED.search(email):
            return None
        found = _EMAIL_MULTI.findall(email)
        if found:
            return ','.join(found)
    return None
# Manual check for col_email_clean. The guard compares __name__ with
# '__main__1', so it does not run when the file is executed as a script.
if __name__ == '__main__1':
    for sample in (
        'info@gcrcompact@gcrieber.com',
        'HITECH@MOLDER.COM.HK, HITECH@M',
        'siki.huang@byd.com / betty.qiu@b',
        'sales@dayusainc.com <sales@day',
        'cora@38f.net,fini@39f.net',
        'italmaq.amm@gmail.com',
        'info@papeleradelpacifico.com,',
        'WWW,IMPERIO_CARGO@GMAIL.COM',
        'SUCDEN@SUCDEN.COM. AMERICAS@SUCD',
        'COMEX@TELMAC.COM.BR/ COMEX4@TE',
        'abby@jinshen.cnabby@jinshenmc',
        '',
        None,
    ):
        print(f'tel: {sample} ---->{col_email_clean(sample)}')
# Values that mention both a phone marker ("ph" or "tel") and "fax".
# Group 2 captures the text between the first phone marker and the last
# "fax"; group 3 captures the text after that "fax". Matching ignores case.
tel_fax_pattern1 = re.compile(r'(ph|tel)(.*)[(]?fax[)]?(.*)', re.IGNORECASE)
# "tel"/"tele" written together with "fax" (e.g. "telefax", "tel fax no."),
# including the optional "no" and punctuation that follow.
# NOTE(review): the leading [^ph|tel] is a character class, so it consumes one
# character that is not p, h, |, t, e or l; a value that starts with "telefax"
# therefore cannot match here. Confirm this is intended rather than "not
# preceded by ph/tel".
tel_fax_pattern2 = re.compile(r'[^ph|tel]tel[e]?[\s]?[&(]?fax[\s:.)]?[n]?[o]?[\s:.]?', re.IGNORECASE)
# Values that contain a "fax" marker at all (any case).
fax_pattern = re.compile(r'(fax)', re.IGNORECASE)
# India "jksdh" field: extract the fax part.
def ind_getfax_jksdh(tel_str):
    """Pull the fax portion out of a combined phone/fax string.

    * Phone marker and "fax" both present, and a joined "tel...fax" marker
      matches ``tel_fax_pattern2``: that marker becomes ``'@@'``, remaining
      ASCII letters are deleted and whitespace runs collapse to one space.
    * Phone marker and "fax" both present otherwise: only the text after
      "fax" (group 3 of ``tel_fax_pattern1``) is kept, letters deleted.
    * Only "fax" present: each "fax" becomes ``'@@'`` and letters are
      deleted.

    In every case ``remove_chars`` is stripped from both ends. Returns
    ``None`` for empty/``None`` input and when "fax" does not occur.
    """
    if not tel_str:
        return None

    phone_and_fax = tel_fax_pattern1.search(tel_str)
    if phone_and_fax:
        if tel_fax_pattern2.search(tel_str):
            marked = tel_fax_pattern2.sub('@@', tel_str)
            without_letters = re.sub(r'[a-zA-Z]', '', marked)
            collapsed = re.sub(r'\s+', ' ', without_letters)
            return collapsed.strip(remove_chars)
        after_fax = re.sub(r'[a-zA-Z]', '', phone_and_fax.group(3))
        return after_fax.strip(remove_chars)

    if fax_pattern.search(tel_str):
        marked = fax_pattern.sub('@@', tel_str)
        return re.sub(r'[a-zA-Z]', '', marked).strip(remove_chars)
    return None
def ind_fax_jksdh_clean(jksdh):
    """Normalise separators in ``jksdh`` and delete ``tel_bad_list`` entries.

    Returns ``None`` when ``judge_delimiter`` yields nothing.

    NOTE(review): unlike ind_tel_jksdh_clean, which extracts the phone part
    first, this function passes ``jksdh`` straight to ``judge_delimiter``
    without calling ind_getfax_jksdh. Presumably callers extract the fax
    part beforehand; verify against the call sites.
    """
    fax = judge_delimiter(jksdh)
    if not fax:
        return None
    for bad in tel_bad_list:
        fax = fax.replace(bad, '')
    return fax
# Manual check for ind_getfax_jksdh. The guard compares __name__ with
# '__main__1', so it does not run when the file is executed as a script.
if __name__ == '__main__1':
    for sample in (
        '011-23557208,telefax0129-2279612 to 615',
        '02-65111032/020-65111033 tel fax',
        '033-2358-7784, 03323587789(telefax)',
        '431222 EXTEN:201/431821(D)/FAX NO.091-0422-431672',
        'PH 080-91133444 FAX 080-91133502',
        'PH. 011-514 6164, 514 6165, 540 0984, FAX. 011- 549 2977',
        'TEL:91-22-4921900-05 FAX:91-22-4939284,4950594',
        'TELE FAX-91-22-27681365/27686787',
        'TELE:66011640, FAX:26057125',
        'TELEFAX+914428143552/+914443502454',
        'TELEFAX-3432782/02223423810/02223432782',
        'TELEFAX-4074329',
        'Tel : (044) 823 2117 Fax (044) 823 4411',
        'Tel : 080-3349348 / Fax : 080-3348607',
        'Tel : 080-3349348; Fax : 080-3348607',
        'Tel : 22-8731998 Fax : 022-8711911',
        'Tel: 344 3644, Fax no: 342 9023',
        'Telefax 4930742',
        '42011184,42011135,42157331/TELEFAX NO.28584954/MOBILE 9840104275',
        '07232-44134/44247fax45430',
        '011-23557208,telefax0129-2279612 to 615',
        '40460655 tel fax no 21021042',
        '022-25890222 FAX NO.022-25890411',
        '28271933 FAX NO. 28302531/32',
        '',
        None,
    ):
        print(f'str: {sample} ---->{ind_getfax_jksdh(sample)}')
# India "jksdh" field: extract the phone part.
def ind_gettel_jksdh(tel_str):
    """Pull the phone portion out of a combined phone/fax string.

    When ``tel_fax_pattern1`` matches (a phone marker followed later by
    "fax"), only the text between the two (group 2) is kept and its ASCII
    letters are deleted. Otherwise every ASCII letter of the whole value is
    replaced by a space and whitespace runs collapse to one space. Either
    way ``remove_chars`` is stripped from both ends.

    Returns ``None`` for empty/``None`` input.
    """
    if not tel_str:
        return None

    phone_and_fax = tel_fax_pattern1.search(tel_str)
    if phone_and_fax:
        between = re.sub(r'[a-zA-Z]', '', phone_and_fax.group(2))
        return between.strip(remove_chars)

    # No phone+fax pair: treat the whole value as phone text.
    spaced = re.sub(r'[a-zA-Z]', ' ', tel_str)
    return re.sub(r'\s+', ' ', spaced).strip(remove_chars)
def ind_tel_jksdh_clean(jksdh):
    """Extract the phone part of ``jksdh`` and clean it.

    Runs ``ind_gettel_jksdh``, normalises separators with
    ``judge_delimiter`` and deletes every ``tel_bad_list`` entry. Returns
    ``None`` when nothing is left after the first two steps.
    """
    tel = judge_delimiter(ind_gettel_jksdh(jksdh))
    if not tel:
        return None
    for bad in tel_bad_list:
        tel = tel.replace(bad, '')
    return tel
# Manual check for ind_tel_jksdh_clean. The guard compares __name__ with
# '__main__1', so it does not run when the file is executed as a script.
if __name__ == '__main__1':
    for sample in (
        '(0161) 662154, 660637 & 664538',
        '',
        None,
    ):
        print(f'tel: {sample} ---->{ind_tel_jksdh_clean(sample)}')
# Script entry point: print the ind_tel_jksdh_clean result for a set of
# sample values.
if __name__ == '__main__':
    for sample in (
        '011-23557208,telefax0129-2279612 to 615',
        '431222 EXTEN:201/431821(D)/FAX NO.091-0422-431672',
        'PH 080-91133444 FAX 080-91133502',
        'PH. 011-514 6164, 514 6165, 540 0984, FAX. 011- 549 2977',
        'TEL:91-22-4921900-05 FAX:91-22-4939284,4950594',
        'TELE FAX-91-22-27681365/27686787',
        'TELE:66011640, FAX:26057125',
        'Tel : (044) 823 2117 Fax (044) 823 4411',
        'Tel : 080-3349348 / Fax : 080-3348607',
        'Tel : 080-3349348; Fax : 080-3348607',
        'Tel : 22-8731998 Fax : 022-8711911',
        'Tel: 344 3644, Fax no: 342 9023',
        '25594911 TO 916',
        '8012997/f-8626376',
        '',
        None,
    ):
        print(f'tel: {sample} ---->{ind_tel_jksdh_clean(sample)}')
def ind_fax_jkscz_clean(jksdh):
    """Clean a fax value that may contain letters.

    ASCII letters become spaces, whitespace runs collapse to one space, the
    separators are normalised with ``judge_delimiter`` and every
    ``tel_bad_list`` entry is deleted. Returns ``None`` for empty/``None``
    input or when ``judge_delimiter`` yields nothing.
    """
    if not jksdh:
        return None
    spaced = re.sub(r'[a-zA-Z]', ' ', jksdh)
    tel = judge_delimiter(re.sub(r'\s+', ' ', spaced))
    if not tel:
        return None
    for bad in tel_bad_list:
        tel = tel.replace(bad, '')
    return tel
def pry_phone_clean(jksdh):
    """Clean a phone value and reject it when it has too few digits.

    ASCII letters become spaces and whitespace runs collapse to one space.
    The result must pass ``judge_tel_length`` (at least six digits); every
    ``tel_bad_list`` entry is then deleted. Returns ``None`` for
    empty/``None`` input or when the length check fails.
    """
    if not jksdh:
        return None
    spaced = re.sub(r'[a-zA-Z]', ' ', jksdh)
    tel = judge_tel_length(re.sub(r'\s+', ' ', spaced))
    if not tel:
        return None
    for bad in tel_bad_list:
        tel = tel.replace(bad, '')
    return tel
# Manual check for pry_phone_clean. The guard compares __name__ with
# '__main__1', so it does not run when the file is executed as a script.
if __name__ == '__main__1':
    for sample in (
        '1234to2',
        '',
        None,
    ):
        print(f'tel: {sample} ---->{pry_phone_clean(sample)}')
# Upper-case three-letter English month abbreviation -> two-digit month
# number as a string ('JAN' -> '01' ... 'DEC' -> '12').
month_dict = {
    abbreviation: f'{number:02d}'
    for number, abbreviation in enumerate(
        (
            'JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN',
            'JUL', 'AUG', 'SEP', 'OCT', 'NOV', 'DEC',
        ),
        start=1,
    )
}
|