import codecs
import json
import re
from decimal import Decimal

try:
    from pyspark.sql.functions import udf
    from pyspark.sql.types import ArrayType, StringType
except ImportError as _exc:
    # The cleaning helpers below are pure Python.  Keep the module importable
    # without Spark (e.g. for local tests) and defer the failure to the first
    # actual use of a Spark name; the exception type stays ImportError.
    _PYSPARK_IMPORT_ERROR = _exc

    def _pyspark_unavailable(*args, **kwargs):
        """Raise the deferred pyspark ImportError on first use."""
        raise ImportError(
            'pyspark is required for the Spark UDF wrappers'
        ) from _PYSPARK_IMPORT_ERROR

    udf = ArrayType = StringType = _pyspark_unavailable


# Scientific notation -> plain integer string.
# Matches "<mantissa>e<exponent>" anchored at the start of the input only.
scientific_pattern = r'([0-9]*\.?[0-9]+)[eE]([-+]?[0-9]+)'


def scientific_to_number(input_str):
    """Convert a leading scientific-notation number to an integer string.

    Args:
        input_str: Raw text such as ``'3.203177e+11'``.

    Returns:
        ``None`` for empty/``None`` input; the integer part of the number as
        a string when the input starts with scientific notation; otherwise
        the input unchanged.
    """
    if not input_str:
        return None
    match = re.match(scientific_pattern, input_str)
    if not match:
        return input_str
    # Decimal parses the literal exactly; float multiplication followed by
    # int() truncation can be off by one on long phone-like numbers.
    return str(int(Decimal(match.group(0))))


# NOTE(review): the original value of this pattern and the first lines of
# judge_delimiter() were lost when the file was extracted (only "r'(?" and
# the tail of the length check survive).  "Whitespace between two digits" is
# a reconstruction -- confirm against the original source.
pattern_space = r'(?<=\d)\s+(?=\d)'


def judge_delimiter(str):
    """Mark the boundary between two adjacent long number parts with ``@@``.

    The text is split into parts; when a part and the part after it are both
    at least 6 characters long they are joined with ``@@``, otherwise with a
    single space.

    Args:
        str: Text to process (name kept for interface compatibility even
            though it shadows the builtin).

    Returns:
        The re-joined text, or ``None`` for empty/``None`` input.
    """
    if not str:
        return None
    # NOTE(review): the split call is reconstructed, see pattern_space above.
    parts = re.split(pattern_space, str)
    last_index = len(parts) - 1
    pieces = []
    for i, part in enumerate(parts):
        if i == last_index:
            # The last part gets no separator; appending one here left a
            # stray trailing space on every result.
            pieces.append(part)
        elif len(part) >= 6 and len(parts[i + 1]) >= 6:
            # Both neighbours are long: treat them as separate numbers.
            pieces.append(part + '@@')
        else:
            pieces.append(part + ' ')
    # Re-assemble the processed parts into one string.
    return ''.join(pieces)


# Disabled ad-hoc smoke test (the guard never matches as written).
if __name__ == '__main__1':
    test_case_list = [
        '666666 7777777',
        '262-255- 7177 // 273308256',
        'abc 123 456 def',
        '',
        None
    ]
    for str_tel in test_case_list:
        print(f'tel: {str_tel} ---->{judge_delimiter(str_tel)}')


def judge_tel_length(str):
    """Reject phone/fax values that contain fewer than 6 digits.

    Args:
        str: Candidate phone/fax text (name kept for interface compatibility).

    Returns:
        The input unchanged when it holds at least 6 digits, else ``None``.
    """
    if not str:
        return None
    digits = re.sub(r'[^\d]', '', str)
    if len(digits) < 6:
        return None
    return str


# Characters stripped from both ends of a value.
remove_chars = ' :/-;?@#>.,*'


def clean_headtail(str):
    """Strip special characters from the head and tail of a string.

    Removes any of ``remove_chars`` from both ends, then leading ``)`` and
    trailing ``(`` characters.

    Args:
        str: Text to trim (name kept for interface compatibility).

    Returns:
        The trimmed text, or ``None`` for empty/``None`` input.
    """
    if not str:
        return None
    trimmed = str.strip(remove_chars)
    return trimmed.lstrip(')').rstrip('(')


# Disabled ad-hoc smoke test (the guard never matches as written).
if __name__ == '__main__1':
    test_case_list = [
        '123-243',
        '(123345)',
        '(010)1(2)3345',
        '(010)1(2)334(5)',
        '(010)123345)',
        '(010)123345(',
        '(010123345',
        ')010123345',
        '472601(',
        '',
        None,
        '913207067724649993*'
    ]
    for str_tel in test_case_list:
        print(f':{str_tel}---->{clean_headtail(str_tel)}')


# Tokens removed from phone/fax values; replaced in this order.
tel_bad_list = [
    ':', ';', ',', '.', '?', '//', '()', '( )', '�'
]


def _replace_bad_tokens(text, replacement=''):
    """Replace every token of tel_bad_list in *text*, in list order."""
    for bad in tel_bad_list:
        text = text.replace(bad, replacement)
    return text


def col_tel_clean(tel_str):
    """Normalise a raw phone/fax column value.

    Steps: expand scientific notation, drop a trailing ``.0...``, blank out
    the tokens of ``tel_bad_list``, remove ASCII letters, trim edge
    punctuation, collapse whitespace, mark number boundaries via
    ``judge_delimiter`` and finally require at least 6 digits.

    Args:
        tel_str: Raw column value.

    Returns:
        The cleaned value, or ``None`` when the input is empty or fewer than
        6 digits remain.
    """
    if not tel_str:
        return None
    if 'e+' in tel_str.lower():
        tel_str = scientific_to_number(tel_str)
    cleaned = re.sub(r'\.0+$', '', tel_str)
    cleaned = _replace_bad_tokens(cleaned, ' ')
    cleaned = re.sub(r'[a-zA-Z]', '', cleaned)
    cleaned = cleaned.strip('/-;?@#>')
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()
    # Digit-count check is shared with judge_tel_length.
    return judge_tel_length(judge_delimiter(cleaned))


# Disabled ad-hoc smoke test (the guard never matches as written).
if __name__ == '__main__1':
    test_case_list = [
        'Fax: +1 780 468 9165',
        'FAX.9545852544',
        'FAX/5618446131',
        'Fax : 833.338.8901',
        'Fax/: 833.338.8901',
        '(615) 316-5100 // FAX (615) 31',
        'TEL: 507-69828001',
        '6914 1002 TAX ID:200514854D',
        'Fax No: +86 (0) 527.84495888',
        'RUT:76.631.726-K',
        'FAX. 41 32 392 51 07B>',
        'FAX9545852544/46',
        'FAXSIN FAX',
        'LONGROnO 3871232',
        '6910500.0000',
        '6910500.0',
        '3.203177e+11',
        '3.19213916545e+11',
        '1230000',
        '5397-4880,5397-1333',
        '',
        None
    ]
    for str_tel in test_case_list:
        print(f'tel: {str_tel} ---->{col_tel_clean(str_tel)}')


# Characters that must not start or end an e-mail value.
col_bad_email = [
    '@', '*', '-', '.', ','
]


def col_email_clean(email):
    """Normalise a raw e-mail column value.

    Args:
        email: Raw column value, possibly holding several addresses.

    Returns:
        * one ``@``: the lower-cased value (commas turned into dots) when it
          contains something shaped like an address, else ``None``;
        * two or more ``@``: ``None`` for a chained ``a@b@c`` value,
          otherwise the comma-joined addresses found in the text;
        * ``None`` for empty input or input without a dot.
    """
    if not email:
        return None
    email = email.lower().replace('@@', '@').replace(',.', '.').replace('.,', '.')
    # Trim bad characters from the ends only.  Replacing every occurrence
    # (as before) turned 'a.b@c.com.' into 'ab@ccom' and rejected it.
    email = email.strip(''.join(col_bad_email))
    if '.' not in email:
        return None

    at_count = email.count('@')
    if at_count == 1:
        email = email.replace(',', '.')
        # Standard mailbox shape.
        if re.search(r'[a-zA-Z0-9]+[a-zA-Z0-9._%+-]+@[a-zA-Z0-9._%+-]+\.[a-zA-Z]*', email):
            return email
        return None

    if at_count >= 2:
        # Chained value such as CONTADOR@JANSENANDRE@HOTMAIL.COM is rejected.
        if re.search(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9]+@[a-zA-Z.]+', email):
            return None
        # Standard mailbox shape.
        # NOTE(review): {1,3} truncates longer TLDs ('.info' -> '.inf');
        # left as-is because the intent is not visible here.
        email_pattern = re.compile(r'[a-zA-Z0-9]+[a-zA-Z0-9._%+-]+@[a-zA-Z0-9._%+-]+\.[a-zA-Z]{1,3}')
        found = email_pattern.findall(email)
        if found:
            return ','.join(found)
    return None


# Disabled ad-hoc smoke test (the guard never matches as written).
# NOTE(review): the tail of this list and the print prefix were lost during
# extraction; the last entry and the 'email: ' prefix are reconstructed.
if __name__ == '__main__1':
    test_case_list = [
        'info@gcrcompact@gcrieber.com',
        'HITECH@MOLDER.COM.HK, HITECH@M',
        'siki.huang@byd.com / betty.qiu@b',
        'sales@dayusainc.com',
    ]
    for str_tel in test_case_list:
        print(f'email: {str_tel} ---->{col_email_clean(str_tel)}')


# Value holding both a phone marker (tel|ph) and FAX: group(2) is the text
# between the two markers, group(3) the text after "fax".
tel_fax_pattern1 = re.compile(r'(ph|tel)(.*)[(]?fax[)]?(.*)', re.IGNORECASE)
# "tel"/"tele" directly followed by "fax" (telefax, tel fax, tel&fax, ...).
# NOTE(review): [^ph|tel] is a character class (one char that is none of
# p, h, |, t, e, l), not "not preceded by ph/tel"; it also requires a
# preceding character, so a value starting with "telefax" does not match.
tel_fax_pattern2 = re.compile(r'[^ph|tel]tel[e]?[\s]?[&(]?fax[\s:.)]?[n]?[o]?[\s:.]?', re.IGNORECASE)
# Value that only carries a fax marker.
fax_pattern = re.compile(r'(fax)', re.IGNORECASE)


def ind_getfax_jksdh(tel_str):
    """Extract the fax part from an India ``jksdh`` contact value.

    Args:
        tel_str: Raw contact text that may mix phone and fax numbers.

    Returns:
        * phone marker and fax marker present, "tel fax" written together:
          the whole text with that marker replaced by ``@@``, letters removed
          and whitespace collapsed;
        * phone marker and fax marker present otherwise: the text after
          "fax" with letters removed;
        * only a fax marker: the text with "fax" replaced by ``@@`` and
          letters removed;
        * ``None`` when there is no fax marker or the input is empty.

        Every non-``None`` result is trimmed with ``remove_chars``.
    """
    if not tel_str:
        return None

    both = tel_fax_pattern1.search(tel_str)
    if both:
        # "tel" and "fax" written together count as a fax marker: replace
        # that marker with @@ and drop the remaining letters.
        if tel_fax_pattern2.search(tel_str):
            marked = tel_fax_pattern2.sub('@@', tel_str)
            no_letters = re.sub(r'[a-zA-Z]', '', marked)
            return re.sub(r'\s+', ' ', no_letters).strip(remove_chars)
        after_fax = re.sub(r'[a-zA-Z]', '', both.group(3))
        return after_fax.strip(remove_chars)

    # Fax marker only.
    if fax_pattern.search(tel_str):
        marked = fax_pattern.sub('@@', tel_str)
        return re.sub(r'[a-zA-Z]', '', marked).strip(remove_chars)
    return None


def ind_fax_jksdh_clean(jksdh):
    """Mark number boundaries in a fax value and drop ``tel_bad_list`` tokens.

    NOTE(review): unlike ind_tel_jksdh_clean this does not run the matching
    extractor (ind_getfax_jksdh) first -- presumably the caller does; confirm.

    Args:
        jksdh: Fax text.

    Returns:
        The cleaned text, or ``None`` for empty/``None`` input.
    """
    fax = judge_delimiter(jksdh)
    if not fax:
        return None
    return _replace_bad_tokens(fax)


# Disabled ad-hoc smoke test (the guard never matches as written).
if __name__ == '__main__1':
    test_case_list = [
        '011-23557208,telefax0129-2279612 to 615',
        '02-65111032/020-65111033 tel fax',
        '033-2358-7784, 03323587789(telefax)',
        '431222 EXTEN:201/431821(D)/FAX NO.091-0422-431672',
        'PH 080-91133444 FAX 080-91133502',
        'PH. 011-514 6164, 514 6165, 540 0984, FAX. 011- 549 2977',
        'TEL:91-22-4921900-05 FAX:91-22-4939284,4950594',
        'TELE FAX-91-22-27681365/27686787',
        'TELE:66011640, FAX:26057125',
        'TELEFAX+914428143552/+914443502454',
        'TELEFAX-3432782/02223423810/02223432782',
        'TELEFAX-4074329',
        'Tel : (044) 823 2117 Fax (044) 823 4411',
        'Tel : 080-3349348 / Fax : 080-3348607',
        'Tel : 080-3349348; Fax : 080-3348607',
        'Tel : 22-8731998 Fax : 022-8711911',
        'Tel: 344 3644, Fax no: 342 9023',
        'Telefax 4930742',
        '42011184,42011135,42157331/TELEFAX NO.28584954/MOBILE 9840104275',
        '07232-44134/44247fax45430',
        '011-23557208,telefax0129-2279612 to 615',
        '40460655 tel fax no 21021042',
        '022-25890222 FAX NO.022-25890411',
        '28271933 FAX NO. 28302531/32',
        '',
        None
    ]
    for str_tel in test_case_list:
        print(f'str: {str_tel} ---->{ind_getfax_jksdh(str_tel)}')


def ind_gettel_jksdh(tel_str):
    """Extract the phone part from an India ``jksdh`` contact value.

    Args:
        tel_str: Raw contact text that may mix phone and fax numbers.

    Returns:
        When the text holds both a phone marker and "fax": the text between
        them with letters removed.  Otherwise the whole text with letters
        turned into spaces and whitespace collapsed.  Both are trimmed with
        ``remove_chars``.  ``None`` for empty/``None`` input.
    """
    if not tel_str:
        return None
    both = tel_fax_pattern1.search(tel_str)
    if both:
        # Phone and fax present: keep the phone part only.
        between = re.sub(r'[a-zA-Z]', '', both.group(2))
        return between.strip(remove_chars)
    # No fax in the value.
    spaced = re.sub(r'[a-zA-Z]', ' ', tel_str)
    return re.sub(r'\s+', ' ', spaced).strip(remove_chars)


def ind_tel_jksdh_clean(jksdh):
    """Extract and clean the phone number(s) of an India ``jksdh`` value.

    Args:
        jksdh: Raw contact text.

    Returns:
        The phone part with number boundaries marked by ``judge_delimiter``
        and ``tel_bad_list`` tokens removed, or ``None`` when nothing is left.
    """
    tel = judge_delimiter(ind_gettel_jksdh(jksdh))
    if not tel:
        return None
    return _replace_bad_tokens(tel)


# Disabled ad-hoc smoke test (the guard never matches as written).
if __name__ == '__main__1':
    test_case_list = [
        '(0161) 662154, 660637 & 664538',
        '',
        None
    ]
    for str_tel in test_case_list:
        print(f'tel: {str_tel} ---->{ind_tel_jksdh_clean(str_tel)}')

if __name__ == '__main__':
    test_case_list = [
        '011-23557208,telefax0129-2279612 to 615',
        '431222 EXTEN:201/431821(D)/FAX NO.091-0422-431672',
        'PH 080-91133444 FAX 080-91133502',
        'PH. 011-514 6164, 514 6165, 540 0984, FAX. 011- 549 2977',
        'TEL:91-22-4921900-05 FAX:91-22-4939284,4950594',
        'TELE FAX-91-22-27681365/27686787',
        'TELE:66011640, FAX:26057125',
        'Tel : (044) 823 2117 Fax (044) 823 4411',
        'Tel : 080-3349348 / Fax : 080-3348607',
        'Tel : 080-3349348; Fax : 080-3348607',
        'Tel : 22-8731998 Fax : 022-8711911',
        'Tel: 344 3644, Fax no: 342 9023',
        '25594911 TO 916',
        '8012997/f-8626376',
        '',
        None
    ]
    for str_tel in test_case_list:
        print(f'tel: {str_tel} ---->{ind_tel_jksdh_clean(str_tel)}')


def ind_fax_jkscz_clean(jksdh):
    """Clean an India ``jkscz`` fax value.

    Letters become spaces, whitespace is collapsed, number boundaries are
    marked by ``judge_delimiter`` and ``tel_bad_list`` tokens are removed.

    Args:
        jksdh: Raw fax text.

    Returns:
        The cleaned text, or ``None`` for empty/``None`` input.
    """
    if not jksdh:
        return None
    spaced = re.sub(r'[a-zA-Z]', ' ', jksdh)
    collapsed = re.sub(r'\s+', ' ', spaced)
    tel = judge_delimiter(collapsed)
    if not tel:
        return None
    return _replace_bad_tokens(tel)


def pry_phone_clean(jksdh):
    """Clean a phone value, requiring at least 6 digits.

    Letters become spaces, whitespace is collapsed, values with fewer than
    6 digits are rejected and ``tel_bad_list`` tokens are removed.

    Args:
        jksdh: Raw phone text.

    Returns:
        The cleaned text, or ``None`` when the input is empty or too short.
    """
    if not jksdh:
        return None
    spaced = re.sub(r'[a-zA-Z]', ' ', jksdh)
    collapsed = re.sub(r'\s+', ' ', spaced)
    tel = judge_tel_length(collapsed)
    if not tel:
        return None
    return _replace_bad_tokens(tel)


# Disabled ad-hoc smoke test (the guard never matches as written).
if __name__ == '__main__1':
    test_case_list = [
        '1234to2',
        '',
        None
    ]
    for str_tel in test_case_list:
        print(f'tel: {str_tel} ---->{pry_phone_clean(str_tel)}')

# Upper-case English month abbreviation -> two-digit month number.
month_dict = {
    'JAN': '01',
    'FEB': '02',
    'MAR': '03',
    'APR': '04',
    'MAY': '05',
    'JUN': '06',
    'JUL': '07',
    'AUG': '08',
    'SEP': '09',
    'OCT': '10',
    'NOV': '11',
    'DEC': '12'
}