"""Normalise raw ``hgbm`` code strings into clean, comma-separated codes.

The sample inputs in :func:`main` look like customs / HS commodity codes
that arrive with assorted separators, stray punctuation, or mangled into
scientific notation by a spreadsheet export.
"""

import codecs
import json
import re
from typing import Optional

# The cleaner below is pure Python; pyspark is only imported, never used, in
# this file.  Guard the import so a missing Spark installation does not stop
# the module from being imported or unit-tested.
try:
    from pyspark.sql.functions import udf
    from pyspark.sql.types import ArrayType, StringType
except ImportError:
    udf = ArrayType = StringType = None

# Four dot-separated digit groups ending in a letter suffix,
# e.g. ``6212.90.00.000A``.
hgbm_pattern = r'[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+[a-zA-Z]+$'

_DOTTED_SUFFIX_RE = re.compile(hgbm_pattern)

# Marker of a value that was mangled into scientific notation ("...e+31").
_EXPONENT_RE = re.compile(r'e\+', re.IGNORECASE)

# Any run of separator characters.  ``+`` is literal inside the class and is
# deliberately treated as a separator ("69072100+44152020").
# NOTE(review): the original class listed ';' and ',' twice, which looks like
# full-width U+FF1B / U+FF0C lost in transcoding; both forms are accepted
# here -- confirm against real data.
_SEPARATOR_RUN_RE = re.compile(r'[.;,\uff1b\uff0c/\s+]+')

# Junk stripped from both ends after separators have been normalised.
_EDGE_JUNK = ")&'_,"

# How many mantissa digits to keep from a scientific-notation value.
_SCI_DIGITS = 8

# A dashed value at least this long is treated as several codes joined by
# '-'; a shorter one is a single code with a decorative dash.
_MIN_LEN_FOR_DASH_SPLIT = 12


def hgbm_clean(hgbm: Optional[str]) -> Optional[str]:
    """Return a cleaned version of *hgbm*, or ``None`` if it has no digits.

    Rules, applied in order:

    1. Empty / ``None`` input, or input without any digit, yields ``None``.
    2. Scientific notation (``e+`` / ``E+``): keep the first 8 digits of the
       mantissa (``None`` if the mantissa has no digits).
    3. Dotted code with a letter suffix (``6212.90.00.000A``): drop the dots.
    4. Otherwise every run of ``. ; , / +`` or whitespace becomes a single
       ``,``; leading/trailing ``) & ' _ ,`` are stripped; and ``-`` becomes
       ``,`` when the result is at least 12 characters long, else is removed.

    Args:
        hgbm: Raw code string, possibly ``None`` or empty.

    Returns:
        The cleaned string, or ``None`` when nothing usable remains.
    """
    if not hgbm or not any(ch.isdigit() for ch in hgbm):
        return None

    exponent = _EXPONENT_RE.search(hgbm)
    if exponent:
        # Split where the (case-insensitive) marker actually is, and keep
        # digits only so a decimal point never leaks into the code.
        mantissa = hgbm[:exponent.start()]
        digits = ''.join(ch for ch in mantissa if ch.isdigit())
        return digits[:_SCI_DIGITS] or None

    if _DOTTED_SUFFIX_RE.search(hgbm):
        return hgbm.replace('.', '')

    # Collapsing whole runs avoids empty fields such as "1234,,5678", and
    # stripping ',' together with the other junk lets ")" be removed even
    # when whitespace followed it.
    cleaned = _SEPARATOR_RUN_RE.sub(',', hgbm).strip(_EDGE_JUNK)

    if '-' in cleaned:
        joiner = ',' if len(cleaned) >= _MIN_LEN_FOR_DASH_SPLIT else ''
        cleaned = cleaned.replace('-', joiner)
    return cleaned


def main() -> None:
    """Print each sample input next to its cleaned form."""
    samples = [
        '843290108512e+31',
        '6212.90.00.000A',
        '6212.90.00.000AQ',
        '6212.90.00.000A1',
        '94069/730830/854449',
        '940161000019 940360100000 940350000019',
        '630532;630532,630532,630532,630532,630532,630532,630532,630532,630532',
        '320649/320690/38245905/320619',
        '39173900-39209990-39269097',
        '68030000)',
        '39262000001',
        '9403-6000',
        '33011920&',
        '34013000_',
        '\'39262000001',
        '69072100+44152020',
    ]
    for sample in samples:
        print(f'{sample}---->{hgbm_clean(sample)}')


if __name__ == '__main__':
    main()