| 12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152 |
- import codecs
- import re
- import json
- from pyspark.sql.functions import udf
- from pyspark.sql.types import ArrayType, StringType
- hgbm_pattern=r'[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+[a-zA-Z]+$'
- def hgbm_clean(hgbm):
- if hgbm:
- if not hgbm or not any(char.isdigit() for char in hgbm):
- # 如果 hgbm 不含数字,返回 None
- return None
- if 'e+' in hgbm.lower():
- # 如果是科学计数法,截取前 8 位数字
- return hgbm.split('e+')[0][:8]
- if re.search(hgbm_pattern, hgbm):
- return hgbm.replace('.','')
- hgbm = re.sub(r'[\.;;,,/\s+]', ',', hgbm)
- hgbm = hgbm.strip(')&\'_')
- if '-' in hgbm:
- if len(hgbm) >= 12:
- hgbm = hgbm.replace('-',',')
- else:
- hgbm = hgbm.replace('-', '')
- return hgbm
- return None
if __name__ == '__main__':
    # Ad-hoc smoke run: print each sample input next to its cleaned form.
    test_list = [
        '843290108512e+31',
        '6212.90.00.000A',
        '6212.90.00.000AQ',
        '6212.90.00.000A1',
        '94069/730830/854449',
        '940161000019 940360100000 940350000019',
        '630532;630532,630532,630532,630532,630532,630532,630532,630532,630532',
        '320649/320690/38245905/320619',
        '39173900-39209990-39269097',
        '68030000)',
        '39262000001',
        '9403-6000',
        '33011920&',
        '34013000_',
        '\'39262000001',
        '69072100+44152020',
    ]
    # Loop variable is deliberately not called `str`, to avoid shadowing the builtin.
    for raw in test_list:
        print(f'{raw}---->{hgbm_clean(raw)}')
|