cts_data_clean.py 1.5 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152
  1. import codecs
  2. import re
  3. import json
  4. from pyspark.sql.functions import udf
  5. from pyspark.sql.types import ArrayType, StringType
  6. hgbm_pattern=r'[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+[a-zA-Z]+$'
  7. def hgbm_clean(hgbm):
  8. if hgbm:
  9. if not hgbm or not any(char.isdigit() for char in hgbm):
  10. # 如果 hgbm 不含数字,返回 None
  11. return None
  12. if 'e+' in hgbm.lower():
  13. # 如果是科学计数法,截取前 8 位数字
  14. return hgbm.split('e+')[0][:8]
  15. if re.search(hgbm_pattern, hgbm):
  16. return hgbm.replace('.','')
  17. hgbm = re.sub(r'[\.;;,,/\s+]', ',', hgbm)
  18. hgbm = hgbm.strip(')&\'_')
  19. if '-' in hgbm:
  20. if len(hgbm) >= 12:
  21. hgbm = hgbm.replace('-',',')
  22. else:
  23. hgbm = hgbm.replace('-', '')
  24. return hgbm
  25. return None
  26. if __name__ == '__main__':
  27. test_list = [
  28. '843290108512e+31',
  29. '6212.90.00.000A',
  30. '6212.90.00.000AQ',
  31. '6212.90.00.000A1',
  32. '94069/730830/854449',
  33. '940161000019 940360100000 940350000019',
  34. '630532;630532,630532,630532,630532,630532,630532,630532,630532,630532',
  35. '320649/320690/38245905/320619',
  36. '39173900-39209990-39269097',
  37. '68030000)',
  38. '39262000001',
  39. '9403-6000',
  40. '33011920&',
  41. '34013000_',
  42. '\'39262000001',
  43. '69072100+44152020'
  44. ]
  45. for str in test_list:
  46. print(f'{str}---->{hgbm_clean(str)}')