cts_common.py 1.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445
  1. import json
  2. import re
  3. from pyspark.sql.functions import udf
  4. from pyspark.sql.types import *
  @udf(returnType=ArrayType(StringType()))
  def str_to_arr(json_str: str) -> list:
      """Decode a JSON array string into a list (registered as a Spark UDF).

      Args:
          json_str: JSON text expected to encode an array; may be None or empty.

      Returns:
          The decoded list. An empty list is returned when the input is
          None/empty, or when the decoded JSON value is not an array
          (for example ``null``, an object, or a bare scalar).

      Raises:
          json.JSONDecodeError: If ``json_str`` is non-empty but not valid JSON.
      """
      if not json_str:
          return []
      parsed = json.loads(json_str)
      # The declared return type is an array, so any other decoded value
      # (None, dict, str, number) is normalised to an empty list.
      return parsed if isinstance(parsed, list) else []
  @udf(returnType=ArrayType(MapType(StringType(), StringType())))
  def str_to_map_arr(json_str: str) -> list:
      """Decode a JSON array-of-objects string into a list (registered as a Spark UDF).

      Args:
          json_str: JSON text expected to encode an array of objects; may be
              None or empty.

      Returns:
          The decoded list. An empty list is returned when the input is
          None/empty, or when the decoded JSON value is not an array
          (for example ``null``, a single object, or a bare scalar).

      Raises:
          json.JSONDecodeError: If ``json_str`` is non-empty but not valid JSON.
      """
      if not json_str:
          return []
      parsed = json.loads(json_str)
      # The declared return type is an array of maps, so a top-level value that
      # is not a list is normalised to an empty list.
      return parsed if isinstance(parsed, list) else []
  def merge_ws(text: str):
      """Collapse every run of whitespace in ``text`` into a single space.

      Leading and trailing whitespace is dropped as well. ``None`` and the
      empty string both yield ``None``; a string made up only of whitespace
      yields ``''`` because it is non-empty but contains no words.
      """
      if not text:
          return None
      words = text.split()
      return ' '.join(words)
  @udf(returnType=ArrayType(StringType()))
  def explode_str_to_arr(text: str) -> list:
      """Expand ``text`` into its prefixes of length 8 and above (Spark UDF).

      Returns an empty list for ``None`` and a one-element list holding
      ``text`` itself when it has 8 characters or fewer. Longer strings
      produce the full string first, followed by each successively shorter
      prefix, stopping at the 8-character prefix.
      """
      if text is None:
          return []

      shortest = 8
      total = len(text)
      if total <= shortest:
          return [text]

      # Longer than 8 characters: walk back from the end, dropping one trailing
      # character per step, and collect every resulting prefix.
      return [text[:end] for end in range(total, shortest - 1, -1)]
  def remove_special_char(text, char):
      """Strip one trailing occurrence of ``char`` from ``text``.

      Args:
          text: String to trim; ``None`` is passed through unchanged.
          char: Suffix to remove. It may be longer than one character, or a
              tuple of candidate suffixes (as accepted by ``str.endswith``),
              in which case the first matching candidate is removed.

      Returns:
          ``text`` without the matched suffix, or ``text`` unchanged when it
          is ``None``, when ``char`` is empty, or when no suffix matches.
      """
      if text is None or not char:
          return text
      candidates = char if isinstance(char, tuple) else (char,)
      for suffix in candidates:
          # Skip empty candidates: every string "ends with" '' and slicing
          # with [:-0] would wipe the whole string.
          if suffix and text.endswith(suffix):
              # Slice by the suffix length so multi-character suffixes are
              # removed completely, not just their last character.
              return text[:-len(suffix)]
      return text
  if __name__ == '__main__':
      # Manual smoke test for the prefix expansion.
      # The decorated name is a Spark UDF wrapper: calling it with a Python
      # string would build a Column expression instead of running the logic.
      # `.func` is the plain Python function the wrapper exposes, so this prints
      # the actual list of prefixes.
      arr = explode_str_to_arr.func('fsdfsafas')
      print(arr)