spark_tid_match_udf.py 2.3 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374
"""
Batch matching of tids.
"""
  4. import hashlib
  5. import json
  6. from functools import lru_cache
  7. from dw_base.spark.udf.enterprise.unique.ent_offline_udf_indonesia import clean_company_name_idn, generate_tid_idn
  8. from dw_base.utils.tid_utils import TidGeneratorFactory
# Module-level lookup table read and written by cache_tid. Keys have the form
# "<name>--<country>"; values are a matched tid, or '' when matching found nothing.
mapping = {}
# Shared generator instance; cache_tid calls its match_tid(name, country) method.
tid_generator = TidGeneratorFactory().createTidGenerator('Enterprise')
  11. def generate_tid(website, name, country_code3):
  12. if not name:
  13. return None
  14. if country_code3 in ['IDN']:
  15. cleaned_name = clean_company_name_idn(name)
  16. return match_tid(name, cleaned_name, country_code3)
  17. else:
  18. return old_generate_tid(website, name, country_code3)
  19. def generate_md5_hash(input_str: str):
  20. md5_hash = hashlib.md5(input_str.encode('utf-8'))
  21. return md5_hash.hexdigest()
  22. def old_generate_tid(website, name, country_code3):
  23. if not name:
  24. return None
  25. input_str = website if website else f"{name}-{country_code3 if country_code3 else ''}"
  26. return generate_md5_hash(input_str)
  27. def match_tid(name: str, cleaned_name: str, country: str):
  28. tid = cache_tid(name, cleaned_name, country)
  29. if not tid:
  30. return generate_tid_idn(cleaned_name, None, None)
  31. return tid
  32. @lru_cache(maxsize=1000000)
  33. def cache_tid(name: str, cleaned_name: str, country: str):
  34. key = '%s--%s' % (
  35. name if name else "",
  36. country if country else ""
  37. )
  38. cleaned_key = '%s--%s' % (cleaned_name if cleaned_name else "",
  39. country if country else "")
  40. tid = mapping.get(key) or mapping.get(cleaned_key)
  41. if tid is None:
  42. # 如果mapping里没有该tid,进行匹配
  43. tid = tid_generator.match_tid(name, country)
  44. if tid is None:
  45. # 如果匹配结果为null,则向mapping写入一个空字符串
  46. tid = tid_generator.match_tid(cleaned_name, country)
  47. if tid is None:
  48. mapping[key] = ''
  49. mapping[cleaned_key] = ''
  50. else:
  51. mapping[cleaned_key] = tid
  52. else:
  53. mapping[key] = tid
  54. elif tid == '':
  55. # 对于第一次没有匹配到tid的公司,第二次进入该方法会得到一个空字符串,此时应返回null
  56. return None
  57. return tid
# Manual smoke check: print the tid produced for one sample company name with country 'IDN'.
if __name__ == '__main__':
    print(generate_tid('', 'KENCANA LINTASINDO INTERNASIONAL', 'IDN'))