# ctc_common.py — shared cleaning helpers and Spark UDFs for contact (CTC) data pipelines.
import hashlib
import json
import re

from pyspark.sql.functions import udf
from pyspark.sql.types import *
  6. special_chars = ['.',
  7. ',',
  8. '-',
  9. '(',
  10. ')',
  11. '@',
  12. '?',
  13. '‘',
  14. '’',
  15. '“',
  16. '”',
  17. '`',
  18. '#',
  19. '+',
  20. '!',
  21. '$',
  22. '|',
  23. ':',
  24. '/',
  25. ';',
  26. '*',
  27. '《',
  28. '》',
  29. '<',
  30. '>',
  31. '`',
  32. '#',
  33. '+',
  34. '!',
  35. '$',
  36. '|',
  37. ':',
  38. '/',
  39. ';',
  40. '*',
  41. '《',
  42. '》',
  43. '<',
  44. '>',
  45. '%',
  46. '^',
  47. '&',
  48. '_',
  49. '[',
  50. ']',
  51. '{',
  52. '}',
  53. '\\',
  54. '~',
  55. '=',
  56. "'",
  57. '±',
  58. '°',
  59. '«',
  60. '»',
  61. 'µ',
  62. '¶',
  63. '·',
  64. '€',
  65. '£',
  66. '¥',
  67. '¢',
  68. '×',
  69. '÷',
  70. '±',
  71. '¬',
  72. '…',
  73. '→',
  74. '←',
  75. '↑',
  76. '↓',
  77. '↔',
  78. '⇒',
  79. '⇐',
  80. '≈',
  81. '≠',
  82. '≤',
  83. '≥',
  84. '¨',
  85. '´',
  86. '.',
  87. ',',
  88. '-',
  89. '(',
  90. ')',
  91. '@',
  92. '?',
  93. "'",
  94. "'",
  95. '"',
  96. '"',
  97. ''',
  98. '#',
  99. '+',
  100. '!',
  101. '$',
  102. '|',
  103. ':',
  104. '/',
  105. ';',
  106. '*',
  107. '',
  108. '',
  109. '<',
  110. '>',
  111. "'",
  112. '#',
  113. '+',
  114. '!',
  115. '$',
  116. '|',
  117. ':',
  118. '/',
  119. ';',
  120. '*',
  121. '',
  122. '',
  123. '<',
  124. '>',
  125. '%',
  126. '^',
  127. '&',
  128. '_',
  129. '[',
  130. ']',
  131. '{',
  132. '}',
  133. '\',
  134. '~',
  135. '=',
  136. "'",
  137. '±',
  138. '°',
  139. '«',
  140. '»',
  141. 'µ',
  142. '¶',
  143. '·',
  144. '€',
  145. '£',
  146. '¥',
  147. '¢',
  148. '×',
  149. '÷',
  150. '±',
  151. '¬',
  152. '…',
  153. '→',
  154. '←',
  155. '↑',
  156. '↓',
  157. '↔',
  158. '⇒',
  159. '⇐',
  160. '≈',
  161. '≠',
  162. '≤',
  163. '≥']
  164. special_chars = set(special_chars)
  165. @udf(returnType=ArrayType(StringType()))
  166. def str_to_json_arr(json_str: str) -> list:
  167. try:
  168. if json_str:
  169. res = []
  170. for j in json.loads(json_str):
  171. res.append(json.dumps(j, ensure_ascii=False))
  172. return res
  173. except json.JSONDecodeError as e:
  174. # 处理JSON解析错误
  175. print(f"JSONDecodeError: {e}")
  176. except Exception as e:
  177. # 处理其他异常
  178. print(f"Unexpected error: {e}")
  179. return []
  180. @udf(returnType=ArrayType(StringType()))
  181. def str_to_arr(json_str: str) -> list:
  182. try:
  183. if json_str:
  184. return json.loads(json_str)
  185. except json.JSONDecodeError as e:
  186. # 处理JSON解析错误
  187. print(f"JSONDecodeError: {e}")
  188. except Exception as e:
  189. # 处理其他异常
  190. print(f"Unexpected error: {e}")
  191. return []
  192. @udf(returnType=ArrayType(MapType(StringType(), StringType())))
  193. def str_to_map_arr(json_str: str) -> list:
  194. try:
  195. if json_str:
  196. return json.loads(json_str)
  197. return []
  198. except json.JSONDecodeError as e:
  199. # Handle JSON decoding error
  200. print(f"JSONDecodeError: {e}")
  201. return []
  202. except Exception as e:
  203. # Handle other exceptions
  204. print(f"Unexpected error: {e}")
  205. return []
  206. def merge_ws(text: str):
  207. if text:
  208. return ' '.join(text.split())
  209. return None
  210. def uppercase_first_letter(word):
  211. word = word.lower()
  212. return word[:1].upper() + word[1:]
  213. def remove_special_chars(word):
  214. return ''.join(ch for ch in word if ch not in special_chars)
  215. def clean_contact_name(contact_name):
  216. if contact_name:
  217. names = contact_name.split()
  218. cleaned_names = [remove_special_chars(name) for name in names]
  219. upper_names = [uppercase_first_letter(name) for name in cleaned_names]
  220. cleaned_names = ' '.join(upper_names)
  221. return ' '.join(cleaned_names.split())
  222. return None
  223. def clean_email_status(source, match_level):
  224. if match_level:
  225. if source == 'shh':
  226. try:
  227. match_level = float(match_level)
  228. if match_level == 1:
  229. return 'PERFECT_MATCH'
  230. elif match_level in (2, -1):
  231. return 'SPECULATION_VERIFICATION'
  232. elif match_level >= 0.9 and match_level < 1:
  233. return 'POSSIBLE_MATCH'
  234. else:
  235. return 'LOW_MATCH'
  236. except ValueError:
  237. return None
  238. elif source == 'snovio':
  239. if match_level in ('valid', 'verified'):
  240. return 'PERFECT_MATCH'
  241. elif match_level in ('not_valid', 'greylisted', 'notVerified'):
  242. return 'SPECULATION_VERIFICATION'
  243. else:
  244. return 'LOW_MATCH'
  245. return None
  246. def clean_shh_ep(ep):
  247. if ep:
  248. if ep.endswith('^EMX'):
  249. return ep[:-4]
  250. elif ep.endswith('^ESD'):
  251. return ep[:-4]
  252. else:
  253. return ep
  254. return None
  255. def get_shh_email_status(inv, level):
  256. if level is not None:
  257. try:
  258. level = int(level)
  259. if level <= -7:
  260. if inv:
  261. return 'low'
  262. else:
  263. return 'high'
  264. elif level <= 0:
  265. if inv:
  266. return 'low'
  267. else:
  268. return 'middle'
  269. except ValueError:
  270. return 'low'
  271. return 'low'
  272. def extract_name_from_email(email):
  273. if email and '@' in email:
  274. return email.split('@')[0][:20]
  275. return None
  276. def generate_md5_hash(input_str: str):
  277. md5_hash = hashlib.md5(input_str.encode('utf-8'))
  278. return md5_hash.hexdigest()
  279. def generate_ctc_id(tid, name, position):
  280. name = clean_contact_name(name)
  281. if not tid:
  282. return None
  283. if not name:
  284. return None
  285. if not position:
  286. input_str = f"{tid}-{name}"
  287. else:
  288. input_str = f"{tid}-{name}-{position}"
  289. return generate_md5_hash(input_str)
  290. def generate_ctc_id_fake_name(tid, name, position):
  291. name = clean_contact_name(name)
  292. if not tid:
  293. return None
  294. if not name:
  295. return None
  296. if not position:
  297. input_str = f"{tid}-{name}"
  298. else:
  299. input_str = f"{tid}-{name}-{position}"
  300. return generate_md5_hash(input_str)
  301. def clean_website(website):
  302. """
  303. 解析爬虫接口的响应,提取公司网址
  304. :param website: 爬虫接口的响应
  305. :return: 公司网址
  306. """
  307. if website and website.strip():
  308. # 去除 http://, https:// 和 www.
  309. website = re.sub(r'^(https?://)?(www\.)?', '', website)
  310. if website.endswith('/'):
  311. website = website[:-1]
  312. return website
  313. if __name__ == '__main__':
  314. cases = [
  315. 'http://aaa.com',
  316. 'https://aaa.com',
  317. 'https://www.aaa.com',
  318. 'http://www.aaa.com',
  319. 'www.aaa.com',
  320. 'www.aaa.com/asda/asda',
  321. 'www.aaa.com/asda/asda/',
  322. 'www.aaa.com/',
  323. 'https://locations.jackinthebox.com/us/wa/blaine/8140-birch-bay-square-st?utm_source=bing\u0026utm_medium=local\u0026utm_campaign=bing-local'
  324. ]
  325. for case in cases:
  326. print((case)
  327. , '->',
  328. clean_website(case))
  329. if __name__ == '__main__1':
  330. case_list = ['andy zhu',
  331. 'henry liu',
  332. 'JENS HESSELBERG LUND',
  333. ' TONY li',
  334. ' Boy. YU .',
  335. 'MARK KLINDERA @chief executive officer!'
  336. ]
  337. for case in case_list:
  338. res = clean_contact_name(case)
  339. print("{:<30} -> |{}|".format(case, res))
  340. snovio_case_list = ['unknown',
  341. 'valid',
  342. 'not_valid',
  343. 'greylisted',
  344. 'abcsd'
  345. '']
  346. shh_case_list = ['',
  347. 'abc',
  348. '.81',
  349. '1',
  350. '.89',
  351. '.92',
  352. '.97',
  353. '2',
  354. '.85',
  355. '.93',
  356. '.95',
  357. '.8',
  358. '.9',
  359. '.98',
  360. '.84',
  361. '-1'
  362. ]
  363. for case in snovio_case_list:
  364. res = clean_email_status('snovio', case)
  365. print("{:<30} -> |{}|".format(case, res))
  366. for case in shh_case_list:
  367. res = clean_email_status('shh', case)
  368. print("{:<30} -> |{}|".format(case, res))
  369. ep_case_list = ['daze@exemail.com.au^ESD',
  370. 'ub3erl33trisser@hotmail.com^ESD',
  371. 'noel.thompson@orange.net^EMX',
  372. 'Potso.Makgatho@eskom.co.za^ESD',
  373. 'dcsupplychain@yahoo.co.uk^ESD',
  374. 'sunny.patel@i2ieventsgroup.com^EMX',
  375. '_zig_@bellsouth.net^ESD',
  376. 'manish.pandey@ge.com^ESD',
  377. 'amy_salzman@comcast.com^ESD',
  378. 'kpretzer@thestrategicsolution.com^ESD']
  379. for case in ep_case_list:
  380. res = clean_shh_ep(case)
  381. print("{:<30} -> |{}|".format(case, res))
  382. print(extract_name_from_email('12345678901234567890abcdef@q.com'))
  383. print(extract_name_from_email('12345678901234567890abcdefq.com'))
  384. print(get_shh_email_status('eae', 0))