"""Cleaning helpers and Spark UDFs for contact, e-mail and website fields.

The module exposes three PySpark UDFs that decode JSON-array strings, plus a
set of plain-Python helpers that normalise contact names, e-mail match levels,
website URLs and derive MD5-based contact ids.
"""
import hashlib
import json
import re
from typing import Optional

from pyspark.sql.functions import udf
from pyspark.sql.types import *

# Characters stripped from contact names by ``remove_special_chars``.
# Stored as a set so membership tests are O(1) and duplicates cannot creep in.
special_chars = {
    # ASCII punctuation
    '.', ',', '-', '(', ')', '@', '?', '`', '#', '+', '!', '$', '|', ':',
    '/', ';', '*', '<', '>', '%', '^', '&', '_', '[', ']', '{', '}', '\\',
    '~', '=', "'", '"',
    # Typographic quotes, angle brackets and spacing diacritics
    '‘', '’', '“', '”', '«', '»', '《', '》', '¨', '´',
    # Currency, math and miscellaneous symbols
    '±', '°', 'µ', '¶', '·', '€', '£', '¥', '¢', '×', '÷', '¬', '…',
    # Arrows and comparison operators
    '→', '←', '↑', '↓', '↔', '⇒', '⇐', '≈', '≠', '≤', '≥',
}

# Optional URL scheme and/or "www." prefix at the very start of a website.
# Case-insensitive because URL schemes and host names are case-insensitive.
_WEBSITE_PREFIX_RE = re.compile(r'^(https?://)?(www\.)?', re.IGNORECASE)

# Suffix markers removed by ``clean_shh_ep``.
_SHH_EP_SUFFIXES = ('^EMX', '^ESD')


def _load_json_list(json_str):
    """Decode ``json_str`` and return the result only if it is a JSON array.

    Empty/None input, malformed JSON, and valid JSON whose top-level value is
    not an array all yield an empty list, so callers always receive a list.
    Decoding problems are printed rather than raised so a single bad row does
    not abort the caller.
    """
    if not json_str:
        return []
    try:
        parsed = json.loads(json_str)
    except json.JSONDecodeError as e:
        # Malformed JSON text.
        print(f"JSONDecodeError: {e}")
        return []
    except Exception as e:
        # Anything else, e.g. a non-string argument.
        print(f"Unexpected error: {e}")
        return []
    # A JSON object or scalar is not usable where an array is expected.
    return parsed if isinstance(parsed, list) else []


@udf(returnType=ArrayType(StringType()))
def str_to_json_arr(json_str: str) -> list:
    """Split a JSON-array string into a list of JSON strings, one per element.

    Each element is re-serialised with ``ensure_ascii=False`` so non-ASCII
    text is kept verbatim. Returns ``[]`` for empty, invalid or non-array
    input.
    """
    return [json.dumps(item, ensure_ascii=False)
            for item in _load_json_list(json_str)]


@udf(returnType=ArrayType(StringType()))
def str_to_arr(json_str: str) -> list:
    """Decode a JSON-array string into a list.

    Returns ``[]`` for empty, invalid or non-array input.
    """
    return _load_json_list(json_str)


@udf(returnType=ArrayType(MapType(StringType(), StringType())))
def str_to_map_arr(json_str: str) -> list:
    """Decode a JSON-array string into a list (declared as an array of maps).

    Returns ``[]`` for empty, invalid or non-array input.
    """
    return _load_json_list(json_str)


def merge_ws(text: str) -> Optional[str]:
    """Collapse every run of whitespace to one space and trim both ends.

    Returns ``None`` when ``text`` is empty or ``None``.
    """
    if not text:
        return None
    return ' '.join(text.split())


def uppercase_first_letter(word):
    """Lower-case ``word`` and then upper-case its first character."""
    word = word.lower()
    return word[:1].upper() + word[1:]


def remove_special_chars(word):
    """Return ``word`` without any character listed in ``special_chars``."""
    return ''.join(ch for ch in word if ch not in special_chars)


def clean_contact_name(contact_name):
    """Normalise a contact name.

    Each whitespace-separated token has its special characters removed and is
    re-cased to "Xxxx"; tokens that end up empty are dropped and the rest are
    joined with single spaces.

    Returns ``None`` when ``contact_name`` is empty or ``None``. A name made
    up only of special characters yields an empty string.
    """
    if not contact_name:
        return None
    tokens = (uppercase_first_letter(remove_special_chars(token))
              for token in contact_name.split())
    # Tokens consisting solely of special characters are now empty; skip them
    # so no doubled or trailing spaces remain.
    return ' '.join(token for token in tokens if token)


def clean_email_status(source, match_level):
    """Map a provider-specific match level to a unified e-mail status.

    :param source: provider key; ``'shh'`` and ``'snovio'`` are recognised.
    :param match_level: for ``'shh'`` a numeric score (number or numeric
        string); for ``'snovio'`` a status keyword.
    :return: ``'PERFECT_MATCH'``, ``'SPECULATION_VERIFICATION'``,
        ``'POSSIBLE_MATCH'`` or ``'LOW_MATCH'``; ``None`` when ``match_level``
        is falsy, the source is not recognised, or an ``'shh'`` score is not
        numeric.
    """
    if not match_level:
        return None

    if source == 'shh':
        try:
            score = float(match_level)
        except (TypeError, ValueError):
            # Not a number (or not even a string/number).
            return None
        if score == 1:
            return 'PERFECT_MATCH'
        if score in (2, -1):
            return 'SPECULATION_VERIFICATION'
        if 0.9 <= score < 1:
            return 'POSSIBLE_MATCH'
        return 'LOW_MATCH'

    if source == 'snovio':
        if match_level in ('valid', 'verified'):
            return 'PERFECT_MATCH'
        if match_level in ('not_valid', 'greylisted', 'notVerified'):
            return 'SPECULATION_VERIFICATION'
        return 'LOW_MATCH'

    return None


def clean_shh_ep(ep):
    """Strip a trailing ``^EMX`` or ``^ESD`` marker from ``ep``.

    Returns ``ep`` unchanged when it carries neither marker, and ``None``
    when ``ep`` is empty or ``None``.
    """
    if not ep:
        return None
    if ep.endswith(_SHH_EP_SUFFIXES):
        # Both markers are exactly four characters long.
        return ep[:-4]
    return ep


def get_shh_email_status(inv, level):
    """Grade an e-mail as ``'high'``, ``'middle'`` or ``'low'``.

    :param inv: when truthy the result is always ``'low'``.
    :param level: integer-like level. With a falsy ``inv``: ``level <= -7``
        gives ``'high'``, ``-7 < level <= 0`` gives ``'middle'``, anything
        else (positive, ``None`` or not convertible to ``int``) gives
        ``'low'``.
    """
    if level is None:
        return 'low'
    try:
        level = int(level)
    except (TypeError, ValueError):
        return 'low'
    if inv or level > 0:
        return 'low'
    return 'high' if level <= -7 else 'middle'


def extract_name_from_email(email):
    """Return the first 20 characters of the part of ``email`` before ``@``.

    Returns ``None`` when ``email`` is empty/``None`` or contains no ``@``.
    """
    if email and '@' in email:
        return email.split('@')[0][:20]
    return None


def generate_md5_hash(input_str: str):
    """Return the hexadecimal MD5 digest of ``input_str`` encoded as UTF-8."""
    md5_hash = hashlib.md5(input_str.encode('utf-8'))
    return md5_hash.hexdigest()


def generate_ctc_id(tid, name, position):
    """Build a contact id: the MD5 of ``tid``, cleaned name and position.

    The hashed text is ``"{tid}-{name}-{position}"``, or ``"{tid}-{name}"``
    when ``position`` is falsy, where ``name`` has first been passed through
    ``clean_contact_name``.

    Returns ``None`` when ``tid`` is falsy or the cleaned name is empty.
    """
    name = clean_contact_name(name)
    if not tid or not name:
        return None
    if position:
        input_str = f"{tid}-{name}-{position}"
    else:
        input_str = f"{tid}-{name}"
    return generate_md5_hash(input_str)


def generate_ctc_id_fake_name(tid, name, position):
    """Build a contact id exactly as ``generate_ctc_id`` does.

    Kept as a separate entry point for existing callers; it delegates so the
    two can never drift apart.
    """
    return generate_ctc_id(tid, name, position)


def clean_website(website):
    """Normalise a website URL to a bare host/path.

    Surrounding whitespace, a leading ``http://`` or ``https://`` scheme, a
    leading ``www.`` and one trailing ``/`` are removed. Scheme and ``www.``
    are matched case-insensitively.

    :param website: raw website string.
    :return: the cleaned website, or ``None`` when ``website`` is empty,
        ``None`` or whitespace only.
    """
    if not website:
        return None
    # Strip first: the prefix pattern is anchored at the start of the string,
    # so leading whitespace would otherwise stop it from matching.
    website = website.strip()
    if not website:
        return None
    website = _WEBSITE_PREFIX_RE.sub('', website, count=1)
    if website.endswith('/'):
        website = website[:-1]
    return website


def _demo_clean_website():
    """Print ``clean_website`` results for a handful of sample URLs."""
    cases = [
        'http://aaa.com',
        'https://aaa.com',
        'https://www.aaa.com',
        'http://www.aaa.com',
        'www.aaa.com',
        'www.aaa.com/asda/asda',
        'www.aaa.com/asda/asda/',
        'www.aaa.com/',
        'https://locations.jackinthebox.com/us/wa/blaine/8140-birch-bay-square-st?utm_source=bing\u0026utm_medium=local\u0026utm_campaign=bing-local'
    ]
    for case in cases:
        print((case), '->', clean_website(case))


def _demo_cleaners():
    """Print results of the name, status and e-mail helpers on sample data."""
    case_list = ['andy zhu',
                 'henry liu',
                 'JENS HESSELBERG LUND',
                 ' TONY li',
                 ' Boy. YU .',
                 'MARK KLINDERA @chief executive officer!'
                 ]
    for case in case_list:
        res = clean_contact_name(case)
        print("{:<30} -> |{}|".format(case, res))

    snovio_case_list = ['unknown', 'valid', 'not_valid', 'greylisted',
                        'abcsd', '']
    shh_case_list = ['', 'abc', '.81', '1', '.89', '.92', '.97', '2', '.85',
                     '.93', '.95', '.8', '.9', '.98', '.84', '-1'
                     ]
    for case in snovio_case_list:
        res = clean_email_status('snovio', case)
        print("{:<30} -> |{}|".format(case, res))
    for case in shh_case_list:
        res = clean_email_status('shh', case)
        print("{:<30} -> |{}|".format(case, res))

    ep_case_list = ['daze@exemail.com.au^ESD',
                    'ub3erl33trisser@hotmail.com^ESD',
                    'noel.thompson@orange.net^EMX',
                    'Potso.Makgatho@eskom.co.za^ESD',
                    'dcsupplychain@yahoo.co.uk^ESD',
                    'sunny.patel@i2ieventsgroup.com^EMX',
                    '_zig_@bellsouth.net^ESD',
                    'manish.pandey@ge.com^ESD',
                    'amy_salzman@comcast.com^ESD',
                    'kpretzer@thestrategicsolution.com^ESD']
    for case in ep_case_list:
        res = clean_shh_ep(case)
        print("{:<30} -> |{}|".format(case, res))

    print(extract_name_from_email('12345678901234567890abcdef@q.com'))
    print(extract_name_from_email('12345678901234567890abcdefq.com'))
    print(get_shh_email_status('eae', 0))


if __name__ == '__main__':
    # Only the website demo runs when the file is executed as a script;
    # call _demo_cleaners() manually for the remaining spot checks.
    _demo_clean_website()