| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419 |
- import hashlib
- import json
- import re
- from pyspark.sql.functions import udf
- from pyspark.sql.types import *
# Characters removed from contact names by remove_special_chars().
#
# The previous literal listed most entries two or three times and its second
# half contained entries that do not parse (an unterminated ''' and a '\'
# that escapes its own closing quote) plus empty strings, which can never
# match a single character.  The set below holds every distinct character
# from that list exactly once.
special_chars = set(
    ".,-()@?#+!$|:/;*<>%^&_[]{}\\~="  # ASCII punctuation
    "'\"`"                             # ASCII quotes and backtick
    "‘’“”«»《》"                       # typographic quotes / angle brackets
    "±°µ¶·¬…¨´×÷"                      # miscellaneous Latin-1 symbols
    "€£¥¢"                             # currency signs
    "→←↑↓↔⇒⇐≈≠≤≥"                     # arrows and math relations
)
@udf(returnType=ArrayType(StringType()))
def str_to_json_arr(json_str: str) -> list:
    """Split a JSON document into a list of JSON-encoded element strings.

    *json_str* is parsed with ``json.loads`` and every item obtained by
    iterating the result is serialized again with ``json.dumps`` (non-ASCII
    characters are kept as-is).  An empty/``None`` input, or any parsing or
    iteration error, yields an empty list; errors are reported via ``print``.
    """
    if not json_str:
        return []
    try:
        parsed = json.loads(json_str)
        return [json.dumps(item, ensure_ascii=False) for item in parsed]
    except json.JSONDecodeError as err:
        # Handle JSON parse errors.
        print(f"JSONDecodeError: {err}")
    except Exception as err:
        # Handle any other error.
        print(f"Unexpected error: {err}")
    return []
@udf(returnType=ArrayType(StringType()))
def str_to_arr(json_str: str) -> list:
    """Parse *json_str* with ``json.loads`` and return the decoded value.

    An empty/``None`` input or a failed parse yields an empty list; errors
    are reported via ``print``.  The decoded value is returned as-is, so it
    is only a list when the JSON document itself is an array.
    """
    if not json_str:
        return []
    try:
        return json.loads(json_str)
    except json.JSONDecodeError as err:
        # Handle JSON parse errors.
        print(f"JSONDecodeError: {err}")
    except Exception as err:
        # Handle any other error.
        print(f"Unexpected error: {err}")
    return []
@udf(returnType=ArrayType(MapType(StringType(), StringType())))
def str_to_map_arr(json_str: str) -> list:
    """Parse *json_str* with ``json.loads`` and return the decoded value.

    The UDF is declared as returning an array of string-to-string maps.
    An empty/``None`` input or a failed parse yields an empty list; errors
    are reported via ``print``.
    """
    if not json_str:
        return []
    try:
        return json.loads(json_str)
    except json.JSONDecodeError as err:
        # Handle JSON decoding error.
        print(f"JSONDecodeError: {err}")
    except Exception as err:
        # Handle other exceptions.
        print(f"Unexpected error: {err}")
    # Reached only after one of the handlers above has run.
    return []
def merge_ws(text: str):
    """Collapse each run of whitespace in *text* into a single space.

    Leading and trailing whitespace disappears because ``str.split()``
    discards it.  Returns ``None`` when *text* is empty or ``None``.
    """
    if not text:
        return None
    words = text.split()
    return ' '.join(words)
def uppercase_first_letter(word):
    """Return *word* lower-cased with only its first character upper-cased.

    An empty string is returned unchanged.
    """
    lowered = word.lower()
    head, tail = lowered[:1], lowered[1:]
    return head.upper() + tail
def remove_special_chars(word):
    """Return *word* without any character found in ``special_chars``."""
    kept = []
    for ch in word:
        if ch in special_chars:
            continue
        kept.append(ch)
    return ''.join(kept)
def clean_contact_name(contact_name):
    """Normalize a contact name.

    Each whitespace-separated token has its special characters removed and
    is then lower-cased with an upper-case first letter.  Tokens that end up
    empty are dropped and the rest are joined with single spaces.  Returns
    ``None`` when *contact_name* is empty or ``None``.
    """
    if not contact_name:
        return None
    tokens = (
        uppercase_first_letter(remove_special_chars(token))
        for token in contact_name.split()
    )
    joined = ' '.join(tokens)
    # Re-split so tokens that became empty do not leave doubled spaces.
    return ' '.join(joined.split())
def clean_email_status(source, match_level):
    """Map a source-specific match level onto a common status label.

    For source ``'shh'`` the level is read as a number: ``1`` gives
    ``'PERFECT_MATCH'``, ``2`` or ``-1`` gives ``'SPECULATION_VERIFICATION'``,
    values in ``[0.9, 1)`` give ``'POSSIBLE_MATCH'`` and anything else gives
    ``'LOW_MATCH'``; a level that cannot be parsed as a float gives ``None``.

    For source ``'snovio'`` the level is compared as text: ``'valid'`` /
    ``'verified'`` give ``'PERFECT_MATCH'``; ``'not_valid'``,
    ``'greylisted'`` and ``'notVerified'`` give
    ``'SPECULATION_VERIFICATION'``; anything else gives ``'LOW_MATCH'``.

    Returns ``None`` for an empty/``None`` level or any other source.
    """
    if not match_level:
        return None

    if source == 'shh':
        try:
            score = float(match_level)
        except ValueError:
            return None
        if score == 1:
            return 'PERFECT_MATCH'
        if score in (2, -1):
            return 'SPECULATION_VERIFICATION'
        if 0.9 <= score < 1:
            return 'POSSIBLE_MATCH'
        return 'LOW_MATCH'

    if source == 'snovio':
        if match_level in ('valid', 'verified'):
            return 'PERFECT_MATCH'
        if match_level in ('not_valid', 'greylisted', 'notVerified'):
            return 'SPECULATION_VERIFICATION'
        return 'LOW_MATCH'

    return None
def clean_shh_ep(ep):
    """Strip a trailing ``^EMX`` or ``^ESD`` marker from *ep*.

    Values without either suffix are returned unchanged; an empty or
    ``None`` value yields ``None``.
    """
    if not ep:
        return None
    if ep.endswith(('^EMX', '^ESD')):
        # Both markers are four characters long.
        return ep[:-4]
    return ep
def get_shh_email_status(inv, level):
    """Derive a ``'low'`` / ``'middle'`` / ``'high'`` label from *level*.

    *level* is converted with ``int()``.  When *inv* is falsy, a level of
    ``-7`` or below gives ``'high'`` and a level in ``-6..0`` gives
    ``'middle'``.  Every other case gives ``'low'``: a truthy *inv*, a
    positive level, a ``None`` level, or a level ``int()`` rejects with
    ``ValueError``.
    """
    if level is None:
        return 'low'
    try:
        numeric = int(level)
        if numeric > 0:
            return 'low'
        if inv:
            return 'low'
        return 'high' if numeric <= -7 else 'middle'
    except ValueError:
        return 'low'
def extract_name_from_email(email):
    """Return up to the first 20 characters of the part of *email* before ``@``.

    Returns ``None`` when *email* is empty, ``None`` or contains no ``@``.
    """
    if not email or '@' not in email:
        return None
    local_part = email.partition('@')[0]
    return local_part[:20]
def generate_md5_hash(input_str: str):
    """Return the hexadecimal MD5 digest of *input_str* encoded as UTF-8."""
    encoded = input_str.encode('utf-8')
    return hashlib.md5(encoded).hexdigest()
def generate_ctc_id(tid, name, position):
    """Build an MD5-based id from *tid*, the cleaned *name* and *position*.

    The name is first normalized with ``clean_contact_name``.  The hashed
    key is ``"<tid>-<name>-<position>"``, or ``"<tid>-<name>"`` when
    *position* is empty.  Returns ``None`` when *tid* or the cleaned name
    is empty.
    """
    cleaned_name = clean_contact_name(name)
    if not (tid and cleaned_name):
        return None
    if position:
        key = f"{tid}-{cleaned_name}-{position}"
    else:
        key = f"{tid}-{cleaned_name}"
    return generate_md5_hash(key)
def generate_ctc_id_fake_name(tid, name, position):
    """Build an MD5-based id from *tid*, the cleaned *name* and *position*.

    The name is normalized with ``clean_contact_name``; the key hashed is
    ``"<tid>-<name>"`` with ``"-<position>"`` appended when *position* is
    non-empty.  Returns ``None`` when *tid* or the cleaned name is empty.
    """
    cleaned_name = clean_contact_name(name)
    if not tid or not cleaned_name:
        return None
    key = f"{tid}-{cleaned_name}-{position}" if position else f"{tid}-{cleaned_name}"
    return generate_md5_hash(key)
def clean_website(website):
    """Normalize a company website value to a bare ``host/path`` string.

    Surrounding whitespace, a leading ``http://`` or ``https://`` scheme, a
    leading ``www.`` and one trailing ``/`` are removed.  The scheme and
    ``www.`` prefix are matched case-insensitively.

    :param website: raw website value (per the original docstring, taken
        from a crawler API response); may be ``None`` or blank
    :return: the normalized website; a ``None``, empty or whitespace-only
        value is returned unchanged
    """
    if not website or not website.strip():
        return website
    # Strip first: the prefix pattern is anchored at the start of the string
    # and the slash check looks at the very last character, so padding
    # whitespace would otherwise defeat both.
    cleaned = website.strip()
    # Remove http://, https:// and www.
    cleaned = re.sub(r'^(https?://)?(www\.)?', '', cleaned, flags=re.IGNORECASE)
    if cleaned.endswith('/'):
        cleaned = cleaned[:-1]
    return cleaned
if __name__ == '__main__':
    # Manual smoke check: print every sample URL next to its cleaned form.
    sample_urls = (
        'http://aaa.com',
        'https://aaa.com',
        'https://www.aaa.com',
        'http://www.aaa.com',
        'www.aaa.com',
        'www.aaa.com/asda/asda',
        'www.aaa.com/asda/asda/',
        'www.aaa.com/',
        'https://locations.jackinthebox.com/us/wa/blaine/8140-birch-bay-square-st?utm_source=bing\u0026utm_medium=local\u0026utm_campaign=bing-local',
    )
    for url in sample_urls:
        print(f"{url} -> {clean_website(url)}")
# Manual checks for the name / e-mail helpers.  The guard compares against
# '__main__1', which __name__ never equals, so this section stays disabled
# until the guard is edited by hand.
if __name__ == '__main__1':
    case_list = ['andy zhu',
                 'henry liu',
                 'JENS HESSELBERG LUND',
                 '  TONY   li',
                 '  Boy. YU  .',
                 'MARK KLINDERA @chief executive officer!'
                 ]
    for case in case_list:
        res = clean_contact_name(case)
        print("{:<30} -> |{}|".format(case, res))

    # A comma was missing after 'abcsd', which silently concatenated it with
    # the following '' and dropped the empty-string case from the list.
    snovio_case_list = ['unknown',
                        'valid',
                        'not_valid',
                        'greylisted',
                        'abcsd',
                        '']
    shh_case_list = ['',
                     'abc',
                     '.81',
                     '1',
                     '.89',
                     '.92',
                     '.97',
                     '2',
                     '.85',
                     '.93',
                     '.95',
                     '.8',
                     '.9',
                     '.98',
                     '.84',
                     '-1'
                     ]
    for case in snovio_case_list:
        res = clean_email_status('snovio', case)
        print("{:<30} -> |{}|".format(case, res))
    for case in shh_case_list:
        res = clean_email_status('shh', case)
        print("{:<30} -> |{}|".format(case, res))

    ep_case_list = ['daze@exemail.com.au^ESD',
                    'ub3erl33trisser@hotmail.com^ESD',
                    'noel.thompson@orange.net^EMX',
                    'Potso.Makgatho@eskom.co.za^ESD',
                    'dcsupplychain@yahoo.co.uk^ESD',
                    'sunny.patel@i2ieventsgroup.com^EMX',
                    '_zig_@bellsouth.net^ESD',
                    'manish.pandey@ge.com^ESD',
                    'amy_salzman@comcast.com^ESD',
                    'kpretzer@thestrategicsolution.com^ESD']
    for case in ep_case_list:
        res = clean_shh_ep(case)
        print("{:<30} -> |{}|".format(case, res))

    print(extract_name_from_email('12345678901234567890abcdef@q.com'))
    print(extract_name_from_email('12345678901234567890abcdefq.com'))
    print(get_shh_email_status('eae', 0))
|