tianyu.chu
/
poyee-data-warehouse


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419
							import hashlib
import json
import re
from pyspark.sql.functions import udf
from pyspark.sql.types import *

# Characters stripped from text by remove_special_chars().
# Built directly as a set: membership is all that matters, so the repeated
# entries of the former list form are listed once.
# NOTE(review): ASCII '=' is not part of this set, only the full-width '＝'.
special_chars = {
    # ASCII punctuation
    '.', ',', '-', '(', ')', '@', '?', '`', '#', '+', '!', '$', '|', ':',
    '/', ';', '*', '<', '>', '%', '^', '&', '_', '[', ']', '{', '}', '\\',
    '~', "'", '"',
    # Curly quotes and CJK angle brackets
    '‘', '’', '“', '”', '《', '》',
    # Typographic, currency and math symbols
    '±', '°', '«', '»', 'µ', '¶', '·', '€', '£', '¥', '¢', '×', '÷', '¬',
    '¨', '´', '…',
    # Arrows and comparison operators
    '→', '←', '↑', '↓', '↔', '⇒', '⇐', '≈', '≠', '≤', '≥',
    # Full-width forms
    '．', '，', '－', '（', '）', '＠', '？', '＇', '＃', '＋', '！', '＄', '｜',
    '：', '／', '；', '＊', '＜', '＞', '％', '＾', '＆', '＿', '［', '］', '｛',
    '｝', '＼', '～', '＝',
    # The empty string is kept so the set contents stay unchanged; it can
    # never equal a single character, so it does not affect filtering.
    '',
}


@udf(returnType=ArrayType(StringType()))
def str_to_json_arr(json_str: str) -> list:
    """
    Parse a JSON string and re-serialise each item of the parsed value.

    Every item obtained by iterating the parsed value is dumped back to a
    JSON string with ``ensure_ascii=False``.

    :param json_str: JSON text; a falsy value (``None``, ``''``) yields ``[]``
    :return: list of JSON strings, or ``[]`` when the input is empty or any
        error occurs (the error is printed, not raised)
    """
    try:
        if json_str:
            return [json.dumps(item, ensure_ascii=False)
                    for item in json.loads(json_str)]
    except json.JSONDecodeError as err:
        # The text is not valid JSON
        print(f"JSONDecodeError: {err}")
    except Exception as err:
        # Any other failure, e.g. the parsed value cannot be iterated
        print(f"Unexpected error: {err}")
    return []


@udf(returnType=ArrayType(StringType()))
def str_to_arr(json_str: str) -> list:
    """
    Parse a JSON string and return the parsed value as-is.

    :param json_str: JSON text; a falsy value (``None``, ``''``) yields ``[]``
    :return: whatever ``json.loads`` produced, or ``[]`` when the input is
        empty or parsing fails (the error is printed, not raised)
    """
    parsed = []
    try:
        if json_str:
            parsed = json.loads(json_str)
    except json.JSONDecodeError as err:
        # The text is not valid JSON
        print(f"JSONDecodeError: {err}")
    except Exception as err:
        # Any other failure while parsing
        print(f"Unexpected error: {err}")
    return parsed


@udf(returnType=ArrayType(MapType(StringType(), StringType())))
def str_to_map_arr(json_str: str) -> list:
    """
    Parse a JSON string and return the parsed value as-is.

    :param json_str: JSON text; a falsy value (``None``, ``''``) yields ``[]``
    :return: whatever ``json.loads`` produced, or ``[]`` when the input is
        empty or parsing fails (the error is printed, not raised)
    """
    try:
        if json_str:
            return json.loads(json_str)
    except json.JSONDecodeError as err:
        # The text is not valid JSON
        print(f"JSONDecodeError: {err}")
    except Exception as err:
        # Any other failure while parsing
        print(f"Unexpected error: {err}")
    # Reached for empty input and after either error branch
    return []


def merge_ws(text: str):
    """
    Collapse every run of whitespace into one space and trim both ends.

    :param text: input text
    :return: the normalised text, or ``None`` when ``text`` is falsy
    """
    if not text:
        return None
    words = text.split()
    return ' '.join(words)


def uppercase_first_letter(word):
    """
    Lower-case the whole word, then upper-case its first character.

    :param word: input string (may be empty)
    :return: the re-cased string; an empty string stays empty
    """
    lowered = word.lower()
    head, tail = lowered[:1], lowered[1:]
    return head.upper() + tail


def remove_special_chars(word):
    """
    Drop every character of ``word`` that is listed in ``special_chars``.

    :param word: input string
    :return: ``word`` with the special characters removed, order preserved
    """
    kept = filter(lambda ch: ch not in special_chars, word)
    return ''.join(kept)


def clean_contact_name(contact_name):
    """
    Normalise a contact name.

    The name is split on whitespace; each token has its special characters
    removed and is re-cased so only its first letter is upper-case; the
    tokens are then joined with single spaces.

    :param contact_name: raw contact name
    :return: the cleaned name, or ``None`` when ``contact_name`` is falsy
    """
    if not contact_name:
        return None
    tokens = []
    for raw_token in contact_name.split():
        tokens.append(uppercase_first_letter(remove_special_chars(raw_token)))
    # A token made only of special characters is now empty; the split/join
    # round-trip collapses the extra spaces such tokens would leave behind.
    joined = ' '.join(tokens)
    return ' '.join(joined.split())


def clean_email_status(source, match_level):
    """
    Map a source-specific match level to a common email status label.

    For source ``'shh'`` the level is parsed as a float: ``1`` is
    ``PERFECT_MATCH``, ``2`` or ``-1`` is ``SPECULATION_VERIFICATION``,
    values in ``[0.9, 1)`` are ``POSSIBLE_MATCH`` and everything else is
    ``LOW_MATCH``. For source ``'snovio'`` the level is compared against
    fixed status strings.

    :param source: data source name (``'shh'`` or ``'snovio'``)
    :param match_level: raw match level from the source
    :return: the status label, or ``None`` when ``match_level`` is falsy,
        the source is not recognised, or an ``'shh'`` level is not a number
    """
    if not match_level:
        return None

    if source == 'shh':
        try:
            score = float(match_level)
        except ValueError:
            return None
        if score == 1:
            return 'PERFECT_MATCH'
        if score in (2, -1):
            return 'SPECULATION_VERIFICATION'
        if 0.9 <= score < 1:
            return 'POSSIBLE_MATCH'
        return 'LOW_MATCH'

    if source == 'snovio':
        if match_level in ('valid', 'verified'):
            return 'PERFECT_MATCH'
        if match_level in ('not_valid', 'greylisted', 'notVerified'):
            return 'SPECULATION_VERIFICATION'
        return 'LOW_MATCH'

    return None


def clean_shh_ep(ep):
    """
    Strip a trailing ``^EMX`` or ``^ESD`` marker from a value.

    :param ep: raw value, possibly ending in one of the two markers
    :return: the value without the marker, the value unchanged when no
        marker is present, or ``None`` when ``ep`` is falsy
    """
    if not ep:
        return None
    if ep.endswith(('^EMX', '^ESD')):
        # Both markers are four characters long
        return ep[:-4]
    return ep


def get_shh_email_status(inv, level):
    """
    Derive a ``'low'`` / ``'middle'`` / ``'high'`` status from a level.

    The level is converted with ``int``. A truthy ``inv`` always gives
    ``'low'``. Otherwise a level of -7 or below gives ``'high'``, a level
    from -6 to 0 gives ``'middle'`` and a positive level gives ``'low'``.

    :param inv: flag; any truthy value forces ``'low'``
    :param level: level value, convertible with ``int``
    :return: ``'low'``, ``'middle'`` or ``'high'``; ``'low'`` when ``level``
        is ``None`` or ``int`` raises ``ValueError``
    """
    if level is None:
        return 'low'
    try:
        numeric_level = int(level)
    except ValueError:
        return 'low'
    # Positive levels and flagged records are both rated 'low'
    if numeric_level > 0 or inv:
        return 'low'
    return 'high' if numeric_level <= -7 else 'middle'


def extract_name_from_email(email):
    """
    Return the part of an email before the first ``@``, cut to 20 characters.

    :param email: email address
    :return: at most the first 20 characters of the part before ``@``, or
        ``None`` when ``email`` is falsy or contains no ``@``
    """
    if not email or '@' not in email:
        return None
    local_part, _, _ = email.partition('@')
    return local_part[:20]


def generate_md5_hash(input_str: str):
    """
    Return the MD5 digest of a string as lower-case hexadecimal.

    :param input_str: text to hash; it is encoded as UTF-8 first
    :return: 32-character hexadecimal digest
    """
    encoded = input_str.encode('utf-8')
    return hashlib.md5(encoded).hexdigest()


def generate_ctc_id(tid, name, position):
    """
    Build a contact id as the MD5 of ``tid``, the cleaned name and position.

    The hashed text is ``"{tid}-{name}-{position}"``, or ``"{tid}-{name}"``
    when ``position`` is falsy, where ``name`` is the result of
    ``clean_contact_name``.

    :param tid: first component of the id
    :param name: raw contact name; cleaned before use
    :param position: optional position, left out of the hash when falsy
    :return: hexadecimal MD5 digest, or ``None`` when ``tid`` or the cleaned
        name is falsy
    """
    # The name is cleaned before any check, so a name consisting only of
    # special characters counts as missing.
    cleaned_name = clean_contact_name(name)
    if not (tid and cleaned_name):
        return None
    if position:
        hash_input = f"{tid}-{cleaned_name}-{position}"
    else:
        hash_input = f"{tid}-{cleaned_name}"
    return generate_md5_hash(hash_input)


def generate_ctc_id_fake_name(tid, name, position):
    """
    Build a contact id as the MD5 of ``tid``, the cleaned name and position.

    The name is passed through ``clean_contact_name`` and the hashed text is
    ``"{tid}-{name}"`` with ``"-{position}"`` appended when ``position`` is
    truthy.

    :param tid: first component of the id
    :param name: raw contact name; cleaned before use
    :param position: optional position, left out of the hash when falsy
    :return: hexadecimal MD5 digest, or ``None`` when ``tid`` or the cleaned
        name is falsy
    """
    name = clean_contact_name(name)
    if not tid or not name:
        return None
    position_suffix = f"-{position}" if position else ""
    return generate_md5_hash(f"{tid}-{name}{position_suffix}")


def clean_website(website):
    """
    Normalise a company website taken from the crawler API response.

    Surrounding whitespace is stripped, a leading ``http://`` / ``https://``
    and a leading ``www.`` are removed (matched case-insensitively), and one
    trailing ``/`` is dropped.

    :param website: website string from the crawler API response
    :return: the normalised website; ``None``, empty and whitespace-only
        inputs are returned unchanged
    """
    if not website:
        return website
    stripped = website.strip()
    if not stripped:
        # Nothing to normalise; hand back the input untouched
        return website
    # Work on the stripped text: with surrounding whitespace the ``^`` anchor
    # and the trailing-slash check would not match.
    # Remove http://, https:// and www. in any letter case.
    stripped = re.sub(r'^(https?://)?(www\.)?', '', stripped, flags=re.IGNORECASE)
    if stripped.endswith('/'):
        stripped = stripped[:-1]
    return stripped


if __name__ == '__main__':
    # Manual check: print each sample URL next to its cleaned form.
    sample_urls = (
        'http://aaa.com',
        'https://aaa.com',
        'https://www.aaa.com',
        'http://www.aaa.com',
        'www.aaa.com',
        'www.aaa.com/asda/asda',
        'www.aaa.com/asda/asda/',
        'www.aaa.com/',
        'https://locations.jackinthebox.com/us/wa/blaine/8140-birch-bay-square-st?utm_source=bing\u0026utm_medium=local\u0026utm_campaign=bing-local',
    )
    for url in sample_urls:
        print(url, '->', clean_website(url))

if __name__ == '__main__1':
    # NOTE: __name__ never equals '__main__1', so this manual check block
    # does not run; change the guard to '__main__' to execute it.

    # Contact-name cleaning samples
    case_list = ['andy zhu',
                 'henry    liu',
                 'JENS HESSELBERG LUND',
                 '  TONY   li',
                 ' Boy.  YU  .',
                 'MARK KLINDERA @chief executive officer!'
                 ]
    for case in case_list:
        res = clean_contact_name(case)
        print("{:<30} ->  |{}|".format(case, res))

    # Email-status samples per source.
    # A comma was missing after 'abcsd', which concatenated it with the
    # following '' and silently dropped the empty-string case.
    snovio_case_list = ['unknown',
                        'valid',
                        'not_valid',
                        'greylisted',
                        'abcsd',
                        '']
    shh_case_list = ['',
                     'abc',
                     '.81',
                     '1',
                     '.89',
                     '.92',
                     '.97',
                     '2',
                     '.85',
                     '.93',
                     '.95',
                     '.8',
                     '.9',
                     '.98',
                     '.84',
                     '-1'
                     ]
    for case in snovio_case_list:
        res = clean_email_status('snovio', case)
        print("{:<30} ->  |{}|".format(case, res))
    for case in shh_case_list:
        res = clean_email_status('shh', case)
        print("{:<30} ->  |{}|".format(case, res))

    # Samples carrying the '^ESD' / '^EMX' trailing markers
    ep_case_list = ['daze@exemail.com.au^ESD',
                    'ub3erl33trisser@hotmail.com^ESD',
                    'noel.thompson@orange.net^EMX',
                    'Potso.Makgatho@eskom.co.za^ESD',
                    'dcsupplychain@yahoo.co.uk^ESD',
                    'sunny.patel@i2ieventsgroup.com^EMX',
                    '_zig_@bellsouth.net^ESD',
                    'manish.pandey@ge.com^ESD',
                    'amy_salzman@comcast.com^ESD',
                    'kpretzer@thestrategicsolution.com^ESD']
    for case in ep_case_list:
        res = clean_shh_ep(case)
        print("{:<30} ->  |{}|".format(case, res))

    # One-off checks of the remaining helpers
    print(extract_name_from_email('12345678901234567890abcdef@q.com'))
    print(extract_name_from_email('12345678901234567890abcdefq.com'))
    print(get_shh_email_status('eae', 0))