# Generic company-name denoising.
#
# Helpers that turn a raw company name into a normalised key: punctuation is
# replaced by spaces, '&' is spelled out as ' and ', the text is upper-cased
# and whitespace-collapsed, and boilerplate prefixes / legal-form suffixes
# are stripped.
from typing import Optional

# Characters that are replaced by a single space during cleaning.
# NOTE(review): the original list repeated the ASCII punctuation a second
# time (and assigned the '&' mapping twice). Those repeats may originally
# have been full-width forms lost in a character conversion -- confirm
# whether full-width punctuation should be mapped as well.  The original also
# contained an empty-string entry and a malformed backslash literal; an empty
# key makes str.maketrans raise ValueError, so neither is reproduced here.
special_chars = [
    # ASCII punctuation
    '.', ',', '-', '(', ')', '@', '?', '"', "'", '`', '#', '+', '!', '$',
    '|', ':', '/', ';', '*', '<', '>', '%', '^', '_', '[', ']', '{', '}',
    '\\', '~', '=',
    # Typographic quotes and brackets
    '‘', '’', '“', '”', '《', '》', '«', '»',
    # Currency, typographic and miscellaneous symbols
    '±', '°', 'µ', '¶', '·', '€', '£', '¥', '¢', '×', '÷', '¬', '…',
    '¨', '´', '¿', '‰', '¯',
    # Arrows and comparison signs
    '→', '←', '↑', '↓', '↔', '⇒', '⇐', '≈', '≠', '≤', '≥',
]

# Translation table: every special character becomes a space, except '&',
# which is spelled out so that "A & B" and "A and B" clean to the same text.
special_char_dict = {c: ' ' for c in special_chars}
special_char_dict['&'] = ' and '
special_chars_trans = str.maketrans(special_char_dict)

# Prefixes removed from the start of a cleaned name (at most one is removed).
head_list = ['MS ', 'M S ']

# Legal-form suffixes removed from the end of a cleaned name.
# NOTE(review): ' PrivateATE LIMITED' is mixed-case and therefore cannot match
# the upper-cased output of clean_company_name(); kept unchanged pending
# confirmation of the intended spelling.
tail_list = [
    ' I PRIVATE LIMITED', ' I PRIVATELIMITED', ' PrivateATE LIMITED',
    ' COMPANY LIMITED', ' PRIVATE LIMITED', ' PRIVATELIMITED',
    ' COMPANY PRIVATE L', ' COMPANY I PRIVATE L', ' CO I PRIVATE L',
    ' CO PRIVATE L', ' I PRIVATE L', ' I PRIVATE', ' PRIVATE L',
    ' COMPANY PVT L', ' I LIMITED', ' LIMITED', ' P LTD', ' CO I LTD',
    ' I LTD', ' CO I PVT L', ' CO PVT L', ' PVT L', ' LTD', ' CO I',
    ' I PVT L', ' I PVT', ' PVT LTD', ' PVT', ' PRIVATE', ' CO', ' INC',
    ' I',
]

# Suffixes that are kept when they directly follow "AND" (see sub_tail).
special_tail_list = [' CO LIMITED', ' CO LTD', ' COLTD']

# Suffixes ordered longest-first so that a short suffix (' LTD') can never
# shadow a longer one that contains it (' PVT LTD'), whatever the order in
# which tail_list happens to be written.
_TAILS_LONGEST_FIRST = tuple(
    sorted(dict.fromkeys(tail_list), key=len, reverse=True)
)


def sub_head(name: str) -> str:
    """Remove the first matching prefix from ``head_list``, if any.

    Args:
        name: A cleaned (upper-cased, single-spaced) company name.

    Returns:
        ``name`` without the matched prefix, or ``name`` unchanged when no
        prefix matches.
    """
    for head in head_list:
        if name.startswith(head):
            return name[len(head):]
    return name


def sub_tail(name: str) -> str:
    """Remove one trailing legal-form suffix from ``name``.

    The suffixes in ``special_tail_list`` are checked first: they are
    stripped unless the name ends with ``'AND' + suffix``, in which case the
    name is returned untouched.  Otherwise the longest matching entry of
    ``tail_list`` is stripped.  At most one suffix is removed per call.

    Args:
        name: A cleaned (upper-cased, single-spaced) company name.

    Returns:
        ``name`` with one suffix removed, or ``name`` unchanged.
    """
    for tail in special_tail_list:
        if name.endswith(tail):
            # NOTE(review): this check has no word boundary, so a name such
            # as "THAILAND CO LTD" is also kept whole -- confirm intent.
            if name.endswith(f'AND{tail}'):
                return name
            return name[:-len(tail)]

    for tail in _TAILS_LONGEST_FIRST:
        if name.endswith(tail):
            return name[:-len(tail)]
    return name


def clean_company_name(name: Optional[str]) -> Optional[str]:
    """Normalise punctuation, case and whitespace of a company name.

    Args:
        name: Raw company name; may be ``None`` or empty.

    Returns:
        The upper-cased name with special characters replaced by spaces and
        runs of whitespace collapsed, or ``None`` when ``name`` is falsy.
        The result may be an empty string when ``name`` holds only special
        characters or whitespace.
    """
    if not name:
        return None
    # Replace special characters with spaces ('&' becomes ' and ').
    replaced = name.translate(special_chars_trans)
    # Upper-case, collapse repeated whitespace, trim both ends.
    return ' '.join(replaced.upper().split())


def clean_pre_join(name: Optional[str]) -> Optional[str]:
    """Build a normalised key from a raw company name.

    The name is cleaned with ``clean_company_name`` and then has one prefix
    and one suffix stripped.  If stripping leaves fewer than 8 characters,
    the cleaned but unstripped name is returned instead.

    Args:
        name: Raw company name; may be ``None`` or empty.

    Returns:
        The normalised key, or ``None`` when nothing remains after cleaning.
    """
    o_name = clean_company_name(name)
    if not o_name:
        return None
    stripped = sub_tail(sub_head(o_name))
    if len(stripped) < 8:
        return o_name
    return stripped