| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188 |
# Generic company-name denoising.

# Characters that carry no meaning in a company name. Each one is replaced by
# a single space through `special_chars_trans`.
#
# Every entry must be exactly one character long: str.maketrans rejects
# string keys of any other length with ValueError.
#
# NOTE(review): an earlier revision listed most of the ASCII punctuation a
# second time, together with an unterminated '\' literal (a syntax error) and
# an empty string ''. Those entries look like full-width or invisible
# characters that were normalised away somewhere upstream -- TODO confirm
# against the original file and re-add the real characters if that is the
# case. Until then only the characters that are actually recoverable are kept.
special_chars = [
    # ASCII punctuation and symbols
    '.', ',', '-', '(', ')', '@', '?', '`', '#', '+', '!', '$', '|', ':',
    '/', ';', '*', '<', '>', '%', '^', '_', '[', ']', '{', '}', '\\', '~',
    '=', '\'', '"',
    # Typographic quotes, guillemets and CJK title marks
    '‘', '’', '“', '”', '《', '》', '«', '»',
    # Latin-1 signs, currency and arithmetic symbols
    '±', '°', 'µ', '¶', '·', '€', '£', '¥', '¢', '×', '÷', '¬', '¨', '´',
    '¿', '¯', '‰', '…',
    # Arrows and comparison operators
    '→', '←', '↑', '↓', '↔', '⇒', '⇐', '≈', '≠', '≤', '≥',
]

# Replacement table: every special character becomes a space ...
special_char_dict = {char: ' ' for char in special_chars}
# ... except the ampersand, which is spelled out so that "A&B" and "A and B"
# normalise to the same text.
# NOTE(review): this assignment used to appear twice with an identical key;
# the second one was possibly a full-width ampersand -- TODO confirm.
special_char_dict['&'] = ' and '

# Table in the form expected by str.translate.
special_chars_trans = str.maketrans(special_char_dict)
# Prefixes removed from the front of a cleaned name by sub_head. Names are
# compared after punctuation has been replaced by spaces, so e.g. a leading
# "M/S " reaches sub_head as "M S ".
head_list = [
    'MS ',
    'M S ',
]
# Legal-form suffixes removed from the end of a cleaned name by sub_tail.
#
# ORDER MATTERS: sub_tail strips the FIRST entry the name ends with, so an
# entry must never be preceded by a shorter entry that is a suffix of it,
# otherwise the longer one can never match. ' PVT LTD' therefore sits before
# ' LTD' and ' I PVT L' before ' PVT L'; each entry appears only once.
tail_list = [
    ' I PRIVATE LIMITED',
    ' I PRIVATELIMITED',
    # NOTE(review): mixed case, so it cannot match the upper-cased names
    # produced by clean_company_name -- probably a typo, TODO confirm the
    # intended spelling.
    ' PrivateATE LIMITED',
    ' COMPANY LIMITED',
    ' PRIVATE LIMITED',
    ' PRIVATELIMITED',
    ' COMPANY PRIVATE L',
    ' COMPANY I PRIVATE L',
    ' CO I PRIVATE L',
    ' CO PRIVATE L',
    ' I PRIVATE L',
    ' I PRIVATE',
    ' PRIVATE L',
    ' COMPANY PVT L',
    ' I LIMITED',
    ' LIMITED',
    ' P LTD',
    ' CO I LTD',
    ' I LTD',
    ' PVT LTD',
    ' CO I PVT L',
    ' CO PVT L',
    ' I PVT L',
    ' PVT L',
    ' LTD',
    ' CO I',
    ' I PVT',
    ' PVT',
    ' PRIVATE',
    ' CO',
    ' INC',
    ' I',
]
# Suffixes that sub_tail checks before tail_list. They are stripped like any
# other suffix, except that a name ending in 'AND' + suffix is returned
# unchanged.
special_tail_list = [' CO LIMITED', ' CO LTD', ' COLTD']
def sub_head(name):
    """Remove one leading `head_list` prefix from *name*.

    The prefixes are tried in list order and only the first one that matches
    is removed. A name that starts with none of them is returned unchanged.
    """
    matched = next(
        (prefix for prefix in head_list if name.startswith(prefix)),
        None,
    )
    if matched is None:
        return name
    return name[len(matched):]
def sub_tail(name):
    """Remove one trailing legal-form suffix from *name*.

    `special_tail_list` is consulted first: if the name ends with one of its
    entries, that entry is cut off -- unless the name ends with 'AND' followed
    by the entry, in which case the name is returned untouched. Otherwise the
    first entry of `tail_list` that the name ends with is cut off. A name
    matching nothing is returned unchanged.
    """
    for suffix in special_tail_list:
        if not name.endswith(suffix):
            continue
        # NOTE(review): plain endswith with no word boundary, so a name whose
        # last word merely ends in the letters AND (e.g. 'BRAND CO LTD') is
        # also left untouched -- confirm this is intended.
        if name.endswith(f'AND{suffix}'):
            return name
        return name[:-len(suffix)]

    hit = next((suffix for suffix in tail_list if name.endswith(suffix)), None)
    return name if hit is None else name[:-len(hit)]
def clean_company_name(name):
    """Normalise a raw company name.

    Returns None when *name* is falsy (e.g. None or ''). Otherwise returns
    the name with special characters substituted via `special_chars_trans`,
    converted to upper case, runs of whitespace collapsed to a single space
    and leading/trailing whitespace removed. The result can be an empty
    string when nothing but special characters and whitespace was passed in.
    """
    if not name:
        return None
    # Replace special characters according to the translation table.
    replaced = name.translate(special_chars_trans)
    # Upper-case, then split/join to squeeze and trim whitespace in one go.
    tokens = replaced.upper().split()
    return ' '.join(tokens)
def clean_pre_join(name):
    """Build the name form used for joining: cleaned, prefix and suffix removed.

    The name is first normalised with clean_company_name; None is returned if
    that yields nothing. One head prefix and one tail suffix are then
    stripped. If fewer than 8 characters remain after stripping, the
    normalised but unstripped name is returned instead.
    """
    cleaned = clean_company_name(name)
    if not cleaned:
        return None
    stripped = sub_tail(sub_head(cleaned))
    # Fall back to the unstripped form when too little of the name is left.
    return stripped if len(stripped) >= 8 else cleaned
|