# Generic company-name denoising.

# ASCII punctuation that also has a fullwidth counterpart in U+FF01..U+FF5E.
# NOTE(review): the listing this module was recovered from repeated exactly
# these characters a second time (one of them as an unterminated '\' literal),
# which suggests the second run originally held the fullwidth forms and was
# folded to ASCII on the way -- confirm against the original file.
_WIDTH_SENSITIVE_PUNCT = '.,-()@?"\'#+!$|:/;*<>%^_[]{}\\~'

# Distance between an ASCII code point (0x21..0x7E) and its fullwidth form.
_FULLWIDTH_OFFSET = 0xFEE0

# Every character listed here is replaced by a single space when a name is
# cleaned.  Each entry must be exactly one character long: str.maketrans()
# raises ValueError for a zero-length or multi-character key, which is why the
# empty '' entry found in the recovered listing is not reproduced.
special_chars = [
    # ASCII punctuation
    '.', ',', '-', '(', ')', '@', '?', '`', '#', '+', '!', '$', '|', ':', '/',
    ';', '*', '<', '>', '%', '^', '_', '[', ']', '{', '}', '\\', '~', '=',
    "'", '"',
    # typographic quotes, CJK title marks and guillemets
    '‘', '’', '“', '”', '《', '》', '«', '»',
    # Latin-1 symbols
    '±', '°', 'µ', '¶', '·', '£', '¥', '¢', '×', '÷', '¬', '¨', '´', '¿',
    '¯', '®', '¼', '©',
    # currency, arrows and mathematical operators
    '€', '…', '‰', '→', '←', '↑', '↓', '↔', '⇒', '⇐', '≈', '≠', '≤', '≥',
    # control character (SUB)
    '\x1a',
]
special_chars.extend(
    chr(ord(ch) + _FULLWIDTH_OFFSET) for ch in _WIDTH_SENSITIVE_PUNCT
)

# Translation table: noise characters become a space, ampersands become the
# word "and" (padded so that it never fuses with neighbouring words).
special_char_dict = {c: ' ' for c in set(special_chars)}
special_char_dict['&'] = ' and '
# NOTE(review): the recovered listing assigned '&' twice; the second key is
# presumed to have been the fullwidth ampersand (U+FF06) -- confirm.
special_char_dict['\uff06'] = ' and '
special_chars_trans = str.maketrans(special_char_dict)

# Markers after which the interesting part of the name follows.  Longer
# variants come before their shorter prefixes ('O/B OF' before 'O/B').
sub_after_list = ['O/B OF', 'B/O OF', 'O/B', 'B/O', 'BY ORDER OF', 'BY ORDER',
                  'ON BEHALF OF', 'ON BEHALF', 'П/П']
# Marker before which the interesting part of the name stands.
sub_before_str = 'C/O'

# Enclosing marks.
# Marks whose opening and closing character are the same.  Entries must be
# unique: extract_text_from_enclosers() counts how many kinds of mark match,
# so a repeated entry would be counted twice.
# NOTE(review): the first two entries are presumed to have been the fullwidth
# quotation mark / apostrophe in the original file -- confirm.
same_enclosers = ['\uff02', '\uff07', '"', "'"]
# Marks with distinct opening and closing characters, stored as two-character
# strings (opening character first).
diff_enclosers = ['«»', '《》']

# Leading words removed from an already cleaned (upper-cased) name.  Every
# entry ends with a space so that only whole leading words match, and
# 'СП ООО ' stays ahead of 'СП ' so that the longer prefix wins.
head_list = ['КОМПАНІЯ ', 'ООО ', 'СП ООО ', 'ТОО ', 'ТОВ ', 'ФИРМА ',
             'КОМПАНИЯ ', 'ФІРМА ', 'CÔNG TY TNHH ', 'CONG TY CO PHAN ',
             'ИП ООО ', 'АО ', 'M S ', 'СП ', 'JV ', 'MS ']
def sub_head(text: str):
    """Remove one leading prefix listed in ``head_list`` from ``text``.

    The first entry of ``head_list`` that ``text`` starts with is cut off;
    at most one prefix is removed.

    :param text: name to process; may be empty or ``None``.
    :return: ``text`` without the matched prefix and without surrounding
        whitespace, or ``None`` when ``text`` is empty or ``None``.
    """
    if not text:
        return None
    for head in head_list:
        if text.startswith(head):
            # Slice the prefix off rather than calling str.replace(): replace
            # would also delete every later occurrence of the same characters
            # inside the name (e.g. 'СП ' at the end of a following word).
            return text[len(head):].strip()
    return text.strip()
def extract_text_from_enclosers(text):
    """Return the part of ``text`` that sits inside a single pair of marks.

    The marks come from ``same_enclosers`` (identical opening and closing
    character) and ``diff_enclosers`` (two-character strings, opening
    character first).  The enclosed part is returned only when the situation
    is unambiguous; otherwise the whole text is returned:

    * a same-character mark occurring more than twice -> whole text;
    * an opening and a closing mark both present, with either of them
      occurring more than once -> whole text;
    * more than one kind of mark enclosing something -> whole text;
    * a pair with nothing between its two marks is ignored.

    :param text: string to inspect.
    :return: the enclosed part, or the whole text, stripped of surrounding
        whitespace in both cases.
    """
    matched_kinds = 0
    inner = text

    for mark in same_enclosers:
        occurrences = text.count(mark)
        if occurrences > 2:
            return text.strip()
        first = text.find(mark)
        last = text.rfind(mark)
        # "last - first > 1" requires at least one character between the marks.
        if occurrences == 2 and last - first > 1:
            matched_kinds += 1
            if matched_kinds > 1:
                return text.strip()
            inner = text[first + 1:last]

    for pair in diff_enclosers:
        opener, closer = pair[0], pair[1]
        open_count = text.count(opener)
        close_count = text.count(closer)
        # Both marks are present and at least one of them is repeated.
        if open_count >= 1 and close_count >= 1 and open_count + close_count > 2:
            return text.strip()
        first = text.find(opener)
        last = text.rfind(closer)
        # A closing mark that precedes the opening one gives a negative
        # distance and is therefore skipped as well.
        if open_count == 1 and close_count == 1 and last - first > 1:
            matched_kinds += 1
            if matched_kinds > 1:
                return text.strip()
            inner = text[first + 1:last]

    return inner.strip()
def clean_company_name(name):
    """Normalise a company name for comparison.

    :param name: raw name; may be empty or ``None``.
    :return: the upper-cased name with the characters covered by
        ``special_chars_trans`` replaced and all whitespace runs collapsed to
        single spaces, or ``None`` when ``name`` is empty or ``None``.  A name
        consisting only of whitespace or noise characters yields ``''``.
    """
    if not name:
        return None
    # Replace special characters with spaces ('&' becomes ' and ').
    spaced = name.translate(special_chars_trans)
    # Upper-case, collapse consecutive whitespace, drop leading/trailing space.
    return ' '.join(spaced.upper().split())
def sub_start_end(main_str, sub_str):
    """Strip one leading and one trailing occurrence of ``sub_str``.

    :param main_str: string to trim.
    :param sub_str: marker to remove from the start and from the end.
    :return: ``main_str`` without the marker at either end and without
        surrounding whitespace.
    """
    if not sub_str:
        # An empty marker "matches" at both ends, and ``main_str[:-0]`` is the
        # empty string, so the whole value would be wiped out.
        return main_str.strip()
    if main_str.startswith(sub_str):
        main_str = main_str[len(sub_str):]
    if main_str.endswith(sub_str):
        main_str = main_str[:-len(sub_str)]
    return main_str.strip()
def get_sub_after(main_str, sub_str):
    """Return what follows the first occurrence of ``sub_str``.

    :param main_str: string to search.
    :param sub_str: marker to look for.
    :return: the text after the first occurrence of the marker, stripped of
        surrounding whitespace; ``main_str`` unchanged (not stripped) when the
        marker does not occur.
    """
    position = main_str.find(sub_str)
    if position < 0:
        return main_str
    tail_start = position + len(sub_str)
    return main_str[tail_start:].strip()
def get_sub_before(main_str, sub_str):
    """Return what precedes the first occurrence of ``sub_str``.

    :param main_str: string to search.
    :param sub_str: marker to look for.
    :return: the text before the first occurrence of the marker, stripped of
        surrounding whitespace; ``main_str`` unchanged (not stripped) when the
        marker does not occur.
    """
    position = main_str.find(sub_str)
    if position < 0:
        return main_str
    return main_str[:position].strip()
def clean_pre_join(name):
    """Run the full cleaning pipeline on a raw company name.

    Steps, in order:

    1. upper-case and strip the name;
    2. for every marker in ``sub_after_list``: remove it from both ends, then
       keep only what follows its first remaining occurrence;
    3. remove ``sub_before_str`` from both ends, then keep only what precedes
       its first remaining occurrence;
    4. keep the text inside enclosing marks when that is unambiguous;
    5. replace special characters and collapse whitespace;
    6. drop a leading prefix listed in ``head_list``.

    :param name: raw name; may be empty or ``None``.
    :return: the cleaned name, or ``None`` when ``name`` is empty or ``None``
        or when nothing is left before step 5.
    """
    if not name:
        return None

    cleaned = name.upper().strip()
    for marker in sub_after_list:
        cleaned = sub_start_end(cleaned, marker)
        cleaned = get_sub_after(cleaned, marker)

    cleaned = sub_start_end(cleaned, sub_before_str)
    cleaned = get_sub_before(cleaned, sub_before_str)

    cleaned = extract_text_from_enclosers(cleaned)
    cleaned = clean_company_name(cleaned)
    return sub_head(cleaned)
def _demo_extract_text_from_enclosers():
    """Print the result of extract_text_from_enclosers for sample strings.

    Manual aid only; it is not called when the module runs as a script.
    """
    case_list = [
        'a<b>c',
        'a<b>c<d>e<f>gh',
        'a<"x>"b',
        'This <is a test <example> string.',
        'This is a test «aaa» string.',
        'sss"adsd"ddd',
        'This is a test ""aaa» string.',
        'a<"x">b ',
        '""abcd',
        'a>bc<d',
        'abcd<>',
        'abcd<bbbb》b>',
        # NOTE(review): this literal was not valid Python in the recovered
        # listing (unescaped quote); the quote is presumed to have been a
        # fullwidth apostrophe originally -- confirm.
        "abcd<b'b“bb》b>",
    ]
    for case in case_list:
        extract_text = extract_text_from_enclosers(case)
        print("{:<50} -> {}".format(case, extract_text))


def _self_test_clean_company_name():
    """Check clean_company_name against a handful of fixed examples.

    Manual aid only; it is not called when the module runs as a script.
    """
    case1 = ' AB cde .((!) '
    assert clean_company_name(case1) == 'AB CDE'
    case2 = None
    assert clean_company_name(case2) is None
    case3 = ' '
    assert clean_company_name(case3) == ''
    case4 = '~ab#c≥'
    assert clean_company_name(case4) == 'AB C'
    case5 = '÷ & ! '
    assert clean_company_name(case5) == 'AND'
    case6 = 'abc&def'
    assert clean_company_name(case6) == 'ABC AND DEF'
    # The recovered listing assigned this value to an unused name and then
    # re-checked case6; the check now uses its own input.
    # NOTE(review): presumably this was a fullwidth-ampersand variant -- confirm.
    case7 = 'abc&def'
    assert clean_company_name(case7) == 'ABC AND DEF'
    print('all test cases passed')


if __name__ == '__main__':
    print(clean_pre_join('ASF INC ON BEH¿ BY ORDER OF'))