"""Generic company-name denoising helpers.

The module normalises free-text company names: it strips "on behalf of" /
"care of" style markers, pulls a name out of a single pair of quotes or
guillemets, replaces punctuation with spaces, upper-cases the result and
finally drops a leading legal-form prefix.
"""
from typing import Optional

# Characters that are replaced by a single space in ``clean_company_name``.
# NOTE(review): the original list repeated most ASCII punctuation a second
# time and contained an empty-string entry; those look like full-width or
# control characters lost to text normalisation - confirm against the
# upstream file. Every entry must be exactly one character, otherwise
# ``str.maketrans`` raises ValueError.
special_chars = [
    '.', ',', '-', '(', ')', '@', '?', '‘', '’', '“', '”', '`', '#', '+',
    '!', '$', '|', ':', '/', ';', '*', '《', '》', '<', '>', '%', '^', '_',
    '[', ']', '{', '}', '\\', '~', '=', '\'', '"',
    '±', '°', '«', '»', 'µ', '¶', '·', '€', '£', '¥', '¢', '×', '÷', '¬',
    '…', '→', '←', '↑', '↓', '↔', '⇒', '⇐', '≈', '≠', '≤', '≥',
    '¨', '´', '¿', '‰', '¯', '\x1A', '®', '¼', '©',
]

# Translation table: every special character becomes a space, '&' becomes
# the word "and" (upper-cased later by ``clean_company_name``).
special_char_dict = {c: ' ' for c in set(special_chars)}
special_char_dict['&'] = ' and '
special_chars_trans = str.maketrans(special_char_dict)

# Markers after which the interesting part of the name follows.
sub_after_list = ['O/B OF', 'B/O OF', 'O/B', 'B/O', 'BY ORDER OF', 'BY ORDER',
                  'ON BEHALF OF', 'ON BEHALF', 'П/П']
# Marker before which the interesting part of the name stands.
sub_before_str = 'C/O'

# Enclosing characters. Entries of ``same_enclosers`` open and close with the
# same character; entries of ``diff_enclosers`` are "<open><close>" pairs.
# Each encloser must be listed once: a duplicate would be counted as a second
# enclosed pair and suppress extraction.
same_enclosers = ['"', "'"]
diff_enclosers = ['«»', '《》']

# Leading legal-form / filler prefixes removed by ``sub_head`` (first match
# wins, so longer prefixes sharing a start are listed before shorter ones).
head_list = ['КОМПАНІЯ ', 'ООО ', 'СП ООО ', 'ТОО ', 'ТОВ ', 'ФИРМА ',
             'КОМПАНИЯ ', 'ФІРМА ', 'CÔNG TY TNHH ', 'CONG TY CO PHAN ',
             'ИП ООО ', 'АО ', 'M S ', 'СП ', 'JV ', 'MS ']


def sub_head(text: Optional[str]) -> Optional[str]:
    """Remove the first matching prefix from ``head_list`` and strip the rest.

    Only the leading occurrence is removed; the same token appearing later in
    the name is kept. Returns ``None`` for ``None`` or an empty string.
    """
    if not text:
        return None
    for head in head_list:
        if text.startswith(head):
            # Slice instead of str.replace so inner occurrences survive.
            return text[len(head):].strip()
    return text.strip()


def extract_text_from_enclosers(text: str) -> str:
    """Return the text inside a single, unambiguous pair of enclosers.

    A pair qualifies when its two characters have at least one character
    between them. The whole stripped ``text`` is returned unchanged when no
    pair qualifies, when an encloser occurs too many times to be paired
    unambiguously, or when more than one kind of encloser forms a pair.
    """
    pair_count = 0
    result = text

    for encloser in same_enclosers:
        cnt = text.count(encloser)
        if cnt > 2:
            # More than one possible pairing: give up.
            return text.strip()
        open_inx = text.find(encloser)
        close_inx = text.rfind(encloser)
        if cnt == 2 and close_inx - open_inx > 1:
            pair_count += 1
            if pair_count > 1:
                return text.strip()
            result = text[open_inx + 1:close_inx]

    for encloser in diff_enclosers:
        open_str, close_str = encloser[0], encloser[1]
        open_cnt = text.count(open_str)
        close_cnt = text.count(close_str)
        open_inx = text.find(open_str)
        close_inx = text.rfind(close_str)
        both_present = open_cnt >= 1 and close_cnt >= 1
        if both_present and (open_cnt > 1 or close_cnt > 1):
            # Both sides present but not exactly one of each: ambiguous.
            return text.strip()
        if open_cnt == 1 and close_cnt == 1 and close_inx - open_inx > 1:
            pair_count += 1
            if pair_count > 1:
                return text.strip()
            result = text[open_inx + 1:close_inx]

    return result.strip()


def clean_company_name(name: Optional[str]) -> Optional[str]:
    """Replace special characters with spaces and normalise case/whitespace.

    Returns ``None`` for ``None`` or an empty string; a whitespace-only
    string yields ``''``.
    """
    if not name:
        return None
    # Replace special characters with spaces ('&' becomes ' and ').
    name = name.translate(special_chars_trans)
    # Upper-case, collapse runs of whitespace, trim both ends.
    return ' '.join(name.upper().split())


def sub_start_end(main_str: str, sub_str: str) -> str:
    """Drop ``sub_str`` once from the start and once from the end, then strip."""
    if main_str.startswith(sub_str):
        main_str = main_str[len(sub_str):]
    if main_str.endswith(sub_str):
        main_str = main_str[:-len(sub_str)]
    return main_str.strip()


def get_sub_after(main_str: str, sub_str: str) -> str:
    """Return the stripped text after the first ``sub_str``.

    ``main_str`` is returned as is when ``sub_str`` does not occur.
    """
    index = main_str.find(sub_str)
    if index == -1:
        return main_str
    return main_str[index + len(sub_str):].strip()


def get_sub_before(main_str: str, sub_str: str) -> str:
    """Return the stripped text before the first ``sub_str``.

    ``main_str`` is returned as is when ``sub_str`` does not occur.
    """
    index = main_str.find(sub_str)
    if index == -1:
        return main_str
    return main_str[:index].strip()


def clean_pre_join(name: Optional[str]) -> Optional[str]:
    """Run the full cleaning pipeline on a raw company name.

    Steps, in order: upper-case and strip; for every marker in
    ``sub_after_list`` trim it from both ends and keep what follows it; trim
    ``sub_before_str`` from both ends and keep what precedes it; extract the
    text inside a single pair of enclosers; replace special characters; drop
    a leading prefix from ``head_list``.

    Returns ``None`` for ``None``/empty input or when nothing is left after
    the marker and encloser steps.
    """
    if not name:
        return None
    name = name.upper().strip()
    for sub_str in sub_after_list:
        name = sub_start_end(name, sub_str)
        name = get_sub_after(name, sub_str)
    name = sub_start_end(name, sub_before_str)
    name = get_sub_before(name, sub_before_str)
    name = extract_text_from_enclosers(name)
    name = clean_company_name(name)
    return sub_head(name)


def _demo_extract_text_from_enclosers() -> None:
    """Print extraction results for sample strings (manual check).

    Not run automatically; the original kept this behind a guard that never
    matched.
    """
    case_list = [
        'ac',
        'acegh',
        'a<"x>"b',
        'This string.',
        'This is a test «aaa» string.',
        'sss"adsd"ddd',
        'This is a test ""aaa» string.',
        'a<"x">b ',
        '""abcd',
        'a>bc',
        'abcd',
        'abcd',
    ]
    for case in case_list:
        extract_text = extract_text_from_enclosers(case)
        print("{:<50} -> {}".format(case, extract_text))


def _check_clean_company_name() -> None:
    """Assert a few expected ``clean_company_name`` results (manual check).

    Not run automatically; the original kept this behind a guard that never
    matched.
    """
    case1 = ' AB cde .((!) '
    assert clean_company_name(case1) == 'AB CDE'
    case2 = None
    assert clean_company_name(case2) is None
    case3 = ' '
    assert clean_company_name(case3) == ''
    case4 = '~ab#c≥'
    assert clean_company_name(case4) == 'AB C'
    case5 = '÷ & ! '
    assert clean_company_name(case5) == 'AND'
    case6 = 'abc&def'
    assert clean_company_name(case6) == 'ABC AND DEF'
    case7 = 'abc&def'
    assert clean_company_name(case7) == 'ABC AND DEF'
    print('all test cases passed')


if __name__ == '__main__':
    print(clean_pre_join('ASF INC ON BEH¿ BY ORDER OF'))