common_clean.py 7.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257
  1. # 通用企业名称去噪
  2. special_chars = ['.',
  3. ',',
  4. '-',
  5. '(',
  6. ')',
  7. '@',
  8. '?',
  9. '‘',
  10. '’',
  11. '“',
  12. '”',
  13. '`',
  14. '#',
  15. '+',
  16. '!',
  17. '$',
  18. '|',
  19. ':',
  20. '/',
  21. ';',
  22. '*',
  23. '《',
  24. '》',
  25. '<',
  26. '>',
  27. '%',
  28. '^',
  29. '_',
  30. '[',
  31. ']',
  32. '{',
  33. '}',
  34. '\\',
  35. '~',
  36. '=',
  37. '\'',
  38. '±',
  39. '°',
  40. '«',
  41. '»',
  42. 'µ',
  43. '¶',
  44. '·',
  45. '€',
  46. '£',
  47. '¥',
  48. '¢',
  49. '×',
  50. '÷',
  51. '¬',
  52. '…',
  53. '→',
  54. '←',
  55. '↑',
  56. '↓',
  57. '↔',
  58. '⇒',
  59. '⇐',
  60. '≈',
  61. '≠',
  62. '≤',
  63. '≥',
  64. '.',
  65. ',',
  66. '-',
  67. '(',
  68. ')',
  69. '@',
  70. '?',
  71. '"',
  72. '\'',
  73. '#',
  74. '+',
  75. '!',
  76. '$',
  77. '|',
  78. ':',
  79. '/',
  80. ';',
  81. '*',
  82. '<',
  83. '>',
  84. '%',
  85. '^',
  86. '_',
  87. '[',
  88. ']',
  89. '{',
  90. '}',
  91. '\',
  92. '~',
  93. '¨',
  94. '´',
  95. '',
  96. '¿',
  97. '‰',
  98. '¯',
  99. '\x1A',
  100. '£',
  101. '>',
  102. '¿',
  103. '«',
  104. '´',
  105. '»',
  106. '°',
  107. '®',
  108. '·',
  109. '¼',
  110. '©',
  111. '¶',
  112. "'",
  113. '"'
  114. ]
  115. special_char_dict = {c: ' ' for c in set(special_chars)}
  116. special_char_dict['&'] = ' and '
  117. special_char_dict['&'] = ' and '
  118. special_chars_trans = str.maketrans(special_char_dict)
  119. sub_after_list = ['O/B OF', 'B/O OF', 'O/B', 'B/O', 'BY ORDER OF', 'BY ORDER', 'ON BEHALF OF', 'ON BEHALF', 'П/П']
  120. sub_before_str = 'C/O'
  121. # 括入符列表
  122. same_enclosers = ['"', ''', '"', "'", ]
  123. diff_enclosers = ['«»', '《》']
  124. head_list = ['КОМПАНІЯ ', 'ООО ', 'СП ООО ', 'ТОО ', 'ТОВ ', 'ФИРМА ', 'КОМПАНИЯ ', 'ФІРМА ', 'КОМПАНИЯ ',
  125. 'CÔNG TY TNHH ', 'CONG TY CO PHAN ', 'ИП ООО ', 'АО ', 'M S ', 'СП ', 'JV ', 'MS ']
  126. def sub_head(text: str):
  127. if text:
  128. for head in head_list:
  129. if text.startswith(head):
  130. return text.replace(head, '')
  131. return text.strip()
  132. else:
  133. return None
  134. def extract_text_from_enclosers(text):
  135. num = 0
  136. result = text
  137. for encloser in same_enclosers:
  138. cnt = text.count(encloser)
  139. open_inx = text.find(encloser)
  140. close_inx = text.rfind(encloser)
  141. if cnt > 2:
  142. return text.strip()
  143. elif cnt == 2 and close_inx - open_inx > 1:
  144. num += 1
  145. if num > 1:
  146. return text.strip()
  147. result = text[open_inx + 1:close_inx]
  148. for encloser in diff_enclosers:
  149. open_str, close_str = encloser[0], encloser[1]
  150. open_cnt = text.count(open_str)
  151. close_cnt = text.count(close_str)
  152. open_inx = text.find(open_str)
  153. close_inx = text.rfind(close_str)
  154. if (open_cnt == 1 and close_cnt > 1) or (open_cnt > 1 and close_cnt == 1) or (open_cnt > 1 and close_cnt > 1):
  155. return text.strip()
  156. elif open_cnt == 1 and close_cnt == 1 and close_inx - open_inx > 1:
  157. num += 1
  158. if num > 1:
  159. return text.strip()
  160. result = text[open_inx + 1:close_inx]
  161. return result.strip()
  162. def clean_company_name(name):
  163. if name:
  164. # 特殊字符替换为空格
  165. name = name.translate(special_chars_trans)
  166. # 转大写,去除连续空格,去除首尾空格
  167. name = ' '.join(name.upper().split())
  168. return name
  169. else:
  170. return None
  171. def sub_start_end(main_str, sub_str):
  172. if main_str.startswith(sub_str):
  173. main_str = main_str[len(sub_str):]
  174. if main_str.endswith(sub_str):
  175. main_str = main_str[:-len(sub_str)]
  176. return main_str.strip()
  177. def get_sub_after(main_str, sub_str):
  178. index = main_str.find(sub_str)
  179. if index == -1:
  180. return main_str
  181. return main_str[index + len(sub_str):].strip()
  182. def get_sub_before(main_str, sub_str):
  183. index = main_str.find(sub_str)
  184. if index == -1:
  185. return main_str
  186. return main_str[:index].strip()
  187. def clean_pre_join(name):
  188. if name:
  189. name = name.upper().strip()
  190. for sub_str in sub_after_list:
  191. name = sub_start_end(name, sub_str)
  192. name = get_sub_after(name, sub_str)
  193. name = sub_start_end(name, sub_before_str)
  194. name = get_sub_before(name, sub_before_str)
  195. name = extract_text_from_enclosers(name)
  196. name = clean_company_name(name)
  197. name = sub_head(name)
  198. return name
  199. return None
  200. if __name__ == '__main__':
  201. print(clean_pre_join('ASF INC ON BEH¿ BY ORDER OF'))
  202. if __name__ == '__main__2':
  203. input_str1 = 'a<b>c'
  204. input_str2 = 'a<b>c<d>e<f>gh'
  205. input_str3 = 'a<"x>"b'
  206. input_str4 = 'This <is a test <example> string.'
  207. input_str5 = 'This is a test «aaa» string.'
  208. case_list = [input_str1, input_str2, input_str3, input_str4, input_str5]
  209. case_list.append('sss"adsd"ddd')
  210. case_list.append('This is a test ""aaa» string.')
  211. case_list.append('a<"x">b ')
  212. case_list.append('""abcd')
  213. case_list.append('a>bc<d')
  214. case_list.append('abcd<>')
  215. case_list.append('abcd<bbbb》b>')
  216. case_list.append('abcd<b'b“bb》b>')
  217. for case in case_list:
  218. extract_text = extract_text_from_enclosers(case)
  219. print("{:<50} -> {}".format(case, extract_text))
  220. if __name__ == '__main__1':
  221. case1 = ' AB cde .((!) '
  222. assert clean_company_name(case1) == 'AB CDE'
  223. case2 = None
  224. assert clean_company_name(case2) is None
  225. case3 = ' '
  226. assert clean_company_name(case3) == ''
  227. case4 = '~ab#c≥'
  228. assert clean_company_name(case4) == 'AB C'
  229. case5 = '÷ & ! '
  230. assert clean_company_name(case5) == 'AND'
  231. case6 = 'abc&def'
  232. assert clean_company_name(case6) == 'ABC AND DEF'
  233. case = 'abc&def'
  234. assert clean_company_name(case6) == 'ABC AND DEF'
  235. print('all test cases passed')