common_clean.py 4.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188
  1. # 通用企业名称去噪
  2. special_chars = ['.',
  3. ',',
  4. '-',
  5. '(',
  6. ')',
  7. '@',
  8. '?',
  9. '‘',
  10. '’',
  11. '“',
  12. '”',
  13. '`',
  14. '#',
  15. '+',
  16. '!',
  17. '$',
  18. '|',
  19. ':',
  20. '/',
  21. ';',
  22. '*',
  23. '《',
  24. '》',
  25. '<',
  26. '>',
  27. '%',
  28. '^',
  29. '_',
  30. '[',
  31. ']',
  32. '{',
  33. '}',
  34. '\\',
  35. '~',
  36. '=',
  37. '\'',
  38. '±',
  39. '°',
  40. '«',
  41. '»',
  42. 'µ',
  43. '¶',
  44. '·',
  45. '€',
  46. '£',
  47. '¥',
  48. '¢',
  49. '×',
  50. '÷',
  51. '¬',
  52. '…',
  53. '→',
  54. '←',
  55. '↑',
  56. '↓',
  57. '↔',
  58. '⇒',
  59. '⇐',
  60. '≈',
  61. '≠',
  62. '≤',
  63. '≥',
  64. '.',
  65. ',',
  66. '-',
  67. '(',
  68. ')',
  69. '@',
  70. '?',
  71. '"',
  72. '\'',
  73. '#',
  74. '+',
  75. '!',
  76. '$',
  77. '|',
  78. ':',
  79. '/',
  80. ';',
  81. '*',
  82. '<',
  83. '>',
  84. '%',
  85. '^',
  86. '_',
  87. '[',
  88. ']',
  89. '{',
  90. '}',
  91. '\',
  92. '~',
  93. '¨',
  94. '´',
  95. '',
  96. '¿',
  97. '‰',
  98. '¯',
  99. ]
  100. special_char_dict = {c: ' ' for c in set(special_chars)}
  101. special_char_dict['&'] = ' and '
  102. special_char_dict['&'] = ' and '
  103. special_chars_trans = str.maketrans(special_char_dict)
  104. head_list = ['MS ', 'M S ']
  105. tail_list = [' I PRIVATE LIMITED',
  106. ' I PRIVATELIMITED',
  107. ' PrivateATE LIMITED',
  108. ' COMPANY LIMITED',
  109. ' PRIVATE LIMITED',
  110. ' PRIVATELIMITED',
  111. ' COMPANY PRIVATE L',
  112. ' COMPANY I PRIVATE L',
  113. ' CO I PRIVATE L',
  114. ' CO PRIVATE L',
  115. ' I PRIVATE L',
  116. ' I PRIVATE',
  117. ' PRIVATE L',
  118. ' COMPANY PVT L',
  119. ' I LIMITED',
  120. ' LIMITED',
  121. ' P LTD',
  122. ' CO I LTD',
  123. ' I LTD',
  124. ' CO I PVT L',
  125. ' CO PVT L',
  126. ' PVT L',
  127. ' LTD',
  128. ' CO I',
  129. ' I PVT L',
  130. ' I PVT',
  131. ' PVT LTD',
  132. ' PVT L',
  133. ' PVT',
  134. ' PRIVATE',
  135. ' CO',
  136. ' INC',
  137. ' I']
  138. special_tail_list = [' CO LIMITED',
  139. ' CO LTD',
  140. ' COLTD']
  141. def sub_head(name):
  142. for head in head_list:
  143. if name.startswith(head):
  144. name = name[len(head):]
  145. break
  146. return name
  147. def sub_tail(name):
  148. for tail in special_tail_list:
  149. no_tail = f'AND{tail}'
  150. if name.endswith(tail):
  151. if name.endswith(no_tail):
  152. return name
  153. else:
  154. return name[:-len(tail)]
  155. for tail in tail_list:
  156. if name.endswith(tail):
  157. return name[:-len(tail)]
  158. return name
  159. def clean_company_name(name):
  160. if name:
  161. # 特殊字符替换为空格
  162. name = name.translate(special_chars_trans)
  163. # 转大写,去除连续空格,去除首尾空格
  164. name = ' '.join(name.upper().split())
  165. return name
  166. else:
  167. return None
  168. def clean_pre_join(name):
  169. o_name = clean_company_name(name)
  170. if not o_name:
  171. return None
  172. name = sub_head(o_name)
  173. name = sub_tail(name)
  174. if len(name) < 8:
  175. return o_name
  176. return name