ent_company_abbr.py 7.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277
import re
from typing import Optional
  2. special_chars = ['.',
  3. ',',
  4. '-',
  5. '(',
  6. ')',
  7. '@',
  8. '?',
  9. '‘',
  10. '’',
  11. '“',
  12. '”',
  13. '`',
  14. '#',
  15. '+',
  16. '!',
  17. '$',
  18. '|',
  19. ':',
  20. '/',
  21. ';',
  22. '*',
  23. '《',
  24. '》',
  25. '<',
  26. '>',
  27. '%',
  28. '^',
  29. '_',
  30. '[',
  31. ']',
  32. '{',
  33. '}',
  34. '\\',
  35. '~',
  36. '=',
  37. '\'',
  38. '±',
  39. '°',
  40. '«',
  41. '»',
  42. 'µ',
  43. '¶',
  44. '·',
  45. '€',
  46. '£',
  47. '¥',
  48. '¢',
  49. '×',
  50. '÷',
  51. '¬',
  52. '…',
  53. '→',
  54. '←',
  55. '↑',
  56. '↓',
  57. '↔',
  58. '⇒',
  59. '⇐',
  60. '≈',
  61. '≠',
  62. '≤',
  63. '≥',
  64. '.',
  65. ',',
  66. '-',
  67. '(',
  68. ')',
  69. '@',
  70. '?',
  71. '"',
  72. '\'',
  73. '#',
  74. '+',
  75. '!',
  76. '$',
  77. '|',
  78. ':',
  79. '/',
  80. ';',
  81. '*',
  82. '<',
  83. '>',
  84. '%',
  85. '^',
  86. '_',
  87. '[',
  88. ']',
  89. '{',
  90. '}',
  91. '\',
  92. '~',
  93. '¨',
  94. '´',
  95. '',
  96. '¿',
  97. '‰',
  98. '¯',
  99. '\x1A',
  100. '£',
  101. '>',
  102. '¿',
  103. '«',
  104. '´',
  105. '»',
  106. '°',
  107. '®',
  108. '·',
  109. '¼',
  110. '©',
  111. '¶',
  112. "'",
  113. '"'
  114. ]
  115. special_char_dict = {c: ' ' for c in set(special_chars)}
  116. special_char_dict['&'] = ' and '
  117. special_char_dict['&'] = ' and '
  118. special_chars_trans = str.maketrans(special_char_dict)
  119. ind_head = [
  120. 'THE ',
  121. 'M S',
  122. 'MS'
  123. ]
  124. india_suffix_list = [
  125. ' PRIVATELIMITED',
  126. ' LLP',
  127. ' CO I PVT L',
  128. ' CO PVT L',
  129. ' CO PRIVATE L',
  130. ' CO I LTD',
  131. ' I LTD',
  132. ' I LIMITED',
  133. ' I PVT L',
  134. ' I PRIVATE L',
  135. ' COMPANY PRIVATE L',
  136. ' COMPANY PVT L',
  137. ' P LTD',
  138. ' PRIVATE L',
  139. ' PVT L',
  140. ' CO LTD',
  141. ' CO',
  142. ' INC',
  143. ' CO LIMITED',
  144. ' LTD',
  145. ' LIMITED',
  146. ' CO I',
  147. ' I'
  148. ]
  149. def clean_company_name(name):
  150. if name:
  151. # 特殊字符替换为空格
  152. name = name.translate(special_chars_trans)
  153. # 转大写,去除连续空格,去除首尾空格
  154. name = ' '.join(name.upper().split())
  155. return name
  156. else:
  157. return None
  158. def split_last(text, suffix):
  159. if text:
  160. last_occurrence_index = text.rfind(suffix)
  161. if last_occurrence_index != -1:
  162. return text[:last_occurrence_index]
  163. return text
  164. return None
  165. def india_truncate_at_suffix(text, suffix_list):
  166. for suffix in suffix_list:
  167. if suffix in text:
  168. if (
  169. suffix != ' CO' and suffix != ' INC' and suffix != ' CO LIMITED' and suffix != ' LTD'
  170. and suffix != ' LIMITED' and suffix != ' CO I' and suffix != ' I'
  171. ):
  172. return split_last(text, suffix)
  173. elif suffix == ' CO' and text.endswith(' CO'):
  174. return split_last(text, suffix)
  175. elif suffix == ' INC' and text.endswith(' INC'):
  176. return split_last(text, suffix)
  177. elif suffix == ' CO LIMITED' and ' AND CO LIMITED' not in text:
  178. return split_last(text, suffix)
  179. elif suffix == ' LTD' and text.endswith(' LTD'):
  180. return split_last(text, suffix)
  181. elif suffix == ' LIMITED' and text.endswith(' LIMITED'):
  182. return split_last(text, suffix)
  183. elif suffix == ' CO I' and text.endswith(' CO I'):
  184. return split_last(text, suffix)
  185. elif suffix == ' I' and text.endswith(' I'):
  186. return split_last(text, suffix)
  187. return text
  188. def remove_prefix(text, prefix):
  189. if text.startswith(prefix):
  190. return text[len(prefix):]
  191. return text
  192. def india_company_abbr(company_name):
  193. if company_name:
  194. bak_name = company_name.upper()
  195. # remove_dots_name = remove_dots_from_abbr(bak_name)
  196. company_name = clean_company_name(bak_name)
  197. for head in ind_head:
  198. if company_name.startswith(head):
  199. company_name = remove_prefix(company_name, head)
  200. break
  201. truncated_name = india_truncate_at_suffix(company_name, india_suffix_list)
  202. if (len(truncated_name.strip()) < 8):
  203. return clean_company_name(bak_name)
  204. else:
  205. return truncated_name.strip()
  206. return None
  207. def company_abbr(country_name: str, company_name: str) -> str or None:
  208. if country_name == 'india':
  209. return india_company_abbr(company_name)
  210. def remove_dots_from_abbr(text):
  211. # 定义正则表达式模式
  212. pattern = r'(([A-Z]\.)+) .*'
  213. # 先检查字符串是否符合模式
  214. match = re.search(pattern, text)
  215. if match:
  216. # 如果符合,则提取匹配的部分,并去掉点
  217. matched_text = match.group(1)
  218. # 去掉匹配部分中的点
  219. modified_text = matched_text.replace('.', '')
  220. # 用修改后的部分替换原始匹配部分
  221. result = text.replace(matched_text, modified_text)
  222. return result
  223. else:
  224. # 如果不符合,返回原始字符串
  225. return text
  226. if __name__ == '__main__':
  227. # 示例用法
  228. case_list = ['X.X. XXXXXX',
  229. 'A.A.A. some text B.B.B.B. more text',
  230. 'X.X.X. XXXXXX',
  231. 'K.N. TEXFAB',
  232. 'AAKASH OIL FIELD SERVICES PVT.LTD.',
  233. 'PARVEEN TRADING CO.',
  234. 'KONNET SOLUTIONS PVT. LTD.',
  235. 'DAINICHI COLOR INDIA PVT.LTD.',
  236. 'NOVA IRON & STEEL LTD.',
  237. 'RPA COPPER DISTRIBUTORS PVT.LTD.',
  238. 'DURA AUTO SYSTEMS INDIA PV.LTD.',
  239. 'SPG CORPORATION PVT.LTD.',
  240. 'MESSRS.K. KRISHNAMURTHY BOOKS & PERIODICALS',
  241. 'MALHAR FASHIONS (INDIA) PVT. LTD.',
  242. 'ELITE BREADS PVT. LTD',
  243. 'MINILEC INDIA PVT.LTD.',
  244. 'CALISTA PROPERTIES PVT.LTD.',
  245. 'PRADIP ENTERPRISES LTD.',
  246. 'ESTEE AUTO PRESSINGS PRIVATE LTD.',
  247. 'DR.(MS)BUNTY M.JAVA',
  248. 'INDUSTRADE(PROP.PHADKE SANJAY ARAVIND)',
  249. 'LEDER FX.',
  250. 'PINNACLE TELE SERVICES PVT. LTD.',
  251. 'HARIBHARAT EQUIPMENTS PVT.LTD.',
  252. 'CECáINTERNATIONALáCORPORATIONá(I)áPVT.áLTD.',
  253. 'BRUNOS COMPUTER SOLUTIONS & SOFTWARE PVT. LTD.',
  254. 'DREAMS ENTERPRISES.',
  255. 'SKR FOODS PVT. LTD.',
  256. ]
  257. for case in case_list:
  258. print(case + " ===> " + company_abbr('india', case))