company_abbr.py 54 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556
  1. import sys
  2. import re
  3. import os
  4. abspath = os.path.abspath(__file__)
  5. root_path = re.sub(r"tendata-warehouse.*", "tendata-warehouse", abspath)
  6. sys.path.append(root_path)
  7. from dw_base.spark.udf.customs.common_clean import clean_company_name
# (opener, closer) delimiter pairs tried, in this order, by
# kaz_extract_text_from_enclosers.
# NOTE(review): the ('?', '?') pair looks like quote characters that were lost
# to an encoding problem - confirm against the original file.
kaz_enclosers = [('""', '""'), ('"', '"'), ('<<', '>>'), ('?', '?')]
# Legal-form markers for Pakistani company names. Entries contain no spaces.
# pakistan_company_abbr and mirror_pakistan_company_abbr pass this list to
# truncate_at_suffix, which cuts at the first entry (in list order) that occurs
# anywhere in the name.
pakistan_suffix_list = [
    'GROUPCOMPANYLIMITED',
    'LIMITEDPARTNERSHIP',
    'CORPORATIONLIMITED',
    'SMCPRIVATE',
    'OFCOMPANY',
    'PRIVATELIMIT',
    'PRIVATECO',
    'LIABILITYCOMPANY',
    'LIMITEDCOMPANY',
    'COMPANYLIMITED',
    'INCORPORAT',
    'CORPORATION',
    'GROUPCOLTD',
    'COMPANYLTD',
    'COLIMITED',
    'GROUPLTD',
    'SMCPVT',
    'PVTLIMIT',
    'PVTCOLTD',
    'PVTLTD',
    'FACTORY',
    'CORPLTD',
    'COMPANY',
    'PTYLTD',
    'AGENCY',
    'OFFICE',
    'CENTER',
    'COLTD',
    'COINC',
    'C0LTD',
    'LIMIT',
    'CORP',
    'LLC',
    'LTD',
    'COLT'
]
  46. SECOND_AMERICA_SUFFIX_LIST = [
  47. ' UNLIMITED',
  48. ' LIMITED',
  49. ' CO LTD',
  50. ' COMPANY LTD',
  51. ' AND COMPANY',
  52. ' CORPORATION',
  53. ' CORP',
  54. ' COMPANY INC',
  55. ' COMPANY',
  56. ' LLC',
  57. ' CO INC',
  58. ' CO',
  59. ' MD',
  60. ' LTD',
  61. ' INC'
  62. ' LLP',
  63. ' PLC',
  64. ' EST',
  65. ]
  66. third_AMERICA_SUFFIX_LIST = [
  67. ' CORPORATION',
  68. ' COMPANY LTD',
  69. ' COMPANY INC',
  70. ' UNLIMITED',
  71. ' LIMITED',
  72. ' CO LTD',
  73. ' COMPANY',
  74. ' CO INC',
  75. ' CORP',
  76. ' LLC',
  77. ' LTD',
  78. ' INC'
  79. ' LLP',
  80. ' PLC',
  81. ' EST',
  82. ' CO',
  83. ' MD',
  84. ]
# Suffix markers for Chilean names; every entry starts with a space.
# chile_truncate_at_suffix special-cases exactly these strings (end-of-name
# matches only); presumably this list is what callers pass to it - confirm.
first_chile_SUFFIX_LIST = [
    ' SPA',
    ' S A',
    ' SA',
    ' LTDA',
    ' LIMITADA',
    ' LLC',
    ' SOCIEDAD ANONIMA',
    ' CO LTD',
    ' LTD',
    ' LIMI',
    ' E I R'
]
# Markers for the first Bangladesh pass. bangladesh_truncate_at_suffix_first
# cuts on the "... FROM" entries anywhere in the name and on the other three
# only at the end of the name.
first_bangladesh_suffix_list = [
    'CHANGED FROM',
    'CHANGED',
    'CHANGE FROM',
    'CHANGE',
    'EXCHANGE'
]
# Ukrainian markers, space-padded on both sides. The consumer is not visible
# in this part of the file.
ukraine_suffix_first = [
    ' М КИЇВ ВУЛ ',
    ' ВУЛ '
]
# Space-padded legal-form markers; the consumer is not visible in this part
# of the file.
ukraine_suffix_second = [
    ' S R O ',
    ' Z O O '
]
# Markers for the second Bangladesh pass (no leading spaces).
# NOTE(review): bangladesh_truncate_at_suffix_second has no branch for
# 'CO LTD', so that entry is ignored by it - confirm whether that is intended.
second_bangladesh_suffix_list = [
    'PVT CO LIMITED',
    'PVT LIMITED',
    'LIMITED',
    'PVT LTD',
    'LTD',
    'PVT',
    'CO LTD',
    'CO',
    'PLC'
]
# Rwandan suffix markers.
# NOTE(review): ' CO LTD' carries a leading space here, while
# rwanda_truncate_at_suffix special-cases 'CO LTD' without one - confirm.
FIRST_Rwanda_suffix_list = [
    'COMPANY RWANDA LTD',
    'COMPANY LTD',
    ' CO LTD',
    'LTD',
    'LIMITED'
]
# Suffix markers for English names; every entry starts with a space.
# Presumably passed to england_truncate_at_suffix - confirm at the call site.
FIRST_england_suffix_list = [
    ' COMPANY LIMITED',
    ' ENTERPRISES LTD',
    ' LIMITED',
    ' COMPANY',
    ' CO LTD',
    ' LTD',
    ' LLP'
]
# Suffix markers for Philippine names; only some entries carry a leading
# space. Presumably passed to philippines_truncate_at_suffix - confirm.
FIRST_philippines_suffix_list = [
    ' CO INC',
    ' CO LTD',
    'INC',
    'CORPORATION',
    'CORP',
    'LLC',
    'ENTERPRISES',
    'INCORPORATED',
    ' CO',
    'PTE LTD',
    'PTY LTD',
    'LTD',
    'GMBH',
    'S R L',
    'SRL'
]
# Suffix markers for Colombian names (no leading spaces).
# Presumably passed to colombia_truncate_at_suffix - confirm at the call site.
FIRST_colombia_suffix_list = [
    "LIMITADA",
    "S A S",
    "LITDA",
    "LTDA",
    "SAS",
    "S A",
    "LLC"
]
# Suffix markers for the first pass over American company names.
# america_truncate_at_suffix_first treats some of these strings as
# end-of-name-only matches and cuts on the rest wherever they occur;
# presumably this list is what callers pass to it - confirm.
# NOTE(review): "frist" looks like a typo for "first"; the name is kept
# because callers outside this view may reference it.
frist_america_suffix_list = [
    'PRODUCT',
    'UNION OF THE UNITED STATES',
    ' FOUNDATION',
    'SA DE CV',
    ' UNLIMITED',
    ' LIMITED',
    'CENTERS OF AMERICA',
    ' AMERICA CORP',
    ' USA CORP',
    ' CORP',
    ' CORPORATION',
    'FOUNDATION',
    ' PLLC',
    ' LP',
    ' PA',
    ' CO',
    'ENTERPRISE',
    'COMPANY',
    ' AMERICA LLC',
    ' AMERICA INC',
    ' USA LLC',
    ' USA INC',
    ' FL LLC',
    ' FL INC',
    ' 2 LLC',
    ' 2 INC',
    ' 3 LLC',
    ' 3 INC',
    ' 2022 LLC',
    ' 2022 INC',
    ' 2021 LLC',
    ' 2021 INC',
    ' 2020 LLC',
    ' 2020 INC',
    ' CO LLC',
    ' CO INC',
    ' LLC',
    ' INC',
    ' CO LTD',
    ' LTD'
]
# Suffix markers for Indonesian names. indonesia_truncate_at_suffix matches
# ' CO', ' CORP', 'CORPORATION', ' INC', ' INDONESIA' and ' TBK' only at the
# end of the name and cuts on every other entry wherever it occurs.
indonesia_suffix_list = [
    'AGENC',
    'COMPANY',
    'DEVELOPMENT',
    'ORGANIZATION',
    'ASSOCIATION',
    'SERVICE',
    'GROUP',
    'PTY LTD',
    'PTY LIMIT',
    ' CO LTD',
    ' CO LIMIT',
    ' PTE LTD',
    'INDONESIA CO',
    'INDONESIA INCORP',
    'INDONESIA LTD',
    'PHILS CO',
    'INDONESIA UNLIMIT',
    ' ASIA CO',
    ' ASIA UNLIMITED',
    'INCORPORATED',
    'ENTERPRISE',
    ' INDONESIA INC',
    ' ASIA INC',
    ' INDONESIA CO INC',
    ' CO',
    ' CORP',
    'CORPORATION',
    ' INC',
    ' INDONESIA',
    ' TBK'
]
# Markers for Venezuelan names. venezuela_truncate_at_suffix matches the
# entries from 'COOPERATIVA' onwards at the start of the name, 'LTD' anywhere,
# and all remaining entries only at the end of the name.
venezuela_suffix_list = [
    'S A',
    'C A',
    'R L',
    'R S',
    'F P',
    'S R L',
    'LTD',
    'INC',
    'COMPANY C A',
    'COMPAÑIA ANONIMA',
    'CORPORATION C A',
    'COOPERATIVA',
    'INTERNATIONAL',
    'CORPORACIÓN',
    'REPRESENTACIONES',
    'ASOCIACION CIVIL',
    'FUNDACION'
]
# Leading legal-form tokens for Kazakh names; the consumer is not visible in
# this part of the file.
# NOTE(review): entries appear to mix Latin and Cyrillic look-alike letters
# (e.g. "TOO" vs "ООО") - confirm this is deliberate.
kaz_heads = [
    "TOO",
    "ООО",
    "АО",
    "ФХ",
    "ИП OOO",
    "НПЦ ООО",
    "СП OOO",
    "ЧП"
]
# Legal-form phrases for Moldovan names. moldova_truncate_at_suffix tests
# these at the start of the name (despite the "suffix" in the list name).
moldova_suffix_list = [
    'ASOCIATIA GOSPODARIILOR TARANESTI',
    'COOPERATIVA DE ÎNTREPRINZATOR',
    'COOPERATIVA DE PRODUCERE',
    'COOPERATIVA DE',
    'COOPERATIVA AGRICOLA DE INTREPRINZATOR',
    'COOPERATIVA AGRICOLA',
    'CENTRUL TEHNIC',
    'COMPANIA',
    'FIRMA COOPERATISTA TEHNICO-STIINTIFICA DE PRODUCTIE',
    'FIRMA DE PRODUCTIE',
    'FIRMA DE PRODUCŢIE ŞI COMERŢ',
    'FIRMA',
    'SOCIETATEA COMERCIALĂ',
    'SOCIETATEA CU RASPUNDERE LIMITATA FIRMA',
    'SOCIETATEA CU RĂSPUNDERE LIMITATĂ',
    'SOCIETATEA CU RASPUNDERE LIMITATA',
    'SOCIETATEA PE ACTIUNI',
    'SOCIETATEA IN NUME COLECTIV AGENTIA',
    'INTREPRINDEREA INDIVIDUALA',
    'ÎNTREPRINZĂTOR INDIVIDUAL',
    'ÎNTREPRINDEREA INDIVIDUALĂ',
    'ÎNTREPRINDEREA MUNICIPALĂ',
    'ÎNTREPRINDEREA CU CAPITAL STRĂIN',
    'INSTITUŢIA MEDICO-SANITARĂ PUBLICĂ',
    'REDACTIA GAZETEI',
    'ORGANIZATIA DE ADMINISTRARE FIDUCIARA A INVESTITILOR',
    'S R L',
    'SOCIETATEA CU RESPONSABILITATE LIMITATA',
    'SOCIETATE CU RĂSPUNDERE LIMITATĂ'
]
# The last three entries of moldova_suffix_list repeated as a separate list;
# its consumer is not visible in this part of the file.
moldova_suffix_list2 = [
    'S R L',
    'SOCIETATEA CU RESPONSABILITATE LIMITATA',
    'SOCIETATE CU RĂSPUNDERE LIMITATĂ'
]
# Suffix markers for Singaporean names (no leading spaces); the consumer is
# not visible in this part of the file.
singapore_suffix_list = [
    'SINGAPORE PTE LTD',
    'S PTE LTD',
    'PTE LTD',
    'ENTERPRISES',
    'ENTERPRISE',
    'ENT',
    'AGENCIES',
    'AGENCY',
    'PRIVATE LIMITED',
    'COMPANY',
    'LLP',
    'CO'
]
# Suffix markers for Hong Kong names; every entry starts with a space.
# The consumer is not visible in this part of the file.
hongkong_suffix_list = [
    ' CO LIMITED',
    ' LIMITED',
    ' CO LTD',
    ' COMPANY',
    ' LTD'
]
# Suffix markers for Chinese names; every entry starts with a space and longer
# forms are listed before the shorter forms they contain.
# The consumer is not visible in this part of the file.
china_suffix_list = [
    ' GROUP CORPORATION LIMITED',
    ' CORPORATION LIMITED',
    ' GROUP CORPORATION',
    ' GROUP CO LIMITED',
    ' LIMITED COMPANY',
    ' COMPANY LIMITED',
    ' GROUP CO LTD',
    ' CORPORATION',
    ' CO LIMITED',
    ' GROUP CORP',
    ' CORP LTD',
    ' LIMITED',
    ' COMPANY',
    ' FACTORY',
    ' CO LTD',
    ' CO INC',
    ' CORP',
    ' INC',
    ' CO'
]
# Vietnamese separators with a trailing space; the consumer is not visible in
# this part of the file.
vietnam_right_separator_list = [
    'COMPANY LIMITED ',
    'COMPANY LTD '
]
# Vietnamese separators with a leading space; the consumer is not visible in
# this part of the file.
vietnam_left_separator_list = [
    ' CO LTD',
    ' PTE LTD',
    ' JOINT STOCK COMPANY',
    ' COMPANY'
]
# Suffix markers for Vietnamese names; every entry starts with a space.
# The consumer is not visible in this part of the file.
vietnam_suffix_list = [
    ' CORP',
    ' LLC',
    ' CO JSC',
    ' JSC',
    ' LTD'
]
# Leading tokens for Indian names (presumably the "M/S" honorific once
# punctuation is removed - confirm). The consumer is not visible in this part
# of the file.
ind_head = [
    'M S',
    'MS'
]
# Suffix markers for Indian names; every entry starts with a space and several
# are deliberately truncated forms (e.g. ' PVT L').
# The consumer is not visible in this part of the file.
india_suffix_list = [
    ' CO I PVT L',
    ' CO PVT L',
    ' CO PRIVATE L',
    ' CO I LTD',
    ' I LTD',
    ' I LIMITED',
    ' I PVT L',
    ' I PRIVATE L',
    ' COMPANY PRIVATE L',
    ' COMPANY PVT L',
    ' P LTD',
    ' PRIVATE L',
    ' PVT L',
    ' CO',
    ' INC',
    ' CO LIMITED',
    ' LTD',
    ' LIMITED',
    ' CO I',
    ' I'
]
# Suffix markers for Mexican names; every entry starts with a space.
# The consumer is not visible in this part of the file.
mexico_suffix_list = [
    ' S P R DE R L DE C V',
    ' S DE R L DE C V',
    ' S DE RL DE CV',
    ' S A P I DE CV',
    ' S P R DE R L',
    ' S A DE C V',
    ' SA DE CV'
]
# Suffix markers for Nigerian names; every entry starts with a space.
# The consumer is not visible in this part of the file.
nigeria_suffix_list = [
    ' COMPANY LIMITED',
    ' COMPANY LTD',
    ' COMPANY',
    ' LIMITED',
    ' PTE LTD',
    ' CO LTD',
    ' LTD',
    ' LLC'
]
# Suffix markers for Peruvian names (no leading spaces).
# The consumer is not visible in this part of the file.
peru_suffix_list = [
    'SOCIEDAD ANONIMA CERRADA',
    'SOCIEDAD ANONIMA CER',
    'E I R LTDA',
    'S R LTDA',
    'E I R L',
    'S R L',
    'S A C',
    'SAC',
    'S A'
]
# Suffix markers for Lesotho names; every entry starts with a space and some
# contain parentheses. The consumer is not visible in this part of the file.
lesotho_suffix_list = [
    ' LLC (EXTERNAL COMPANY) LTD',
    ' LLC (EXTERNAL COMPANY)',
    ' (PROPRIETARY) LIMITED',
    ' COMPANY (PTY) LTD',
    ' COMPANY LIMITED',
    ' COMPANY LTD',
    ' LIMITED',
    ' PTY LTD',
    ' CO LTD'
]
# Suffix markers for German names (no leading spaces), longer forms first.
# The consumer is not visible in this part of the file.
germany_suffix_list = [
    'GMBH AND CO KGAA',
    'GMBH AND CO OHG',
    'GMBH AND CO KG',
    'AG AND CO KGAA',
    'AG AND CO OHG',
    'LIMITED ŞTI',
    'GMBH AND CO',
    'S A DE C V',
    'CO LIMITED',
    'LIMITED',
    'S R L',
    'GMBH',
    'GBR',
    'SRL',
    'INC',
    'LLC',
    'OHG',
    'A S',
    'E K',
    'AG',
    'SA',
    'UG'
]
  453. def kaz_extract_text_from_enclosers(text):
  454. result = text
  455. for encloser in kaz_enclosers:
  456. open_str, close_str = encloser[0], encloser[1]
  457. open_inx = text.find(open_str)
  458. close_inx = text.rfind(close_str)
  459. if close_inx - open_inx > 1:
  460. return text[open_inx + 1:close_inx]
  461. return result
  462. def remove_prefix(text, prefix):
  463. if text.startswith(prefix):
  464. return text[len(prefix):]
  465. return text
  466. def truncate_at_suffix(text, suffix_list):
  467. for suffix in suffix_list:
  468. if suffix in text:
  469. parts = text.split(suffix, 1)
  470. return parts[0]
  471. return text
  472. def pakistan_company_abbr(company_name: str) -> str or None:
  473. if company_name:
  474. upper_name = company_name.upper()
  475. cleaned_name = re.sub(r'[^A-Z0-9]', '', upper_name)
  476. removed_prefix_name = remove_prefix(cleaned_name, 'ms')
  477. truncated_name = truncate_at_suffix(removed_prefix_name, pakistan_suffix_list).strip()
  478. if len(truncated_name) > 4:
  479. return truncated_name
  480. elif len(removed_prefix_name) > 4:
  481. return removed_prefix_name
  482. return None
  483. def mirror_pakistan_company_abbr(company_name: str) -> str or None:
  484. if company_name:
  485. upper_name = company_name.upper()
  486. cleaned_name = re.sub(r'[^A-Z0-9 ]', '', upper_name)
  487. removed_prefix_name = remove_prefix(cleaned_name, 'ms').strip()
  488. truncated_name = truncate_at_suffix(removed_prefix_name, pakistan_suffix_list).strip()
  489. if len(truncated_name) > 4:
  490. return truncated_name
  491. elif len(removed_prefix_name) > 4:
  492. return removed_prefix_name
  493. return None
  494. def split_last(text, suffix):
  495. if text:
  496. last_occurrence_index = text.rfind(suffix)
  497. if last_occurrence_index != -1:
  498. return text[:last_occurrence_index]
  499. return text
  500. return None
  501. # 纳米比亚进口的mc_org处理逻辑
  502. def split_first_dtp(text):
  503. if text:
  504. if " ---DTP" in text:
  505. return text.split(" ---DTP", 1)[0]
  506. elif "---DTP" in text:
  507. return text.split("---DTP", 1)[0]
  508. elif "--DTP" in text:
  509. return text.split("--DTP", 1)[0]
  510. else:
  511. return text
  512. return None
  513. def america_truncate_at_suffix_first(text, suffix_list):
  514. for suffix in suffix_list:
  515. if suffix in text:
  516. if (suffix != ' FOUNDATION' and suffix != ' UNLIMITED'
  517. and suffix != ' AMERICA CORP' and suffix != ' USA CORP' and suffix != ' CORP'
  518. and suffix != ' CORPORATION' and suffix != 'FOUNDATION'
  519. and suffix != ' PLLC' and suffix != ' LP' and suffix != ' PA' and suffix != ' CO' and suffix != 'ENTERPRISE'
  520. and suffix != 'COMPANY'
  521. and suffix != ' LLC' and suffix != ' INC'):
  522. return split_last(text, suffix)
  523. elif suffix == ' FOUNDATION' and text.endswith(' FOUNDATION'):
  524. return split_last(text, suffix)
  525. elif suffix == ' UNLIMITED' and text.endswith(' UNLIMITED'):
  526. return split_last(text, suffix)
  527. elif suffix == ' AMERICA CORP' and text.endswith(' AMERICA CORP'):
  528. return split_last(text, suffix)
  529. elif suffix == ' USA CORP' and text.endswith(' USA CORP'):
  530. return split_last(text, suffix)
  531. elif suffix == ' CORP' and text.endswith(' CORP'):
  532. return split_last(text, suffix)
  533. elif suffix == ' CORPORATION' and text.endswith(' CORPORATION'):
  534. return split_last(text, suffix)
  535. elif suffix == 'FOUNDATION' and text.endswith('FOUNDATION'):
  536. return split_last(text, suffix)
  537. elif suffix == ' PLLC' and text.endswith(' PLLC'):
  538. return split_last(text, suffix)
  539. elif suffix == ' LP' and text.endswith(' LP'):
  540. return split_last(text, suffix)
  541. elif suffix == ' PA' and text.endswith(' PA'):
  542. return split_last(text, suffix)
  543. elif suffix == ' CO' and text.endswith(' CO'):
  544. return split_last(text, suffix)
  545. elif suffix == 'ENTERPRISE' and text.endswith('ENTERPRISE'):
  546. return split_last(text, suffix)
  547. elif suffix == 'COMPANY' and text.endswith('COMPANY'):
  548. return split_last(text, suffix)
  549. elif suffix == ' LLC' and text.endswith(' LLC'):
  550. return split_last(text, suffix)
  551. elif suffix == ' INC' and text.endswith(' INC'):
  552. return split_last(text, suffix)
  553. return text
  554. def america_truncate_at_suffix_second(text, suffix_list):
  555. for suffix in suffix_list:
  556. if suffix in text:
  557. if (suffix != ' UNLIMITED' and suffix != ' LIMITED'
  558. and suffix != ' AND COMPANY' and suffix != ' CORPORATION' and suffix != ' CORP'
  559. and suffix != ' COMPANY' and suffix != ' LLC'
  560. and suffix != ' CO'
  561. and suffix != ' MD' and suffix != ' LTD' and suffix != ' INC'
  562. and suffix != ' PLC' and suffix != ' LLP' and suffix != ' EST'
  563. ):
  564. return split_last(text, suffix)
  565. elif suffix == ' UNLIMITED' and text.endswith(' UNLIMITED'):
  566. return split_last(text, suffix)
  567. elif suffix == ' LIMITED' and text.endswith(' LIMITED'):
  568. return split_last(text, suffix)
  569. elif suffix == ' AND COMPANY' and text.endswith(' AND COMPANY'):
  570. return split_last(text, suffix)
  571. elif suffix == ' CORPORATION' and text.endswith(' CORPORATION'):
  572. return split_last(text, suffix)
  573. elif suffix == ' CORP' and text.endswith(' CORP'):
  574. return split_last(text, suffix)
  575. elif suffix == ' COMPANY' and text.endswith(' COMPANY'):
  576. return split_last(text, suffix)
  577. elif suffix == ' LLC' and text.endswith(' LLC'):
  578. return split_last(text, suffix)
  579. elif suffix == ' CO' and text.endswith(' CO'):
  580. return split_last(text, suffix)
  581. elif suffix == ' MD' and text.endswith(' MD'):
  582. return split_last(text, suffix)
  583. elif suffix == ' LTD' and text.endswith(' LTD'):
  584. return split_last(text, suffix)
  585. elif suffix == ' INC' and text.endswith(' INC'):
  586. return split_last(text, suffix)
  587. elif suffix == ' LLP' and text.endswith(' LLP'):
  588. return split_last(text, suffix)
  589. elif suffix == ' PLC' and text.endswith(' PLC'):
  590. return split_last(text, suffix)
  591. elif suffix == ' EST' and text.endswith(' EST'):
  592. return split_last(text, suffix)
  593. return text
  594. def america_truncate_at_suffix_third(text, suffix_list):
  595. for suffix in suffix_list:
  596. if suffix in text:
  597. if (suffix != ' UNLIMITED' and suffix != ' LIMITED'
  598. and suffix != ' CORPORATION' and suffix != ' CORP'
  599. and suffix != ' COMPANY' and suffix != ' LLC'
  600. and suffix != ' CO'
  601. and suffix != ' MD' and suffix != ' LTD' and suffix != ' INC'
  602. and suffix != ' PLC' and suffix != ' LLP' and suffix != ' EST'
  603. ):
  604. return split_last(text, suffix)
  605. elif suffix == ' CORPORATION' and text.endswith(' CORPORATION'):
  606. return split_last(text, suffix)
  607. elif suffix == ' UNLIMITED' and text.endswith(' UNLIMITED'):
  608. return split_last(text, suffix)
  609. elif suffix == ' LIMITED' and text.endswith(' LIMITED'):
  610. return split_last(text, suffix)
  611. elif suffix == ' COMPANY' and text.endswith(' COMPANY'):
  612. return split_last(text, suffix)
  613. elif suffix == ' CORP' and text.endswith(' CORP'):
  614. return split_last(text, suffix)
  615. elif suffix == ' LLC' and text.endswith(' LLC'):
  616. return split_last(text, suffix)
  617. elif suffix == ' LTD' and text.endswith(' LTD'):
  618. return split_last(text, suffix)
  619. elif suffix == ' INC' and text.endswith(' INC'):
  620. return split_last(text, suffix)
  621. elif suffix == ' LLP' and text.endswith(' LLP'):
  622. return split_last(text, suffix)
  623. elif suffix == ' PLC' and text.endswith(' PLC'):
  624. return split_last(text, suffix)
  625. elif suffix == ' EST' and text.endswith(' EST'):
  626. return split_last(text, suffix)
  627. elif suffix == ' CO' and text.endswith(' CO'):
  628. return split_last(text, suffix)
  629. elif suffix == ' MD' and text.endswith(' MD'):
  630. return split_last(text, suffix)
  631. return text
  632. def bangladesh_truncate_at_suffix_first(text, suffix_list):
  633. for suffix in suffix_list:
  634. if suffix in text:
  635. if (suffix != 'CHANGED') and suffix != 'CHANGE' and suffix != 'EXCHANGE':
  636. return split_last(text, suffix)
  637. elif suffix == 'CHANGED' and text.endswith('CHANGED'):
  638. return split_last(text, suffix)
  639. elif suffix == 'CHANGE' and text.endswith('CHANGE'):
  640. return split_last(text, suffix)
  641. elif suffix == 'EXCHANGE' and text.endswith('EXCHANGE'):
  642. return split_last(text, suffix)
  643. return text
  644. def bangladesh_truncate_at_suffix_second(text, suffix_list):
  645. for suffix in suffix_list:
  646. if suffix in text:
  647. if suffix == 'PVT CO LIMITED' and text.endswith('PVT CO LIMITED'):
  648. return split_last(text, suffix)
  649. elif suffix == 'PVT LIMITED' and text.endswith('PVT LIMITED'):
  650. return split_last(text, suffix)
  651. elif suffix == 'LIMITED' and text.endswith('LIMITED'):
  652. return split_last(text, suffix)
  653. elif suffix == 'PVT LTD' and text.endswith('PVT LTD'):
  654. return split_last(text, suffix)
  655. elif suffix == 'LTD' and text.endswith('LTD'):
  656. return split_last(text, suffix)
  657. elif suffix == 'PVT' and text.endswith('PVT'):
  658. return split_last(text, suffix)
  659. elif suffix == 'LTD' and text.endswith('LTD'):
  660. return split_last(text, suffix)
  661. elif suffix == 'CO' and text.endswith('CO'):
  662. return split_last(text, suffix)
  663. elif suffix == 'PLC' and text.endswith('PLC'):
  664. return split_last(text, suffix)
  665. elif suffix == 'PVT':
  666. return split_last(text, suffix)
  667. return text
  668. def indonesia_truncate_at_suffix(text, suffix_list):
  669. for suffix in suffix_list:
  670. if suffix in text:
  671. if (suffix != ' CO' and suffix != ' CORP' and suffix != 'CORPORATION' and suffix != ' INC'
  672. and suffix != ' INDONESIA' and suffix != ' TBK'):
  673. return split_last(text, suffix)
  674. elif suffix == ' CO' and text.endswith(' CO'):
  675. return split_last(text, suffix)
  676. elif suffix == ' CORP' and text.endswith(' CORP'):
  677. return split_last(text, suffix)
  678. elif suffix == 'CORPORATION' and text.endswith('CORPORATION'):
  679. return split_last(text, suffix)
  680. elif suffix == ' INC' and text.endswith(' INC'):
  681. return split_last(text, suffix)
  682. elif suffix == ' INDONESIA' and text.endswith(' INDONESIA'):
  683. return split_last(text, suffix)
  684. elif suffix == ' TBK' and text.endswith(' TBK'):
  685. return split_last(text, suffix)
  686. return text
  687. def rwanda_truncate_at_suffix(text, suffix_list):
  688. for suffix in suffix_list:
  689. if suffix in text:
  690. if (suffix != 'COMPANY RWANDA LTD' and suffix != 'COMPANY LTD' and suffix != 'CO LTD'):
  691. return split_last(text, suffix)
  692. elif suffix == 'COMPANY RWANDA LTD' and text.endswith('COMPANY RWANDA LTD'):
  693. return split_last(text, suffix)
  694. elif suffix == 'COMPANY LTD' and text.endswith('COMPANY LTD'):
  695. return split_last(text, suffix)
  696. elif suffix == 'CO LTD' and text.endswith('CO LTD'):
  697. return split_last(text, suffix)
  698. return text
  699. def philippines_truncate_at_suffix(text, suffix_list):
  700. for suffix in suffix_list:
  701. if suffix in text:
  702. if text.endswith(suffix):
  703. return split_last(text, suffix)
  704. return text
  705. def england_truncate_at_suffix(text, suffix_list):
  706. for suffix in suffix_list:
  707. if suffix in text:
  708. if text.endswith(suffix):
  709. return split_last(text, suffix)
  710. return text
  711. def colombia_truncate_at_suffix(text, suffix_list):
  712. for suffix in suffix_list:
  713. if suffix in text:
  714. if text.endswith(suffix):
  715. return split_last(text, suffix)
  716. return text
  717. def chile_truncate_at_suffix(text, suffix_list):
  718. for suffix in suffix_list:
  719. if suffix in text:
  720. if (suffix != ' SPA' and suffix != ' S A' and suffix != ' SA' and suffix != ' LTDA'
  721. and suffix != ' LIMITADA' and suffix != ' LLC'
  722. and suffix != ' SOCIEDAD ANONIMA' and suffix != ' CO LTD' and suffix != ' LTD' and suffix != ' LIMI'
  723. and suffix != ' E I R'):
  724. return split_last(text, suffix)
  725. elif suffix == ' SPA' and text.endswith(' SPA'):
  726. return split_last(text, suffix)
  727. elif suffix == ' S A' and text.endswith(' S A'):
  728. return split_last(text, suffix)
  729. elif suffix == ' SA' and text.endswith(' SA'):
  730. return split_last(text, suffix)
  731. elif suffix == ' LTDA' and text.endswith(' LTDA'):
  732. return split_last(text, suffix)
  733. elif suffix == ' LIMITADA' and text.endswith(' LIMITADA'):
  734. return split_last(text, suffix)
  735. elif suffix == ' LLC' and text.endswith(' LLC'):
  736. return split_last(text, suffix)
  737. elif suffix == ' SOCIEDAD ANONIMA' and text.endswith(' SOCIEDAD ANONIMA'):
  738. return split_last(text, suffix)
  739. elif suffix == ' CO LTD' and text.endswith(' CO LTD'):
  740. return split_last(text, suffix)
  741. elif suffix == ' LTD' and text.endswith(' LTD'):
  742. return split_last(text, suffix)
  743. elif suffix == ' LIMI' and text.endswith(' LIMI'):
  744. return split_last(text, suffix)
  745. elif suffix == ' E I R' and text.endswith(' E I R'):
  746. return split_last(text, suffix)
  747. return text
  748. def venezuela_truncate_at_suffix(text, suffix_list):
  749. for suffix in suffix_list:
  750. if suffix in text:
  751. if (
  752. suffix != 'S A' and suffix != 'C A' and suffix != 'R L' and suffix != 'R S' and suffix != 'F P' and suffix != 'S R L'
  753. and suffix != 'INC' and suffix != 'COMPANY C A' and suffix != 'COMPAÑIA ANONIMA' and suffix != 'CORPORATION C A'
  754. and suffix != 'COOPERATIVA' and suffix != 'INTERNATIONAL' and suffix != 'CORPORACIÓN' and suffix != 'REPRESENTACIONES'
  755. and suffix != 'ASOCIACION CIVIL' and suffix != 'FUNDACION'
  756. ):
  757. return split_last(text, suffix)
  758. elif suffix == 'S A' and text.endswith('S A'):
  759. return split_last(text, suffix)
  760. elif suffix == 'C A' and text.endswith('C A'):
  761. return split_last(text, suffix)
  762. elif suffix == 'R L' and text.endswith('R L'):
  763. return split_last(text, suffix)
  764. elif suffix == 'R S' and text.endswith('R S'):
  765. return split_last(text, suffix)
  766. elif suffix == 'F P' and text.endswith('F P'):
  767. return split_last(text, suffix)
  768. elif suffix == 'S R L' and text.endswith('S R L'):
  769. return split_last(text, suffix)
  770. elif suffix == 'INC' and text.endswith('INC'):
  771. return split_last(text, suffix)
  772. elif suffix == 'COMPANY C A' and text.endswith('COMPANY C A'):
  773. return split_last(text, suffix)
  774. elif suffix == 'COMPAÑIA ANONIMA' and text.endswith('COMPAÑIA ANONIMA'):
  775. return split_last(text, suffix)
  776. elif suffix == 'CORPORATION C A' and text.endswith('CORPORATION C A'):
  777. return split_last(text, suffix)
  778. elif suffix == 'COOPERATIVA' and text.startswith('COOPERATIVA'):
  779. return text.split(suffix, 1)[1]
  780. elif suffix == 'INTERNATIONAL' and text.startswith('INTERNATIONAL'):
  781. return text.split(suffix, 1)[1]
  782. elif suffix == 'CORPORACIÓN' and text.startswith('CORPORACIÓN'):
  783. return text.split(suffix, 1)[1]
  784. elif suffix == 'REPRESENTACIONES' and text.startswith('REPRESENTACIONES'):
  785. return text.split(suffix, 1)[1]
  786. elif suffix == 'ASOCIACION CIVIL' and text.startswith('ASOCIACION CIVIL'):
  787. return text.split(suffix, 1)[1]
  788. elif suffix == 'FUNDACION' and text.startswith('FUNDACION'):
  789. return text.split(suffix, 1)[1]
  790. return text
  791. def moldova_truncate_at_suffix(text, suffix_list):
  792. for suffix in suffix_list:
  793. if suffix in text:
  794. if suffix == 'ASOCIATIA GOSPODARIILOR TARANESTI' and text.startswith('ASOCIATIA GOSPODARIILOR TARANESTI'):
  795. return text.split(suffix, 1)[1]
  796. elif suffix == 'COOPERATIVA DE ÎNTREPRINZATOR' and text.startswith('COOPERATIVA DE ÎNTREPRINZATOR'):
  797. return text.split(suffix, 1)[1]
  798. elif suffix == 'COOPERATIVA DE PRODUCERE' and text.startswith('COOPERATIVA DE PRODUCERE'):
  799. return text.split(suffix, 1)[1]
  800. elif suffix == 'COOPERATIVA DE' and text.startswith('COOPERATIVA DE'):
  801. return text.split(suffix, 1)[1]
  802. elif suffix == 'COOPERATIVA AGRICOLA DE INTREPRINZATOR' and text.startswith(
  803. 'COOPERATIVA AGRICOLA DE INTREPRINZATOR'):
  804. return text.split(suffix, 1)[1]
  805. elif suffix == 'COOPERATIVA AGRICOLA' and text.startswith('COOPERATIVA AGRICOLA'):
  806. return text.split(suffix, 1)[1]
  807. elif suffix == 'CENTRUL TEHNIC' and text.startswith('CENTRUL TEHNIC'):
  808. return text.split(suffix, 1)[1]
  809. elif suffix == 'COMPANIA' and text.startswith('COMPANIA'):
  810. return text.split(suffix, 1)[1]
  811. elif suffix == 'FIRMA COOPERATISTA TEHNICO-STIINTIFICA DE PRODUCTIE' and text.startswith(
  812. 'FIRMA COOPERATISTA TEHNICO-STIINTIFICA DE PRODUCTIE'):
  813. return text.split(suffix, 1)[1]
  814. elif suffix == 'FIRMA DE PRODUCTIE' and text.startswith('FIRMA DE PRODUCTIE'):
  815. return text.split(suffix, 1)[1]
  816. elif suffix == 'FIRMA DE PRODUCŢIE ŞI COMERŢ' and text.startswith('FIRMA DE PRODUCŢIE ŞI COMERŢ'):
  817. return text.split(suffix, 1)[1]
  818. elif suffix == 'FIRMA' and text.startswith('FIRMA'):
  819. return text.split(suffix, 1)[1]
  820. elif suffix == 'SOCIETATEA COMERCIALĂ' and text.startswith('SOCIETATEA COMERCIALĂ'):
  821. return text.split(suffix, 1)[1]
  822. elif suffix == 'SOCIETATEA CU RASPUNDERE LIMITATA FIRMA' and text.startswith(
  823. 'SOCIETATEA CU RASPUNDERE LIMITATA FIRMA'):
  824. return text.split(suffix, 1)[1]
  825. elif suffix == 'SOCIETATEA CU RĂSPUNDERE LIMITATĂ' and text.startswith('SOCIETATEA CU RĂSPUNDERE LIMITATĂ'):
  826. return text.split(suffix, 1)[1]
  827. elif suffix == 'SOCIETATEA CU RASPUNDERE LIMITATA' and text.startswith('SOCIETATEA CU RASPUNDERE LIMITATA'):
  828. return text.split(suffix, 1)[1]
  829. elif suffix == 'SOCIETATEA PE ACTIUNI' and text.startswith('SOCIETATEA PE ACTIUNI'):
  830. return text.split(suffix, 1)[1]
  831. elif suffix == 'SOCIETATEA IN NUME COLECTIV AGENTIA' and text.startswith(
  832. 'SOCIETATEA IN NUME COLECTIV AGENTIA'):
  833. return text.split(suffix, 1)[1]
  834. elif suffix == 'INTREPRINDEREA INDIVIDUALA' and text.startswith('INTREPRINDEREA INDIVIDUALA'):
  835. return text.split(suffix, 1)[1]
  836. elif suffix == 'ÎNTREPRINZĂTOR INDIVIDUAL' and text.startswith('ÎNTREPRINZĂTOR INDIVIDUAL'):
  837. return text.split(suffix, 1)[1]
  838. elif suffix == 'ÎNTREPRINDEREA INDIVIDUALĂ' and text.startswith('ÎNTREPRINDEREA INDIVIDUALĂ'):
  839. return text.split(suffix, 1)[1]
  840. elif suffix == 'ÎNTREPRINDEREA MUNICIPALĂ' and text.startswith('ÎNTREPRINDEREA MUNICIPALĂ'):
  841. return text.split(suffix, 1)[1]
  842. elif suffix == 'ÎNTREPRINDEREA CU CAPITAL STRĂIN' and text.startswith('ÎNTREPRINDEREA CU CAPITAL STRĂIN'):
  843. return text.split(suffix, 1)[1]
  844. elif suffix == 'INSTITUŢIA MEDICO-SANITARĂ PUBLICĂ' and text.startswith(
  845. 'INSTITUŢIA MEDICO-SANITARĂ PUBLICĂ'):
  846. return text.split(suffix, 1)[1]
  847. elif suffix == 'REDACTIA GAZETEI' and text.startswith('REDACTIA GAZETEI'):
  848. return text.split(suffix, 1)[1]
  849. elif suffix == 'ORGANIZATIA DE ADMINISTRARE FIDUCIARA A INVESTITILOR' and text.startswith(
  850. 'ORGANIZATIA DE ADMINISTRARE FIDUCIARA A INVESTITILOR'):
  851. return text.split(suffix, 1)[1]
  852. elif suffix == 'S R L' and text.endswith('S R L'):
  853. return split_last(text, suffix)
  854. elif suffix == 'SOCIETATEA CU RESPONSABILITATE LIMITATA' and text.endswith(
  855. 'SOCIETATEA CU RESPONSABILITATE LIMITATA'):
  856. return split_last(text, suffix)
  857. elif suffix == 'SOCIETATE CU RĂSPUNDERE LIMITATĂ' and text.endswith('SOCIETATE CU RĂSPUNDERE LIMITATĂ'):
  858. return split_last(text, suffix)
  859. return text
  860. def moldova_truncate_at_suffix_second(text, suffix_list2):
  861. for suffix in suffix_list2:
  862. if suffix in text:
  863. if suffix == 'S R L' and text.endswith('S R L'):
  864. return split_last(text, suffix)
  865. elif suffix == 'SOCIETATEA CU RESPONSABILITATE LIMITATA' and text.endswith(
  866. 'SOCIETATEA CU RESPONSABILITATE LIMITATA'):
  867. return split_last(text, suffix)
  868. elif suffix == 'SOCIETATE CU RĂSPUNDERE LIMITATĂ' and text.endswith('SOCIETATE CU RĂSPUNDERE LIMITATĂ'):
  869. return split_last(text, suffix)
  870. return text
  871. def singapore_truncate_at_suffix(text, suffix_list):
  872. for suffix in suffix_list:
  873. if suffix in text:
  874. if suffix == 'SINGAPORE PTE LTD' and text.endswith('SINGAPORE PTE LTD'):
  875. return split_last(text, suffix)
  876. elif suffix == 'S PTE LTD' and text.endswith('S PTE LTD'):
  877. return split_last(text, suffix)
  878. elif suffix == 'PTE LTD' and text.endswith('PTE LTD'):
  879. return split_last(text, suffix)
  880. elif suffix == 'ENTERPRISES' and text.endswith('ENTERPRISES'):
  881. return split_last(text, suffix)
  882. elif suffix == 'ENTERPRISE' and text.endswith('ENTERPRISE'):
  883. return split_last(text, suffix)
  884. elif suffix == 'ENT' and text.endswith('ENT'):
  885. return split_last(text, suffix)
  886. elif suffix == 'AGENCIES' and text.endswith('AGENCIES'):
  887. return split_last(text, suffix)
  888. elif suffix == 'AGENCY' and text.endswith('AGENCY'):
  889. return split_last(text, suffix)
  890. elif suffix == 'PRIVATE LIMITED' and text.endswith('PRIVATE LIMITED'):
  891. return split_last(text, suffix)
  892. elif suffix == 'COMPANY' and text.endswith('COMPANY'):
  893. return split_last(text, suffix)
  894. elif suffix == 'LLP' and text.endswith('LLP'):
  895. return split_last(text, suffix)
  896. elif suffix == 'CO' and text.endswith('CO'):
  897. return split_last(text, suffix)
  898. return text
  899. def hongkong_truncate_at_suffix(text, suffix_list):
  900. for suffix in suffix_list:
  901. if suffix in text:
  902. if suffix == ' CO LIMITED' and text.endswith(' CO LIMITED'):
  903. return split_last(text, suffix)
  904. elif suffix == ' LIMITED' and text.endswith(' LIMITED'):
  905. return split_last(text, suffix)
  906. elif suffix == ' CO LTD' and text.endswith(' CO LTD'):
  907. return split_last(text, suffix)
  908. elif suffix == ' COMPANY' and text.endswith(' COMPANY'):
  909. return split_last(text, suffix)
  910. elif suffix == ' LTD' and text.endswith(' LTD'):
  911. return split_last(text, suffix)
  912. return text
  913. def china_truncate_at_suffix(text, suffix_list):
  914. for suffix in suffix_list:
  915. if suffix in text:
  916. if suffix == ' GROUP CORPORATION LIMITED' and text.endswith(' GROUP CORPORATION LIMITED'):
  917. return split_last(text, suffix)
  918. elif suffix == ' CORPORATION LIMITED' and text.endswith(' CORPORATION LIMITED'):
  919. return split_last(text, suffix)
  920. elif suffix == ' GROUP CORPORATION' and text.endswith(' GROUP CORPORATION'):
  921. return split_last(text, suffix)
  922. elif suffix == ' GROUP CO LIMITED' and text.endswith(' GROUP CO LIMITED'):
  923. return split_last(text, suffix)
  924. elif suffix == ' LIMITED COMPANY' and text.endswith(' LIMITED COMPANY'):
  925. return split_last(text, suffix)
  926. elif suffix == ' COMPANY LIMITED' and text.endswith(' COMPANY LIMITED'):
  927. return split_last(text, suffix)
  928. elif suffix == ' GROUP CO LTD' and text.endswith(' GROUP CO LTD'):
  929. return split_last(text, suffix)
  930. elif suffix == ' CORPORATION' and text.endswith(' CORPORATION'):
  931. return split_last(text, suffix)
  932. elif suffix == ' CO LIMITED' and text.endswith(' CO LIMITED'):
  933. return split_last(text, suffix)
  934. elif suffix == ' GROUP CORP' and text.endswith(' GROUP CORP'):
  935. return split_last(text, suffix)
  936. elif suffix == ' CORP LTD' and text.endswith(' CORP LTD'):
  937. return split_last(text, suffix)
  938. elif suffix == ' LIMITED' and text.endswith(' LIMITED'):
  939. return split_last(text, suffix)
  940. elif suffix == ' COMPANY' and text.endswith(' COMPANY'):
  941. return split_last(text, suffix)
  942. elif suffix == ' FACTORY' and text.endswith(' FACTORY'):
  943. return split_last(text, suffix)
  944. elif suffix == ' CO LTD' and text.endswith(' CO LTD'):
  945. return split_last(text, suffix)
  946. elif suffix == ' CO INC' and text.endswith(' CO INC'):
  947. return split_last(text, suffix)
  948. elif suffix == ' CORP' and text.endswith(' CORP'):
  949. return split_last(text, suffix)
  950. elif suffix == ' INC' and text.endswith(' INC'):
  951. return split_last(text, suffix)
  952. elif suffix == ' CO' and text.endswith(' CO'):
  953. return split_last(text, suffix)
  954. return text
  955. def vietnam_take_right_half(company_name: str):
  956. for separator in vietnam_right_separator_list:
  957. if separator in company_name:
  958. return company_name.split(separator, 1)[1].strip()
  959. return company_name.strip()
  960. def vietnam_take_left_half(company_name: str):
  961. for separator in vietnam_left_separator_list:
  962. if separator in company_name:
  963. return company_name.rsplit(separator, 1)[0].strip()
  964. return company_name.strip()
  965. def vietnam_truncate_at_suffix(company_name: str):
  966. for suffix in vietnam_suffix_list:
  967. if suffix in company_name and company_name.endswith(suffix):
  968. return company_name.rsplit(suffix, 1)[0].strip()
  969. return company_name.strip()
  970. def india_truncate_at_suffix(text, suffix_list):
  971. for suffix in suffix_list:
  972. if suffix in text:
  973. if (
  974. suffix != ' CO' and suffix != ' INC' and suffix != ' CO LIMITED' and suffix != ' LTD'
  975. and suffix != ' LIMITED' and suffix != ' CO I' and suffix != ' I'
  976. ):
  977. return split_last(text, suffix)
  978. elif suffix == ' CO' and text.endswith(' CO'):
  979. return split_last(text, suffix)
  980. elif suffix == ' INC' and text.endswith(' INC'):
  981. return split_last(text, suffix)
  982. elif suffix == ' CO LIMITED' and ' AND CO LIMITED' not in text:
  983. return split_last(text, suffix)
  984. elif suffix == ' LTD' and text.endswith(' LTD'):
  985. return split_last(text, suffix)
  986. elif suffix == ' LIMITED' and text.endswith(' LIMITED'):
  987. return split_last(text, suffix)
  988. elif suffix == ' CO I' and text.endswith(' CO I'):
  989. return split_last(text, suffix)
  990. elif suffix == ' I' and text.endswith(' I'):
  991. return split_last(text, suffix)
  992. return text
  993. def mexico_truncate_at_suffix(cleaned_name):
  994. for suffix in mexico_suffix_list:
  995. if suffix in cleaned_name and cleaned_name.endswith(suffix):
  996. return cleaned_name.rsplit(suffix, 1)[0].strip()
  997. return cleaned_name.strip()
  998. def nigeria_truncate_at_suffix(cleaned_name):
  999. for suffix in nigeria_suffix_list:
  1000. if cleaned_name.endswith(suffix):
  1001. return cleaned_name.rsplit(suffix, 1)[0].strip()
  1002. return cleaned_name.strip()
  1003. def peru_truncate_at_suffix(cleaned_name, peru_suffix_list):
  1004. for suffix in peru_suffix_list:
  1005. if cleaned_name.endswith(suffix):
  1006. return cleaned_name.rsplit(suffix, 1)[0].strip()
  1007. return cleaned_name.strip()
  1008. def lesotho_truncate_at_suffix(cleaned_name, lesotho_suffix_list):
  1009. for suffix in lesotho_suffix_list:
  1010. if cleaned_name.endswith(suffix):
  1011. return cleaned_name.rsplit(suffix, 1)[0].strip()
  1012. return cleaned_name.strip()
  1013. def germany_truncate_at_suffix(cleaned_name, germany_suffix_list):
  1014. for suffix in germany_suffix_list:
  1015. if cleaned_name.endswith(suffix):
  1016. return cleaned_name.rsplit(suffix, 1)[0].strip()
  1017. return cleaned_name.strip()
  1018. def america_company_abbr(company_name: str) -> str or None:
  1019. if company_name:
  1020. cleaned_name = clean_company_name(company_name)
  1021. truncated_first_name = america_truncate_at_suffix_first(cleaned_name, frist_america_suffix_list)
  1022. if len(truncated_first_name.strip()) < 8:
  1023. return cleaned_name
  1024. else:
  1025. return truncated_first_name
  1026. return None
  1027. def america_company_abbr_second(company_name: str) -> str or None:
  1028. if company_name:
  1029. cleaned_name = clean_company_name(company_name)
  1030. truncated_first_name = america_truncate_at_suffix_second(cleaned_name, SECOND_AMERICA_SUFFIX_LIST)
  1031. if len(truncated_first_name.strip()) < 5:
  1032. return cleaned_name
  1033. else:
  1034. return truncated_first_name.strip()
  1035. return None
  1036. def america_company_abbr_third(company_name: str) -> str or None:
  1037. if company_name:
  1038. cleaned_name = clean_company_name(company_name)
  1039. truncated_first_name = america_truncate_at_suffix_third(cleaned_name, third_AMERICA_SUFFIX_LIST)
  1040. if 9 < len(truncated_first_name.strip()) < 12:
  1041. return cleaned_name
  1042. elif len(truncated_first_name.strip()) <= 9:
  1043. return None
  1044. elif len(truncated_first_name.strip()) >= 12:
  1045. return truncated_first_name.strip()
  1046. return None
  1047. def bangladesh_company_abbr_first(company_name: str) -> str or None:
  1048. if company_name:
  1049. cleaned_name = clean_company_name(company_name)
  1050. truncated_first_name = bangladesh_truncate_at_suffix_first(cleaned_name, first_bangladesh_suffix_list)
  1051. return truncated_first_name.strip()
  1052. return None
  1053. def bangladesh_company_abbr_second(company_name: str) -> str or None:
  1054. if company_name:
  1055. cleaned_name = clean_company_name(company_name)
  1056. truncated_first_name = bangladesh_truncate_at_suffix_first(cleaned_name, first_bangladesh_suffix_list)
  1057. truncated_second_name = bangladesh_truncate_at_suffix_second(truncated_first_name.strip(),
  1058. second_bangladesh_suffix_list)
  1059. if len(truncated_second_name.strip()) < 6:
  1060. return truncated_first_name.strip()
  1061. else:
  1062. return truncated_second_name.strip()
  1063. return None
  1064. def chile_company_abbr(company_name: str) -> str or None:
  1065. if company_name:
  1066. cleaned_name = clean_company_name(company_name)
  1067. truncated_first_name = chile_truncate_at_suffix(cleaned_name, first_chile_SUFFIX_LIST)
  1068. if len(truncated_first_name.strip()) < 8:
  1069. return cleaned_name
  1070. else:
  1071. return truncated_first_name.strip()
  1072. return None
  1073. def rwanda_company_abbr(company_name: str) -> str or None:
  1074. if company_name:
  1075. cleaned_name = clean_company_name(company_name)
  1076. truncated_first_name = rwanda_truncate_at_suffix(cleaned_name, FIRST_Rwanda_suffix_list)
  1077. if len(truncated_first_name.strip()) < 6:
  1078. return cleaned_name
  1079. else:
  1080. return truncated_first_name.strip()
  1081. return None
  1082. def philippines_company_abbr(company_name: str) -> str or None:
  1083. if company_name:
  1084. cleaned_name = clean_company_name(company_name)
  1085. truncated_first_name = philippines_truncate_at_suffix(cleaned_name, FIRST_philippines_suffix_list)
  1086. if len(truncated_first_name.strip()) < 6:
  1087. return cleaned_name
  1088. else:
  1089. return truncated_first_name.strip()
  1090. return None
  1091. def colombia_company_abbr(company_name: str) -> str or None:
  1092. if company_name:
  1093. cleaned_name = clean_company_name(company_name)
  1094. truncated_first_name = colombia_truncate_at_suffix(cleaned_name, FIRST_colombia_suffix_list)
  1095. if len(truncated_first_name.strip()) < 6:
  1096. return cleaned_name
  1097. else:
  1098. return truncated_first_name.strip()
  1099. return None
  1100. def indonesia_company_abbr(company_name: str) -> str or None:
  1101. if company_name:
  1102. cleaned_name = clean_company_name(company_name)
  1103. truncated_name = indonesia_truncate_at_suffix(cleaned_name, indonesia_suffix_list)
  1104. if len(truncated_name.strip()) >= 8:
  1105. return truncated_name.strip()
  1106. else:
  1107. return cleaned_name
  1108. return None
  1109. def venezuela_company_abbr(company_name: str) -> str or None:
  1110. if company_name:
  1111. cleaned_name = clean_company_name(company_name)
  1112. truncated_name = venezuela_truncate_at_suffix(cleaned_name, venezuela_suffix_list)
  1113. if len(truncated_name.strip()) >= 6:
  1114. return truncated_name.strip()
  1115. else:
  1116. return cleaned_name
  1117. return None
  1118. def uzbekistan_company_abbr(company_name):
  1119. if company_name:
  1120. bak_name = company_name.upper()
  1121. company_name = kaz_extract_text_from_enclosers(bak_name)
  1122. company_name = clean_company_name(company_name)
  1123. for head in kaz_heads:
  1124. if company_name.startswith(head):
  1125. company_name = remove_prefix(company_name, head)
  1126. break
  1127. if len(company_name) < 8:
  1128. return clean_company_name(bak_name)
  1129. else:
  1130. return company_name.strip()
  1131. return None
  1132. def kazakhstan_company_abbr(company_name):
  1133. if company_name:
  1134. bak_name = company_name.upper()
  1135. company_name = kaz_extract_text_from_enclosers(bak_name)
  1136. company_name = clean_company_name(company_name)
  1137. for head in kaz_heads:
  1138. if company_name.startswith(head):
  1139. company_name = remove_prefix(company_name, head)
  1140. break
  1141. if len(company_name) < 8:
  1142. return clean_company_name(bak_name)
  1143. else:
  1144. return company_name.strip()
  1145. return None
  1146. def moldova_company_abbr(company_name: str) -> str or None:
  1147. if company_name:
  1148. cleaned_name = clean_company_name(company_name)
  1149. first_truncated_name = moldova_truncate_at_suffix(cleaned_name, moldova_suffix_list)
  1150. truncated_name = moldova_truncate_at_suffix_second(first_truncated_name, moldova_suffix_list2)
  1151. if len(truncated_name.strip()) >= 6:
  1152. return truncated_name.strip()
  1153. else:
  1154. return cleaned_name
  1155. return None
  1156. def singapore_company_abbr(company_name: str) -> str or None:
  1157. if company_name:
  1158. cleaned_name = clean_company_name(company_name)
  1159. truncated_name = singapore_truncate_at_suffix(cleaned_name, singapore_suffix_list)
  1160. if len(truncated_name.strip()) >= 8:
  1161. return truncated_name.strip()
  1162. else:
  1163. return cleaned_name
  1164. return None
  1165. def hongkong_company_abbr(company_name: str) -> str or None:
  1166. if company_name:
  1167. cleaned_name = clean_company_name(company_name)
  1168. truncated_name = hongkong_truncate_at_suffix(cleaned_name, hongkong_suffix_list)
  1169. if len(truncated_name.strip()) >= 6:
  1170. return truncated_name.strip()
  1171. else:
  1172. return cleaned_name
  1173. return None
  1174. def china_company_abbr(company_name: str) -> str or None:
  1175. if company_name:
  1176. cleaned_name = clean_company_name(company_name)
  1177. truncated_name = china_truncate_at_suffix(cleaned_name, china_suffix_list)
  1178. if len(truncated_name.strip()) >= 6:
  1179. return truncated_name.strip()
  1180. else:
  1181. return cleaned_name
  1182. return None
  1183. def vietnam_company_abbr(company_name: str) -> str or None:
  1184. if company_name:
  1185. cleaned_name = clean_company_name(company_name)
  1186. right_half = vietnam_take_right_half(cleaned_name)
  1187. left_half = vietnam_take_left_half(right_half)
  1188. truncated_name = vietnam_truncate_at_suffix(left_half)
  1189. if len(truncated_name) >= 8:
  1190. return truncated_name
  1191. else:
  1192. return cleaned_name
  1193. return None
  1194. def india_company_abbr(company_name):
  1195. if company_name:
  1196. bak_name = company_name.upper()
  1197. company_name = clean_company_name(bak_name)
  1198. for head in ind_head:
  1199. if company_name.startswith(head):
  1200. company_name = remove_prefix(company_name, head)
  1201. break
  1202. truncated_name = india_truncate_at_suffix(company_name, india_suffix_list)
  1203. if (len(truncated_name.strip()) < 8):
  1204. return clean_company_name(bak_name)
  1205. else:
  1206. return truncated_name.strip()
  1207. return None
  1208. def ukraine_truncate_at_suffix_first(text, suffix_list):
  1209. for suffix in suffix_list:
  1210. if suffix in text:
  1211. return split_last(text, suffix)
  1212. return text
  1213. def ukraine_truncate_at_suffix_second(text, suffix_list):
  1214. for suffix in suffix_list:
  1215. if suffix in text:
  1216. return split_last(text, suffix) + suffix
  1217. return text
  1218. def ukraine_company_abbr_first(company_name):
  1219. if company_name:
  1220. bak_name = company_name.upper()
  1221. truncated_name = ukraine_truncate_at_suffix_first(bak_name, ukraine_suffix_first)
  1222. return truncated_name.strip()
  1223. return None
  1224. def ukraine_company_abbr_second(company_name):
  1225. if company_name:
  1226. bak_name = company_name.upper()
  1227. truncated_name = ukraine_truncate_at_suffix_second(bak_name, ukraine_suffix_second)
  1228. return truncated_name.strip()
  1229. return None
  1230. def mexico_company_abbr(company_name):
  1231. if company_name:
  1232. cleaned_name = clean_company_name(company_name)
  1233. truncated_name = mexico_truncate_at_suffix(cleaned_name)
  1234. if len(truncated_name) >= 8:
  1235. return truncated_name
  1236. else:
  1237. return cleaned_name
  1238. return None
  1239. def nigeria_company_abbr(company_name):
  1240. if company_name:
  1241. cleaned_name = clean_company_name(company_name)
  1242. truncated_name = nigeria_truncate_at_suffix(cleaned_name)
  1243. if len(truncated_name) >= 4:
  1244. return truncated_name
  1245. else:
  1246. return cleaned_name
  1247. return None
  1248. def philippines_company_abbr_second(company_name):
  1249. if company_name:
  1250. cleaned_name = clean_company_name(company_name)
  1251. truncated_name = philippines_truncate_at_suffix(cleaned_name, FIRST_philippines_suffix_list)
  1252. if len(truncated_name) >= 6:
  1253. return truncated_name.strip()
  1254. else:
  1255. return cleaned_name
  1256. return None
  1257. def england_company_abbr(company_name):
  1258. if company_name:
  1259. cleaned_name = clean_company_name(company_name)
  1260. truncated_name = england_truncate_at_suffix(cleaned_name, FIRST_england_suffix_list)
  1261. if len(truncated_name) >= 8:
  1262. return truncated_name.strip()
  1263. else:
  1264. return cleaned_name
  1265. return None
  1266. def peru_company_abbr(company_name):
  1267. if company_name:
  1268. cleaned_name = clean_company_name(company_name)
  1269. truncated_name = peru_truncate_at_suffix(cleaned_name, peru_suffix_list)
  1270. if len(truncated_name) >= 6:
  1271. return truncated_name.strip()
  1272. else:
  1273. return cleaned_name
  1274. return None
  1275. def lesotho_company_abbr(company_name):
  1276. if company_name:
  1277. cleaned_name = clean_company_name(company_name)
  1278. truncated_name = lesotho_truncate_at_suffix(cleaned_name, lesotho_suffix_list)
  1279. if len(truncated_name) >= 6:
  1280. return truncated_name.strip()
  1281. else:
  1282. return cleaned_name
  1283. return None
  1284. def germany_company_abbr(company_name):
  1285. if company_name:
  1286. cleaned_name = clean_company_name(company_name)
  1287. truncated_name = germany_truncate_at_suffix(cleaned_name, germany_suffix_list)
  1288. if len(truncated_name) >= 8:
  1289. return truncated_name.strip()
  1290. else:
  1291. return cleaned_name
  1292. return None
  1293. def company_abbr(country_name: str, company_name: str) -> str or None:
  1294. if country_name == 'pakistan':
  1295. return pakistan_company_abbr(company_name)
  1296. if country_name == 'mirror_pakistan':
  1297. return mirror_pakistan_company_abbr(company_name)
  1298. elif country_name == 'america':
  1299. return america_company_abbr(company_name)
  1300. elif country_name == 'indonesia':
  1301. return indonesia_company_abbr(company_name)
  1302. elif country_name == 'venezuela':
  1303. return venezuela_company_abbr(company_name)
  1304. elif country_name == 'america_second':
  1305. return america_company_abbr_second(company_name)
  1306. elif country_name == 'uzbekistan':
  1307. return uzbekistan_company_abbr(company_name)
  1308. elif country_name == 'kazakhstan':
  1309. return kazakhstan_company_abbr(company_name)
  1310. elif country_name == 'chile':
  1311. return chile_company_abbr(company_name)
  1312. elif country_name == 'moldova':
  1313. return moldova_company_abbr(company_name)
  1314. elif country_name == 'bangladesh_fist':
  1315. return bangladesh_company_abbr_first(company_name)
  1316. elif country_name == 'bangladesh_second':
  1317. return bangladesh_company_abbr_second(company_name)
  1318. elif country_name == 'rwanda':
  1319. return rwanda_company_abbr(company_name)
  1320. elif country_name == 'singapore':
  1321. return singapore_company_abbr(company_name)
  1322. elif country_name == 'hongkong':
  1323. return hongkong_company_abbr(company_name)
  1324. elif country_name == 'philippines':
  1325. return philippines_company_abbr(company_name)
  1326. elif country_name == 'china':
  1327. return china_company_abbr(company_name)
  1328. elif country_name == 'vietnam':
  1329. return vietnam_company_abbr(company_name)
  1330. elif country_name == 'india':
  1331. return india_company_abbr(company_name)
  1332. elif country_name == 'ukraine_first':
  1333. return ukraine_company_abbr_first(company_name)
  1334. elif country_name == 'ukraine_second':
  1335. return ukraine_company_abbr_second(company_name)
  1336. elif country_name == 'america_third':
  1337. return america_company_abbr_third(company_name)
  1338. elif country_name == 'mexico':
  1339. return mexico_company_abbr(company_name)
  1340. elif country_name == 'colombia':
  1341. return colombia_company_abbr(company_name)
  1342. elif country_name == 'nigeria':
  1343. return nigeria_company_abbr(company_name)
  1344. elif country_name == 'philippines_second':
  1345. return philippines_company_abbr_second(company_name)
  1346. elif country_name == 'peru':
  1347. return peru_company_abbr(company_name)
  1348. elif country_name == 'lesotho':
  1349. return lesotho_company_abbr(company_name)
  1350. elif country_name == 'germany':
  1351. return germany_company_abbr(company_name)
  1352. elif country_name == 'england':
  1353. return england_company_abbr(company_name)
  1354. else:
  1355. return company_name
  1356. if __name__ == '__main__':
  1357. test_cases = [
  1358. 'Wilhelm Manz GmbH & Co. KG',
  1359. 'Wilhelm Zuleeg GmbH',
  1360. 'Aba Air Group Llc',
  1361. 'CAMUSAT (MAURICE) LIMITED',
  1362. 'BMTS Technology Austria GmbH & Co',
  1363. 'Arhetipo Grup SRL',
  1364. 'Boegli-Gravures SA',
  1365. 'Kronos International Inc.',
  1366. 'YAHO AUTO EXCHANGE CO. LIMITED',
  1367. 'Radpar Otomotiv Sanayi ve Ticaret Limited Şti.',
  1368. 'SERVICIOS INTERSEC S.A. DE C.V.',
  1369. 'PLASTIC SOLUTIONS DI MARTOCCIA CRISTIANS.A.S.',
  1370. 'C-Solution Elektrotechnik GbR',
  1371. 'Baumer Hhs S.R.L.',
  1372. 'AJH Druck & Technik Helge Klemt e.K.',
  1373. 'ADM Hamburg AG',
  1374. 'Lauer Ventilation UG',
  1375. 'Bankhaus J. Faisst OHG',
  1376. 'Continental Teves AG & Co.OHG',
  1377. 'Dow Produktions und Vertriebs GmbH & Co. OHG',
  1378. 'Springer Nature AG & Co. KGaA',
  1379. 'Paragon GmbH & Co. KGaA'
  1380. ]
  1381. for test_case in test_cases:
  1382. print("{:<50} {:>50}".format(test_case, company_abbr('germany', test_case)))
  1383. # test_cases = [
  1384. # 'COMPANY LIMITED NGOC PHAT TM',
  1385. # 'COMPANY LTD PHAM',
  1386. # 'TAIHING MOULDS CO LTD',
  1387. # 'REPRESENTATIVE OFFICE OF HETTICH SINGAPORE SEA PTE LTD IN HO CHI MINH CITY',
  1388. # 'SAI GON WASTE SOLUTION JOINT STOCK COMPANY',
  1389. # 'ENTERTAINMENT FISHING ROD IMPORT EXPORT TRADING COMPANY LIMI',
  1390. # 'TPP PLUS CORP',
  1391. # 'VILOMIX VIETNAM LLC',
  1392. # 'SMILETECH JSC',
  1393. # 'DUC MINH CTI CO JSC',
  1394. # 'SMILETECH JSC',
  1395. # 'HUVICO LTD'
  1396. # ]
  1397. # for test in test_cases:
  1398. # print(vietnam_company_abbr(test))