| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277 |
import re
from typing import Optional
# Punctuation and symbols that carry no meaning inside a company name.  Every
# character listed here is translated to a single space by
# ``special_chars_trans``; '&' is handled separately and becomes ' and '.

# All ASCII punctuation except '&'.
_ASCII_SPECIAL_CHARS = '.,-()@?`#+!$|:/;*<>%^_[]{}\\~=\'"'

_EXTRA_SPECIAL_CHARS = (
    '‘’“”《》'          # typographic quotes and CJK angle brackets
    '±°«»µ¶·€£¥¢×÷¬…'   # Latin-1 style symbols and currency signs
    '→←↑↓↔⇒⇐≈≠≤≥'       # arrows and comparison operators
    '¨´¿‰¯®¼©'
    '\x1a'               # ASCII SUB control character
)

# Fullwidth forms (U+FF01-U+FF5E) sit exactly 0xFEE0 above their ASCII twins.
_FULLWIDTH_OFFSET = 0xFEE0

# NOTE(review): the previous list repeated the whole ASCII punctuation run a
# second time, contained an unescaped backslash literal (a syntax error) and
# an empty-string entry (rejected by str.maketrans, whose string keys must
# have length 1).  The repeated run looks like fullwidth punctuation that was
# folded to ASCII somewhere in transit, so the fullwidth variants are mapped
# explicitly here -- confirm against the authoritative copy of this file.
# The empty entry was presumably an invisible character; it is dropped.
special_chars = (
    list(_ASCII_SPECIAL_CHARS)
    + list(_EXTRA_SPECIAL_CHARS)
    + [chr(ord(c) + _FULLWIDTH_OFFSET) for c in _ASCII_SPECIAL_CHARS]
)

special_char_dict = {c: ' ' for c in special_chars}
# Ampersand (ASCII and fullwidth) is spelled out instead of being blanked.
special_char_dict['&'] = ' and '
special_char_dict[chr(ord('&') + _FULLWIDTH_OFFSET)] = ' and '

# Translation table consumed by str.translate().
special_chars_trans = str.maketrans(special_char_dict)
# Leading tokens removed from the front of a cleaned name (at most one is
# removed; the first entry that matches wins).
ind_head = ['THE ', 'M S', 'MS']

# Legal-form suffixes used to truncate a cleaned, upper-cased name.  Order is
# significant: india_truncate_at_suffix() walks the list front to back and
# stops at the first suffix that applies, so longer / more specific forms are
# listed before the shorter ones they contain.  Entries ending in ' L' match
# as substrings, so they cover both '... LTD' and '... LIMITED'.
# NOTE(review): the bare ' I' presumably stands for "(I)" = India -- confirm.
india_suffix_list = [
    ' PRIVATELIMITED', ' LLP',
    # "CO ..." compound forms
    ' CO I PVT L', ' CO PVT L', ' CO PRIVATE L', ' CO I LTD',
    # "I ..." compound forms
    ' I LTD', ' I LIMITED', ' I PVT L', ' I PRIVATE L',
    # "COMPANY ..." compound forms
    ' COMPANY PRIVATE L', ' COMPANY PVT L',
    # private-limited forms
    ' P LTD', ' PRIVATE L', ' PVT L',
    ' CO LTD',
    # short forms that get special handling in india_truncate_at_suffix()
    ' CO', ' INC', ' CO LIMITED', ' LTD', ' LIMITED', ' CO I', ' I',
]
def clean_company_name(name):
    """Normalise a raw company name.

    Characters listed in ``special_chars_trans`` are translated first, then
    the text is upper-cased and its whitespace is collapsed: runs of
    whitespace become one space and leading/trailing whitespace is removed.

    Returns ``None`` when *name* is falsy (``None`` or an empty string).
    """
    if not name:
        return None
    # Replace special characters before any other processing.
    translated = name.translate(special_chars_trans)
    # split() + join() collapses repeated whitespace and trims both ends.
    words = translated.upper().split()
    return ' '.join(words)
def split_last(text, suffix):
    """Return *text* cut off just before the last occurrence of *suffix*.

    If *suffix* does not occur, *text* is returned unchanged.  A falsy *text*
    (``None`` or an empty string) yields ``None``.
    """
    if not text:
        return None
    cut_at = text.rfind(suffix)
    return text if cut_at == -1 else text[:cut_at]
def india_truncate_at_suffix(text, suffix_list):
    """Cut *text* at the first applicable suffix from *suffix_list*.

    Suffixes are tried in list order.  A suffix that does not occur in *text*
    is skipped.  Otherwise it applies when:

    * it is one of ' CO', ' INC', ' LTD', ' LIMITED', ' CO I', ' I' and
      *text* ends with it;
    * it is ' CO LIMITED' and *text* does not contain ' AND CO LIMITED';
    * it is any other suffix (these apply wherever they occur).

    On the first applicable suffix the text before its last occurrence is
    returned (via ``split_last``).  If nothing applies, *text* is returned
    unchanged.
    """
    # Short suffixes that only count at the very end of the name.
    end_anchored = (' CO', ' INC', ' LTD', ' LIMITED', ' CO I', ' I')
    for candidate in suffix_list:
        if candidate not in text:
            continue
        if candidate in end_anchored:
            applies = text.endswith(candidate)
        elif candidate == ' CO LIMITED':
            applies = ' AND CO LIMITED' not in text
        else:
            applies = True
        if applies:
            return split_last(text, candidate)
    return text
def remove_prefix(text, prefix):
    """Return *text* without a leading *prefix*; unchanged if it is absent."""
    return text[len(prefix):] if text.startswith(prefix) else text
def india_company_abbr(company_name):
    """Return a shortened form of an Indian company name.

    The name is cleaned with ``clean_company_name``, at most one leading head
    from ``ind_head`` is removed, and the result is truncated at the first
    applicable legal-form suffix from ``india_suffix_list``.

    If fewer than 8 characters remain after truncation, the fully cleaned
    name (head and suffix included) is returned instead -- presumably because
    such a short remainder is not distinctive enough.

    Returns ``None`` when *company_name* is falsy.
    """
    if not company_name:
        return None

    # Cleaned once and reused for the short-result fallback below.
    cleaned_full = clean_company_name(company_name.upper())

    candidate = cleaned_full
    for head in ind_head:
        # Strip a head only when it is a whole leading token.  A bare
        # startswith('MS') / startswith('M S') would also eat the first
        # letters of names such as "MSP STEEL" or "M SQUARE".
        leading_token = head.strip() + ' '
        if candidate.startswith(leading_token):
            candidate = remove_prefix(candidate, leading_token)
            break

    truncated = india_truncate_at_suffix(candidate, india_suffix_list).strip()
    if len(truncated) < 8:
        return cleaned_full
    return truncated
def company_abbr(country_name: str, company_name: Optional[str]) -> Optional[str]:
    """Return the abbreviated company name for the given country.

    Only ``'india'`` (exact, lower-case match) is handled; it is delegated to
    ``india_company_abbr``.  Any other *country_name* yields ``None``.
    """
    if country_name == 'india':
        return india_company_abbr(company_name)
    return None
def remove_dots_from_abbr(text):
    """Remove the dots from the first dotted abbreviation found in *text*.

    The first run of one or more "capital letter + dot" pairs that is
    followed by a space (e.g. ``'K.N.'`` in ``'K.N. TEXFAB'``) is located.
    Every occurrence of that exact run in *text* is then replaced by the same
    letters without dots.  If no such run exists, *text* is returned
    unchanged.
    """
    found = re.search(r'(([A-Z]\.)+) .*', text)
    if found is None:
        # Nothing that looks like a dotted abbreviation.
        return text
    dotted = found.group(1)
    return text.replace(dotted, dotted.replace('.', ''))
if __name__ == '__main__':
    # Example usage: print each sample name next to its abbreviation.
    samples = [
        'X.X. XXXXXX',
        'A.A.A. some text B.B.B.B. more text',
        'X.X.X. XXXXXX',
        'K.N. TEXFAB',
        'AAKASH OIL FIELD SERVICES PVT.LTD.',
        'PARVEEN TRADING CO.',
        'KONNET SOLUTIONS PVT. LTD.',
        'DAINICHI COLOR INDIA PVT.LTD.',
        'NOVA IRON & STEEL LTD.',
        'RPA COPPER DISTRIBUTORS PVT.LTD.',
        'DURA AUTO SYSTEMS INDIA PV.LTD.',
        'SPG CORPORATION PVT.LTD.',
        'MESSRS.K. KRISHNAMURTHY BOOKS & PERIODICALS',
        'MALHAR FASHIONS (INDIA) PVT. LTD.',
        'ELITE BREADS PVT. LTD',
        'MINILEC INDIA PVT.LTD.',
        'CALISTA PROPERTIES PVT.LTD.',
        'PRADIP ENTERPRISES LTD.',
        'ESTEE AUTO PRESSINGS PRIVATE LTD.',
        'DR.(MS)BUNTY M.JAVA',
        'INDUSTRADE(PROP.PHADKE SANJAY ARAVIND)',
        'LEDER FX.',
        'PINNACLE TELE SERVICES PVT. LTD.',
        'HARIBHARAT EQUIPMENTS PVT.LTD.',
        'CECáINTERNATIONALáCORPORATIONá(I)áPVT.áLTD.',
        'BRUNOS COMPUTER SOLUTIONS & SOFTWARE PVT. LTD.',
        'DREAMS ENTERPRISES.',
        'SKR FOODS PVT. LTD.',
    ]
    for raw_name in samples:
        abbreviated = company_abbr('india', raw_name)
        print(raw_name + " ===> " + abbreviated)
|