| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990 |
- #!/usr/bin/env /usr/bin/python3
- # -*- coding:utf-8 -*-
# Uniqueness adjustment for the company library; UDFs for offline data.
import hashlib
import re
from datetime import datetime
from typing import Optional

from dw_base.spark.udf.customs.common_clean import clean_company_name
# Turkish-specific letters mapped to the plain Latin letter that replaces them.
turkey_replace_dict = {
    'ç': 'c', 'Ç': 'C',
    'ğ': 'g', 'Ğ': 'G',
    'ı': 'i', 'İ': 'I',
    'ö': 'o', 'Ö': 'O',
    'ş': 's', 'Ş': 'S',
    'ü': 'u', 'Ü': 'U',
}

# Built once at import time so that calls do not rebuild the table.
# NOTE: later mutations of ``turkey_replace_dict`` are not reflected here.
_TURKEY_TRANSLATION_TABLE = str.maketrans(turkey_replace_dict)


def replace_str_english(text: Optional[str]) -> Optional[str]:
    """Replace Turkish-specific letters in *text* with plain Latin letters.

    Only the characters listed in ``turkey_replace_dict`` are replaced;
    every other character is left untouched.

    Args:
        text: The string to transliterate; may be ``None``.

    Returns:
        The transliterated string, or ``None`` when *text* is ``None`` or
        empty.
    """
    if not text:
        return None
    return text.translate(_TURKEY_TRANSLATION_TABLE)
def generate_md5_hash(input_str: str):
    """Return the hexadecimal MD5 digest of *input_str* encoded as UTF-8."""
    return hashlib.md5(input_str.encode('utf-8')).hexdigest()
def generate_tid_tur(company_name: Optional[str],
                     business_number: Optional[str]) -> Optional[str]:
    """Build the ``TUR``-prefixed identifier for a company.

    The business number is used as the hash seed when it is present;
    otherwise the company name is used. A different marker is appended to
    each kind of seed before hashing.

    Args:
        company_name: Company name; when empty or ``None`` no identifier is
            produced, even if a business number is given.
        business_number: Business number; may be empty or ``None``.

    Returns:
        ``'TUR'`` followed by the result of ``generate_md5_hash`` on the
        seed, or ``None`` when *company_name* is empty or ``None``.
    """
    if not company_name:
        return None
    # The markers are part of the hashed input: changing them would change
    # every identifier this function generates.
    if business_number:
        seed = business_number + 'AAA'
    else:
        seed = company_name + 'BBB'
    return 'TUR' + generate_md5_hash(seed)
# Legal-form suffixes removed from the end of a company name. Order matters:
# the first match wins, so a longer suffix must come before any shorter
# suffix it ends with (e.g. " T A S" before " A S").
_TUR_COMPANY_SUFFIXES = (
    "Tahmini Anonim Sirket",
    "Anonim Sirket",
    "Halka Acık Sirket",
    "Komandit Sirket",
    "Limited Sirket",
    "Baslangıc Sirket",
    "Tahmini Anonim Sirketi",
    "Anonim Sirketi",
    "Halka Acık Sirketi",
    "Komandit Sirketi",
    "Limited Sirketi",
    "Baslangıc Sirketi",
    " LTD Sti",
    " T A S",
    " A S",
    " H S",
    " K S",
    " B S",
)

# Upper-cased once at import time; names are matched against this form.
_TUR_COMPANY_SUFFIXES_UPPER = tuple(
    suffix.upper() for suffix in _TUR_COMPANY_SUFFIXES
)


def clean_company_name_extra(s: Optional[str]) -> Optional[str]:
    """Strip one trailing Turkish legal-form suffix from a company name.

    Matching is case-sensitive against the upper-cased suffixes, so only
    names whose suffix is written in upper case are affected. At most one
    suffix is removed; surrounding whitespace is then stripped.

    Args:
        s: Company name; may be empty or ``None``.

    Returns:
        The name without its trailing suffix and without leading/trailing
        whitespace. Empty or ``None`` input is returned unchanged.
    """
    if not s:
        return s
    for suffix in _TUR_COMPANY_SUFFIXES_UPPER:
        if s.endswith(suffix):
            # Slice by the length of the text that actually matched.
            s = s[:-len(suffix)]
            break
    return s.strip()
def clean_company_name_tur(company_name: Optional[str]) -> Optional[str]:
    """Normalise a Turkish company name.

    Steps, in order: replace Turkish letters via ``replace_str_english``,
    run the imported ``clean_company_name`` cleaner, and, if that leaves a
    non-empty name, strip a trailing legal-form suffix via
    ``clean_company_name_extra``.

    Args:
        company_name: Raw company name; may be empty or ``None``.

    Returns:
        The cleaned name, or ``None`` when *company_name* is empty or
        ``None``. A falsy result from ``clean_company_name`` is returned
        as-is.
    """
    if not company_name:
        return None
    ascii_name = replace_str_english(company_name)
    name = clean_company_name(ascii_name)
    if name:
        name = clean_company_name_extra(name)
    return name
if __name__ == '__main__':
    # Manual smoke check: print the cleaned form of one sample company name.
    sample_name = 'KBR İNŞAAT METAL VE ELEKTRİK SANAYİ TİCARET LİMİTED ŞİRKETİ'
    print(clean_company_name_tur(sample_name))
|