#!/usr/bin/env /usr/bin/python3
# -*- coding:utf-8 -*-
"""Company-library uniqueness adjustment: UDF helpers for offline data.

Provides helpers for Turkish company records: transliterating
Turkish-specific letters, stripping legal-form suffixes from a company name,
and deriving a ``TUR``-prefixed MD5 identifier.
"""
import hashlib
import re
from datetime import datetime
from typing import Optional

from dw_base.spark.udf.customs.common_clean import clean_company_name

# Turkish-specific letters mapped to the ASCII letters that replace them.
turkey_replace_dict = {
    'ç': 'c', 'Ç': 'C',
    'ğ': 'g', 'Ğ': 'G',
    'ı': 'i', 'İ': 'I',
    'ö': 'o', 'Ö': 'O',
    'ş': 's', 'Ş': 'S',
    'ü': 'u', 'Ü': 'U'
}

# Translation table built once at import time instead of on every call.
_TURKEY_TRANSLATION_TABLE = str.maketrans(turkey_replace_dict)

# Legal-form suffixes removed from the end of a company name.
# Order matters: only the first matching entry is stripped.
_COMPANY_SUFFIXES = (
    "Tahmini Anonim Sirket", "Anonim Sirket", "Halka Acık Sirket",
    "Komandit Sirket", "Limited Sirket", "Baslangıc Sirket",
    "Tahmini Anonim Sirketi", "Anonim Sirketi", "Halka Acık Sirketi",
    "Komandit Sirketi", "Limited Sirketi", "Baslangıc Sirketi",
    " LTD Sti", " T A S", " A S", " H S", " K S", " B S",
)

# Upper-cased once; names are compared against the upper-case form.
_COMPANY_SUFFIXES_UPPER = tuple(suffix.upper() for suffix in _COMPANY_SUFFIXES)


def replace_str_english(text: Optional[str]) -> Optional[str]:
    """Replace Turkish-specific letters in *text* using ``turkey_replace_dict``.

    Returns:
        The transliterated string, or ``None`` when *text* is empty or ``None``.
    """
    if not text:
        return None
    return text.translate(_TURKEY_TRANSLATION_TABLE)


def generate_md5_hash(input_str: str) -> str:
    """Return the hexadecimal MD5 digest of *input_str* encoded as UTF-8."""
    return hashlib.md5(input_str.encode('utf-8')).hexdigest()


def generate_tid_tur(company_name: Optional[str],
                     business_number: Optional[str]) -> Optional[str]:
    """Build a ``TUR``-prefixed identifier for a company record.

    The identifier is ``'TUR'`` followed by the MD5 hex digest of
    ``business_number + 'AAA'`` when a business number is given, otherwise of
    ``company_name + 'BBB'``.

    Returns:
        The identifier, or ``None`` when *company_name* is empty or ``None``.
    """
    if not company_name:
        return None
    if business_number:
        input_str = business_number + 'AAA'
    else:
        input_str = company_name + 'BBB'
    return 'TUR' + generate_md5_hash(input_str)


def clean_company_name_extra(s: Optional[str]) -> Optional[str]:
    """Strip one trailing legal-form suffix from *s* and trim whitespace.

    Suffixes are compared in upper case against *s* as given, so *s* is
    expected to be upper-cased already. Only the first matching suffix (in
    list order) is removed.

    Returns:
        The cleaned name; a falsy *s* (``None`` or ``''``) is returned unchanged.
    """
    if not s:
        return s
    # Strip the suffix.
    for suffix in _COMPANY_SUFFIXES_UPPER:
        if s.endswith(suffix):
            s = s[:-len(suffix)]
            break
    # Strip leading and trailing whitespace.
    return s.strip()


def clean_company_name_tur(company_name: Optional[str]) -> Optional[str]:
    """Clean a Turkish company name.

    Transliterates Turkish letters, applies the shared ``clean_company_name``
    helper (defined in ``common_clean``; its exact normalisation is not
    visible here), and then removes a trailing legal-form suffix.

    Returns:
        The cleaned name. ``None`` when *company_name* is empty or ``None``;
        a falsy result from ``clean_company_name`` is returned unchanged.
    """
    if not company_name:
        return None
    company_name = replace_str_english(company_name)
    name = clean_company_name(company_name)
    if name:
        name = clean_company_name_extra(name)
    return name


if __name__ == '__main__':
    name = 'KBR İNŞAAT METAL VE ELEKTRİK SANAYİ TİCARET LİMİTED ŞİRKETİ'
    print(clean_company_name_tur(name))