"""Match raw company names against a Solr collection through a PySpark UDF.

The module exposes:

* ``edismax_call`` -- issue an eDisMax ``select`` request against a Solr
  collection and return the raw HTTP response.
* ``get_china_company_name_match`` -- a PySpark UDF that looks a raw name up
  in the ``ent_china_biz_basic`` collection and returns a
  ``(is_finded, basic_arr)`` struct.
"""
import json
import logging
from typing import List, Optional, Tuple

import requests
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType, ArrayType, StructType, StructField, BooleanType

logger = logging.getLogger(__name__)

# Base URL of the Solr instance queried by ``edismax_call``.
SOLR_BASE_URL = 'http://m2.node.dev:8886/solr'

# Seconds to wait for Solr before giving up. Without a timeout ``requests``
# waits forever, which would hang the calling Spark task on a dead node.
DEFAULT_TIMEOUT_SECONDS = 10.0

# Document fields copied, in this order, into the ``basic_arr`` result array.
_MATCH_FIELDS = ('ent_name_chn', 'ent_name_en', 'ent_name_en_abb', 'unc_id')

# Return schema of the UDF: a found flag plus the matched document's fields.
_MATCH_SCHEMA = StructType([
    StructField("is_finded", BooleanType(), False),
    StructField("basic_arr", ArrayType(StringType()), True),
])


def edismax_call(collection: str, q_alt: str, q: str, qf: str, mm: str = '70%',
                 rows: int = 1, stopwords: str = 'true', tie: float = 0.2,
                 wt: str = 'json',
                 timeout: float = DEFAULT_TIMEOUT_SECONDS) -> requests.Response:
    """Send an eDisMax ``select`` query to a Solr collection.

    Args:
        collection: Name of the Solr collection to query.
        q_alt: Value sent as the ``q.alt`` parameter.
        q: Value sent as the ``q`` parameter.
        qf: Value sent as the ``qf`` parameter.
        mm: Value sent as the ``mm`` parameter.
        rows: Value sent as the ``rows`` parameter.
        stopwords: Value sent as the ``stopwords`` parameter.
        tie: Value sent as the ``tie`` parameter.
        wt: Value sent as the ``wt`` parameter (response format).
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.

    Returns:
        The ``requests.Response`` as received; the status code is not checked.

    Raises:
        requests.RequestException: On connection failure or timeout.
    """
    params = {
        "defType": 'edismax',
        "mm": mm,
        "q.alt": q_alt,
        "q": q,
        'qf': qf,
        'rows': rows,
        'stopwords': stopwords,
        'tie': tie,
        'wt': wt,
    }
    return requests.get(f'{SOLR_BASE_URL}/{collection}/select',
                        params=params, timeout=timeout)


def _lookup_china_company_name(raw_name: Optional[str], mm: str = '70%',
                               rows: int = 1) -> Tuple[bool, Optional[List[Optional[str]]]]:
    """Look up ``raw_name`` in ``ent_china_biz_basic`` and return the top hit.

    This is the plain-Python implementation behind the
    ``get_china_company_name_match`` UDF, kept separate so it can also be
    called directly without a Spark session.

    Args:
        raw_name: Name to search for; sent as both ``q`` and ``q.alt``.
        mm: Forwarded to ``edismax_call``.
        rows: Forwarded to ``edismax_call``.

    Returns:
        ``(True, [ent_name_chn, ent_name_en, ent_name_en_abb, unc_id])`` for
        the first returned document, with ``None`` in place of any field the
        document lacks. ``(False, None)`` when the name is null/blank, the
        request fails, the status is not 200, the body cannot be parsed, or
        no document is returned.
    """
    # Null column values reach the UDF as None; there is nothing to search for.
    if raw_name is None or not raw_name.strip():
        return False, None

    try:
        solr_resp = edismax_call('ent_china_biz_basic', raw_name, raw_name,
                                 'ent_name_en_abb^1.0', mm, rows)
    except requests.RequestException:
        # Treated like a non-200 answer so one bad request does not abort
        # the whole job; the warning keeps the failure visible.
        logger.warning("Solr lookup failed for %r", raw_name, exc_info=True)
        return False, None

    if solr_resp.status_code != 200:
        return False, None

    try:
        resp = json.loads(solr_resp.text)['response']
    except (ValueError, KeyError, TypeError):
        logger.warning("Unexpected Solr response body for %r", raw_name)
        return False, None

    # ``numFound`` can be positive while ``docs`` is empty (e.g. rows=0),
    # so check the list itself before indexing into it.
    docs = resp.get('docs') or []
    if resp.get('numFound', 0) == 0 or not docs:
        return False, None

    best_doc = docs[0]
    # ``.get`` yields None for a field missing from the document instead of
    # raising KeyError.
    return True, [best_doc.get(field) for field in _MATCH_FIELDS]


@udf(returnType=_MATCH_SCHEMA)
def get_china_company_name_match(raw_name: str, mm: str = '70%', rows: int = 1):
    """PySpark UDF wrapper around ``_lookup_china_company_name``.

    Returns a struct with ``is_finded`` (bool) and ``basic_arr`` (array of
    strings, or null when nothing was found).
    """
    return _lookup_china_company_name(raw_name, mm, rows)


if __name__ == '__main__':
    # Call the plain helper: invoking the UDF object with a Python string
    # would build a Spark Column expression instead of running the lookup.
    print(_lookup_china_company_name('SAMSUNG ELECTRONICS CO. LTD.,. '))