| 12345678910111213141516171819202122232425262728293031323334 |
- import json
- import requests
- from pyspark.sql.functions import udf
- from pyspark.sql.types import StringType, ArrayType, StructType, StructField, BooleanType
def edismax_call(collection: str, q_alt: str, q: str, qf: str, mm: str = '70%', rows: int = 1, stopwords: str = 'true',
                 tie: float = 0.2, wt: str = 'json', timeout: float = 10.0,
                 base_url: str = 'http://m2.node.dev:8886/solr') -> 'requests.Response':
    """Send an eDisMax ``select`` query to a Solr collection and return the raw response.

    The request is a GET to ``{base_url}/{collection}/select`` with ``defType``
    fixed to ``edismax``. No status checking or JSON decoding is done here;
    callers inspect the returned response themselves.

    Args:
        collection: Name of the Solr collection, used as a URL path segment.
        q_alt: Value for the ``q.alt`` request parameter.
        q: Value for the ``q`` request parameter.
        qf: Value for the ``qf`` request parameter (e.g. ``'field^1.0'``).
        mm: Value for the ``mm`` request parameter.
        rows: Value for the ``rows`` request parameter.
        stopwords: Value for the ``stopwords`` request parameter.
        tie: Value for the ``tie`` request parameter.
        wt: Value for the ``wt`` request parameter (response format).
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout a stalled server would block the caller forever.
        base_url: Root URL of the Solr instance, without the collection name.

    Returns:
        The ``requests.Response`` object from the GET call.

    Raises:
        requests.exceptions.RequestException: On connection failures or when
            the timeout expires.
    """
    params = {
        'defType': 'edismax',
        'mm': mm,
        'q.alt': q_alt,
        'q': q,
        'qf': qf,
        'rows': rows,
        'stopwords': stopwords,
        'tie': tie,
        'wt': wt,
    }
    # Tolerate a trailing slash on base_url so the path never contains "//".
    url = f"{base_url.rstrip('/')}/{collection}/select"
    return requests.get(url, params=params, timeout=timeout)
@udf(returnType=StructType([
    StructField("is_finded", BooleanType(), False),
    StructField("basic_arr", ArrayType(StringType()), True)
]))
def get_china_company_name_match(raw_name: str, mm: str = '70%', rows: int = 1):
    """Look up ``raw_name`` in the ``ent_china_biz_basic`` Solr collection.

    The name is sent as both ``q`` and ``q.alt`` of an eDisMax query against
    the ``ent_name_en_abb`` field, and the first returned document is treated
    as the best match.

    Args:
        raw_name: Company name to search for.
        mm: Forwarded to ``edismax_call`` as its ``mm`` argument.
        rows: Forwarded to ``edismax_call`` as its ``rows`` argument.

    Returns:
        A ``(is_finded, basic_arr)`` pair matching the declared UDF schema.
        ``(False, None)`` when the input is empty, the HTTP status is not 200,
        or Solr returns no documents. Otherwise ``(True, [ent_name_chn,
        ent_name_en, ent_name_en_abb, unc_id])`` taken from the first document;
        a field absent from that document is reported as ``None``.
    """
    # Null/empty column values cannot match anything; skip the HTTP round trip.
    if not raw_name:
        return False, None

    solr_resp = edismax_call('ent_china_biz_basic', raw_name,
                             raw_name, 'ent_name_en_abb^1.0', mm, rows)
    if solr_resp.status_code != 200:
        return False, None

    resp = json.loads(solr_resp.text)['response']
    # Check the returned docs rather than numFound alone: numFound can be
    # positive while docs is empty (e.g. rows=0), which would break docs[0].
    docs = resp.get('docs') or []
    if resp.get('numFound', 0) == 0 or not docs:
        return False, None

    most_match_one = docs[0]
    wanted_fields = ('ent_name_chn', 'ent_name_en', 'ent_name_en_abb', 'unc_id')
    # .get() keeps a document that lacks one field from raising KeyError
    # and failing the whole Spark task.
    return True, [most_match_one.get(field) for field in wanted_fields]
if __name__ == '__main__':
    # Manual smoke test against the live Solr service.
    # The module-level name is bound to the @udf wrapper, which builds a Spark
    # Column expression when called instead of running the lookup. Call the
    # plain Python function behind it (exposed by PySpark as ``.func``), and
    # fall back to the name itself if it is not wrapped.
    _plain_lookup = getattr(get_china_company_name_match, 'func', get_china_company_name_match)
    print(_plain_lookup('SAMSUNG ELECTRONICS CO. LTD.,. '))
|