# encoding: utf8
"""Helpers for validating and cleaning product-word ("cpc") strings.

The entry point is :func:`multi_cpc_clean`, which takes a raw string that may
hold one or several product words (separated by ``|``, ``,``, ``;`` or ``&``),
drops the invalid ones, singularizes the rest with ``phrase_singular`` and
joins the survivors with ``" | "``.
"""
import re
from typing import Optional

from inflect_udf import phrase_singular

COMMA_STR = ","
PIPE_SYMBOL = "|"

# A number, then a comma or hyphen, then another number (e.g. "1,2" or "2-3").
# Each number may carry a "." or "'" suffix followed by optional digits.
chemical_re_1 = re.compile(r'\d+([.\']\d*)?\s*([,-])\s*\d+([.\']\d*)?')
# Word fragments treated as chemical vocabulary (case-sensitive, upper case).
chemical_re_2 = re.compile(r'ETHYL|ACID|AMINE|SALT|DIOXANE|AMINO')
# Only digits and commas (also matches the empty string).
chemical_re_3 = re.compile(r'^[\d,]*$')


def is_chemical_expression(word: str) -> bool:
    """Return True if *word* looks like a chemical name.

    A word made up solely of digits and commas (or an empty word) is never
    chemical. Otherwise the word is chemical when it contains either a
    "number , number" / "number - number" locant pattern or one of the
    chemical vocabulary fragments.
    """
    if chemical_re_3.match(word):
        return False
    return bool(chemical_re_1.search(word) or chemical_re_2.search(word))


# CJK unified ideographs (Chinese characters).
chinese_char_pattern = re.compile(r'[\u4e00-\u9fff]+')
# A whole string that is an optionally negative integer or decimal.
number_pattern = re.compile(r'^-?\d+(\.\d+)?$')
# Any digit.
digit_pattern = re.compile(r'\d')
# Reserved words: a product word containing one of these is kept even if it
# also contains digits.
need_str_pattern = re.compile(r'(^3D)|DDR|VITAMIN|CONNECTOR|MP3|LAPTOP|RYZEN|INTEL|PHONE|NYLON|COVID|CAT|CABLE')
# Separators between product words: ", OF" / ", FOR" (the preposition is
# swallowed together with the comma) or a bare ",", ";" or "&".
# The trailing \b keeps the first alternative from eating the start of a longer
# word: without it "BOLTS, FORGED" would be split into "BOLTS" and "GED".
split_pattern = re.compile(r'\s*,\s*(?:OF|FOR)\b|\s*[,;&]\s*')
# Special symbols that make a product word invalid.
special_chars_pattern = re.compile(r'[¥$#~!!=??@><》《{}【】]')
# No space and no comma anywhere, but at least one hyphen
# (hyphenated chemical-style names such as "1-ALLYL-2-THIOUREA").
no_space_pattern = re.compile(r'^(?!.*[ ,])(?=.*-).*$')

# Short product words that are kept despite the minimum-length rule.
SHORT_CHAR_CPC = {
    "TV", "CD", "PC", "TF", "MP", "SD", "VR", "MR", "IC", "PU"
}


def contains_chinese(text: str) -> bool:
    """Return True if *text* contains at least one Chinese character."""
    return bool(chinese_char_pattern.search(text))


def is_number(s: str) -> bool:
    """Return True if *s* is an integer or decimal, optionally negative."""
    return bool(number_pattern.match(s))


def contains_digit(s: str) -> bool:
    """Return True if *s* contains at least one digit."""
    return bool(digit_pattern.search(s))


def need_str(s: str) -> bool:
    """Return True if *s* contains one of the reserved words."""
    return bool(need_str_pattern.search(s))


def special_char_remove(cpc: str) -> bool:
    """Return True if *cpc* contains any of the special symbols.

    Despite the name, nothing is removed; this is a pure predicate.
    """
    return bool(special_chars_pattern.search(cpc))


def no_space(cpc: str) -> bool:
    """Return True if *cpc* has a hyphen but no space and no comma."""
    return bool(no_space_pattern.search(cpc))


def is_invalid_cpc(word: str) -> bool:
    """Return True if *word* is not an acceptable product word.

    A word is invalid when any of the following holds:

    * it is a plain number;
    * it has fewer than 3 characters once dots are removed and is not listed
      in ``SHORT_CHAR_CPC``;
    * it contains Chinese characters;
    * it contains one of the special symbols.
    """
    return (
        is_number(word)
        or (len(word.replace('.', '')) < 3 and word not in SHORT_CHAR_CPC)
        or contains_chinese(word)
        or special_char_remove(word)
    )


def multi_cpc_clean(cpc: Optional[str], force: bool = False) -> Optional[str]:
    """Clean a raw product-word string that may hold several product words.

    Args:
        cpc: Raw string, or None.
        force: Not read by this function; kept so existing callers that pass
            it keep working.

    Returns:
        The cleaned product word(s), multiple words joined with ``" | "``,
        or None when the input is None/blank or nothing valid remains.
    """
    if cpc is None:
        return None
    cpc = cpc.strip()
    if not cpc:
        return None

    # Single product word: no comma and no pipe in the input.
    if COMMA_STR not in cpc and PIPE_SYMBOL not in cpc:
        if is_invalid_cpc(cpc):
            return None
        if is_chemical_expression(cpc):
            # Chemical names are returned untouched in this branch.
            return cpc
        if not contains_digit(cpc) or need_str(cpc) or no_space(cpc):
            return phrase_singular(cpc)
        # Contains digits without being reserved or a hyphenated name.
        return None

    cleaned = []
    # Split on the pipe first; each segment may still hold several words.
    for segment in cpc.split(PIPE_SYMBOL):
        segment = segment.strip()

        # A chemical segment is kept whole: its commas belong to the name.
        if is_chemical_expression(segment):
            chemical = phrase_singular(segment)
            if chemical not in cleaned:
                cleaned.append(chemical)
            continue

        for word in split_pattern.split(segment):
            word = word.strip()
            if is_invalid_cpc(word):
                continue
            word = phrase_singular(word)
            # Skip duplicates, and words with digits unless they are reserved.
            if word not in cleaned and (not contains_digit(word) or need_str(word)):
                cleaned.append(word)

    return " | ".join(cleaned) if cleaned else None


if __name__ == '__main__':
    print(multi_cpc_clean('FAN'))
    # Other sample inputs kept for manual checks:
    # print(multi_cpc_clean('SEMICARBAZIDE-13C, 15N2 HYDROCHLORIDE'))
    # print(multi_cpc_clean('4-METHYLBENZALDEHYDE'))
    # print(multi_cpc_clean('1-ALLYL-2-THIOUREA'))
    # print(multi_cpc_clean('1HHH'))
    # print(multi_cpc_clean('BALL OR ROLLER BEARINGS'))
    # print(special_char_remove("AAAA》hhhh"))