| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142 |
- # encoding: utf8
- import re
- from inflect_udf import phrase_singular
# Separator characters that mark a raw value as holding several product words.
COMMA_STR = ","
PIPE_SYMBOL = "|"

# Two numbers (each optionally followed by "." or "'" and more digits)
# joined by a comma or a hyphen, e.g. "1,2" or "13-15".
chemical_re_1 = re.compile(r"\d+([.\']\d*)?\s*([,-])\s*\d+([.\']\d*)?")

# Substrings that hint at a chemical name.
chemical_re_2 = re.compile(r"ETHYL|ACID|AMINE|SALT|DIOXANE|AMINO")

# Strings made up of nothing but digits and commas (the empty string too).
chemical_re_3 = re.compile(r"^[\d,]*$")
def is_chemical_expression(word):
    """
    Tell whether ``word`` looks like a chemical expression.

    A string consisting only of digits and commas is never treated as
    chemical. Otherwise the word qualifies when it contains either a
    "number , number" / "number - number" locant pair or one of the
    chemical keywords.

    :param word: text to inspect.
    :return: ``True`` if the word looks chemical, ``False`` otherwise.
    """
    # Digits-and-commas-only strings are rejected up front.
    if chemical_re_3.match(word) is not None:
        return False

    # Numeric locant pair such as "1,2" or "13-15".
    if chemical_re_1.search(word) is not None:
        return True

    # Otherwise fall back to the keyword check.
    return chemical_re_2.search(word) is not None
# One or more characters in the range U+4E00..U+9FFF (Chinese characters).
chinese_char_pattern = re.compile(r"[\u4e00-\u9fff]+")

# An optionally negative integer or decimal number.
number_pattern = re.compile(r"^-?\d+(\.\d+)?$")

# Any single digit.
digit_pattern = re.compile(r"\d")

# Reserved words: "3D" at the start of the string, or any of the listed
# keywords anywhere in it (plain substring match).
need_str_pattern = re.compile(r"(^3D)|DDR|VITAMIN|CONNECTOR|MP3|LAPTOP|RYZEN|INTEL|PHONE|NYLON|COVID|CAT|CABLE")

# Delimiters between product words: a comma followed by OF/FOR, or a
# comma, semicolon or ampersand, each with surrounding whitespace.
split_pattern = re.compile(r"\s*,\s*(?:OF|FOR)|\s*[,;&]\s*")

# Special characters that disqualify a product word.
special_chars_pattern = re.compile(r"[¥$#~!!=??@><》《{}【】]")

# Strings with no space and no comma that contain at least one hyphen
# (special chemical names).
no_space_pattern = re.compile(r"^(?!.*[ ,])(?=.*-).*$")

# Short product words that are kept despite their length.
SHORT_CHAR_CPC = {
    "TV",
    "CD",
    "PC",
    "TF",
    "MP",
    "SD",
    "VR",
    "MR",
    "IC",
    "PU",
}
def contains_chinese(text):
    """
    Report whether ``text`` contains at least one Chinese character
    (as defined by ``chinese_char_pattern``).

    :param text: string to scan.
    :return: ``True`` if a match is found, ``False`` otherwise.
    """
    found = chinese_char_pattern.search(text)
    return found is not None
def is_number(s):
    """
    Report whether ``s`` is a plain number: an integer or a decimal,
    optionally negative (as defined by ``number_pattern``).

    :param s: string to test.
    :return: ``True`` if ``s`` is numeric, ``False`` otherwise.
    """
    matched = number_pattern.match(s)
    return matched is not None
def contains_digit(s):
    """
    Report whether ``s`` contains at least one digit.

    :param s: string to scan.
    :return: ``True`` if any digit occurs in ``s``, ``False`` otherwise.
    """
    hit = digit_pattern.search(s)
    return hit is not None
def need_str(s):
    """
    Report whether ``s`` contains a reserved word (see ``need_str_pattern``).

    :param s: string to scan.
    :return: ``True`` if a reserved word is present, ``False`` otherwise.
    """
    hit = need_str_pattern.search(s)
    return hit is not None
def special_char_remove(cpc):
    """
    Report whether ``cpc`` contains any character matched by
    ``special_chars_pattern``.

    Despite the name, nothing is removed: the function only detects.

    :param cpc: product word to scan.
    :return: ``True`` if a special character is present, ``False`` otherwise.
    """
    hit = special_chars_pattern.search(cpc)
    return hit is not None
def no_space(cpc):
    """
    Report whether ``cpc`` has no space and no comma but does contain a
    hyphen (see ``no_space_pattern``).

    :param cpc: product word to test.
    :return: ``True`` if the pattern matches, ``False`` otherwise.
    """
    hit = no_space_pattern.search(cpc)
    return hit is not None
def is_invalid_cpc(word):
    """
    Decide whether a product word must be rejected as invalid.

    A word is invalid when any of the following holds:
      * it is a plain number;
      * it has fewer than 3 characters once dots are removed and it is not
        listed in ``SHORT_CHAR_CPC``;
      * it contains Chinese characters;
      * it contains one of the special characters.

    :param word: product word to check.
    :return: ``True`` when the word is invalid, ``False`` otherwise
        (always a bool, never ``None``).
    """
    # Pure numbers are never product words.
    if is_number(word):
        return True

    # Too short (dots do not count) unless explicitly whitelisted.
    if len(word.replace('.', '')) < 3 and word not in SHORT_CHAR_CPC:
        return True

    # Chinese text or special characters disqualify the word.
    return bool(contains_chinese(word) or special_char_remove(word))
def multi_cpc_clean(cpc, force=False):
    """
    Clean a raw product-word value that may hold several product words.

    Single value (no comma and no pipe):
      * invalid words yield ``None``;
      * chemical expressions are returned as they are (stripped only);
      * words without digits, words containing a reserved word, and
        hyphenated words without spaces are singularised and returned;
      * anything else yields ``None``.

    Multiple values: the text is split on "|" first. A segment that looks
    like a chemical expression is singularised and kept whole. Other
    segments are split further on ``split_pattern``; each valid piece is
    singularised and kept once, provided it has no digit or contains a
    reserved word.

    :param cpc: raw product-word string, or ``None``.
    :param force: accepted but not used by this function.
    :return: the cleaned words joined by " | ", or ``None`` if nothing is left.
    """
    if cpc is None:
        return None

    text = cpc.strip()
    if not text:
        return None

    # --- single product word -------------------------------------------
    if COMMA_STR not in text and PIPE_SYMBOL not in text:
        if is_invalid_cpc(text):
            return None
        if is_chemical_expression(text):
            return text
        keep = not contains_digit(text) or need_str(text) or no_space(text)
        return phrase_singular(text) if keep else None

    # --- several product words -----------------------------------------
    cleaned = []
    for segment in text.split("|"):
        segment = segment.strip()

        # Chemical expressions are kept whole: their commas are not delimiters.
        if is_chemical_expression(segment):
            cleaned.append(phrase_singular(segment))
            continue

        for piece in split_pattern.split(segment):
            piece = piece.strip()
            if is_invalid_cpc(piece):
                continue

            singular = phrase_singular(piece)
            if singular in cleaned:
                continue
            # Drop pieces that carry digits unless they hold a reserved word.
            if contains_digit(singular) and not need_str(singular):
                continue
            cleaned.append(singular)

    return " | ".join(cleaned) if cleaned else None
if __name__ == "__main__":
    # Manual smoke check. Other calls tried during development:
    #   multi_cpc_clean('SEMICARBAZIDE-13C, 15N2 HYDROCHLORIDE')
    #   multi_cpc_clean('4-METHYLBENZALDEHYDE')
    #   multi_cpc_clean('1-ALLYL-2-THIOUREA')
    #   multi_cpc_clean('1HHH')
    #   multi_cpc_clean('BALL OR ROLLER BEARINGS')
    #   special_char_remove("AAAA》hhhh")
    sample_result = multi_cpc_clean('FAN')
    print(sample_result)
|