# cpc_clean_udf.py
  1. # encoding: utf8
  2. import re
  3. from inflect_udf import phrase_singular
  4. COMMA_STR = ","
  5. PIPE_SYMBOL = "|"
  6. # 是否包含数字和逗号 - 等
  7. chemical_re_1 = re.compile(r'\d+([.\']\d*)?\s*([,-])\s*\d+([.\']\d*)?')
  8. # 是否包含化学词汇
  9. chemical_re_2 = re.compile(r'ETHYL|ACID|AMINE|SALT|DIOXANE|AMINO')
  10. # 是否只有数字和逗号
  11. chemical_re_3 = re.compile(r'^[\d,]*$')
  12. def is_chemical_expression(word):
  13. if bool(chemical_re_3.match(word)):
  14. return False
  15. # 检查是否包含数字和逗号
  16. has_digits_and_commas = bool(chemical_re_1.search(word))
  17. # 检查是否包含一些特定的化学词汇
  18. has_chemical_word = bool(chemical_re_2.search(word))
  19. result = (has_digits_and_commas or has_chemical_word)
  20. return result
  21. # 中文字符
  22. chinese_char_pattern = re.compile(r'[\u4e00-\u9fff]+')
  23. # 数字
  24. number_pattern = re.compile(r'^-?\d+(\.\d+)?$')
  25. # 包含数字
  26. digit_pattern = re.compile(r'\d')
  27. # 保留词
  28. need_str_pattern = re.compile(r'(^3D)|DDR|VITAMIN|CONNECTOR|MP3|LAPTOP|RYZEN|INTEL|PHONE|NYLON|COVID|CAT|CABLE')
  29. # 切分符
  30. split_pattern = re.compile(r'\s*,\s*(?:OF|FOR)|\s*[,;&]\s*')
  31. # 特殊符号
  32. special_chars_pattern = re.compile(r'[¥$#~!!=??@><》《{}【】]')
  33. # 不包含空格和逗号,包含-得特殊化学
  34. no_space_pattern = re.compile(r'^(?!.*[ ,])(?=.*-).*$')
  35. # 需要保留的短字符产品词
  36. SHORT_CHAR_CPC = {
  37. "TV", "CD", "PC", "TF", "MP", "SD", "VR", "MR", "IC", "PU"
  38. }
  39. def contains_chinese(text):
  40. # 匹配中文字符的正则表达式
  41. return bool(chinese_char_pattern.search(text))
  42. def is_number(s):
  43. # 匹配整数、小数和负数
  44. return bool(number_pattern.match(s))
  45. def contains_digit(s):
  46. # 匹配字符串中是否包含数字
  47. return bool(digit_pattern.search(s))
  48. def need_str(s):
  49. # 匹配是否是保留词
  50. return bool(need_str_pattern.search(s))
  51. def special_char_remove(cpc):
  52. # 提取特殊字符产品词数据
  53. return bool(special_chars_pattern.search(cpc))
  54. def no_space(cpc):
  55. return bool(no_space_pattern.search(cpc))
  56. def is_invalid_cpc(word):
  57. """
  58. 判断一个产品词是否是不合法的产品词
  59. """
  60. if (is_number(word) # 纯数字
  61. or (len(word.replace('.', '')) < 3 and word not in SHORT_CHAR_CPC) # 长度<=3,且不是例举产品词的
  62. or contains_chinese(word) # 包含中文的
  63. or special_char_remove(word)): # 包含特殊字符的
  64. return True
  65. def multi_cpc_clean(cpc, force=False):
  66. if cpc is None:
  67. return None
  68. cpc = cpc.strip()
  69. if cpc == '':
  70. return None
  71. if COMMA_STR not in cpc and PIPE_SYMBOL not in cpc:
  72. if is_invalid_cpc(cpc):
  73. return None
  74. if is_chemical_expression(cpc):
  75. return cpc
  76. if not contains_digit(cpc) or need_str(cpc) or no_space(cpc):
  77. return phrase_singular(cpc)
  78. else:
  79. return None
  80. cpc_list = []
  81. # 先按管道符切分
  82. for cpc_i in cpc.split("|"):
  83. cpc_i = cpc_i.strip()
  84. # 是否是化学表达式
  85. if is_chemical_expression(cpc_i):
  86. cpc_list.append(
  87. phrase_singular(cpc_i)
  88. )
  89. continue
  90. # cpc中包含多种产品
  91. for cpc_j in re.split(split_pattern, cpc_i):
  92. cpc_j = cpc_j.strip()
  93. if is_invalid_cpc(cpc_j):
  94. continue
  95. cpc_j = phrase_singular(cpc_j)
  96. if cpc_j not in cpc_list:
  97. if not contains_digit(cpc_j) or need_str(cpc_j):
  98. cpc_list.append(cpc_j)
  99. if len(cpc_list) == 0:
  100. return None
  101. else:
  102. return " | ".join(cpc_list)
  103. if __name__ == '__main__':
  104. # print(multi_cpc_clean('SEMICARBAZIDE-13C, 15N2 HYDROCHLORIDE'))
  105. print(multi_cpc_clean('FAN'))
  106. # print(multi_cpc_clean('4-METHYLBENZALDEHYDE'))
  107. # print(multi_cpc_clean('1-ALLYL-2-THIOUREA'))
  108. # print(multi_cpc_clean('1HHH'))
  109. # print(multi_cpc_clean('BALL OR ROLLER BEARINGS'))
  110. # print(special_char_remove("AAAA》hhhh"))