# zhongjian_spider.py
  1. # -*- coding: utf-8 -*-
  2. # Author : Charley
  3. # Python : 3.10.8
  4. # Date : 2025/2/17 16:22
  5. import inspect
  6. import requests
  7. import user_agent
  8. from loguru import logger
  9. from tenacity import retry, stop_after_attempt, wait_fixed
  10. from mysq_pool import MySQLConnectionPool
  11. logger.remove()
  12. logger.add("./logs/{time:YYYYMMDD}.log", encoding='utf-8', rotation="00:00",
  13. format="[{time:YYYY-MM-DD HH:mm:ss.SSS}] {level} {message}",
  14. level="DEBUG", retention="7 day")
  15. def after_log(retry_state):
  16. """
  17. retry 回调
  18. :param retry_state: RetryCallState 对象
  19. """
  20. # 检查 args 是否存在且不为空
  21. if retry_state.args and len(retry_state.args) > 0:
  22. log = retry_state.args[0] # 获取传入的 logger
  23. else:
  24. log = logger # 使用全局 logger
  25. if retry_state.outcome.failed:
  26. log.warning(
  27. f"Function '{retry_state.fn.__name__}', Attempt {retry_state.attempt_number} Times")
  28. else:
  29. log.info(f"Function '{retry_state.fn.__name__}', Attempt {retry_state.attempt_number} succeeded")
  30. @retry(stop=stop_after_attempt(5), wait=wait_fixed(1), after=after_log)
  31. def get_proxys(log):
  32. """
  33. 获取代理
  34. :return: 代理
  35. """
  36. tunnel = "x371.kdltps.com:15818"
  37. kdl_username = "t13753103189895"
  38. kdl_password = "o0yefv6z"
  39. try:
  40. proxies = {
  41. "http": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": kdl_username, "pwd": kdl_password, "proxy": tunnel},
  42. "https": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": kdl_username, "pwd": kdl_password, "proxy": tunnel}
  43. }
  44. return proxies
  45. except Exception as e:
  46. log.error(f"Error getting proxy: {e}")
  47. raise e
  48. @retry(stop=stop_after_attempt(5), wait=wait_fixed(1), after=after_log)
  49. def get_request_one_page(log, rating_no) -> dict:
  50. headers = {
  51. "accept": "*/*",
  52. "accept-language": "en,zh-CN;q=0.9,zh;q=0.8",
  53. "content-type": "application/json;charset=UTF-8",
  54. "origin": "https://www.zhongjianjiantong.com",
  55. "priority": "u=1, i",
  56. "referer": "https://www.zhongjianjiantong.com/web/index.html",
  57. "sec-ch-ua": "\"Not(A:Brand\";v=\"99\", \"Google Chrome\";v=\"133\", \"Chromium\";v=\"133\"",
  58. "sec-ch-ua-mobile": "?1",
  59. "sec-ch-ua-platform": "\"Android\"",
  60. "sec-fetch-dest": "empty",
  61. "sec-fetch-mode": "cors",
  62. "sec-fetch-site": "same-origin",
  63. "user-agent": user_agent.generate_user_agent()
  64. }
  65. url = "https://www.zhongjianjiantong.com/Api/OrderRatingGoods/detail"
  66. data = {
  67. "rating_no": rating_no
  68. }
  69. try:
  70. with requests.Session() as session:
  71. response = session.post(url, headers=headers, json=data, proxies=get_proxys(log), timeout=5)
  72. # print(response.text)
  73. response.raise_for_status()
  74. return response.json()
  75. except Exception as e:
  76. log.warning(f"{inspect.currentframe().f_code.co_name} error: {e}")
  77. return {}
  78. def parse_data(resp_json, sql_pool):
  79. card_id = resp_json.get('data', {}).get('obj_order_rating_goods', {}).get('id')
  80. order_no = resp_json.get('data', {}).get('obj_order_rating_goods', {}).get('order_no')
  81. tag_no = resp_json.get('data', {}).get('obj_order_rating_goods', {}).get('tag_no') # 标签号/查询的号码
  82. images = resp_json.get('data', {}).get('obj_order_rating_goods', []).get('images')
  83. card_create_time = resp_json.get('data', {}).get('obj_order_rating_goods', {}).get('create_time')
  84. card_update_time = resp_json.get('data', {}).get('obj_order_rating_goods', {}).get('update_time')
  85. score = resp_json.get('data', {}).get('obj_order_rating_goods', {}).get('score') # 中检评分
  86. corners = resp_json.get('data', {}).get('obj_order_rating_goods', {}).get('obj_detail', {}).get('corners') # 四角
  87. eoges = resp_json.get('data', {}).get('obj_order_rating_goods', {}).get('obj_detail', {}).get('eoges') # 边缘
  88. surface = resp_json.get('data', {}).get('obj_order_rating_goods', {}).get('obj_detail', {}).get('surface') # 表面
  89. centering = resp_json.get('data', {}).get('obj_order_rating_goods', {}).get('obj_detail', {}).get('centering') # 居中
  90. colour = resp_json.get('data', {}).get('obj_order_rating_goods', {}).get('obj_detail', {}).get('colour') # 颜色
  91. repair = resp_json.get('data', {}).get('obj_order_rating_goods', {}).get('obj_detail', {}).get('repair') # 修复
  92. rating_no = resp_json.get('data', {}).get('obj_order_rating_goods', {}).get('rating_no') # 证书编号
  93. obj_brand_title = resp_json.get('data', {}).get('obj_order_rating_goods', {}).get('obj_brand', {}).get(
  94. 'title') # 商品品牌
  95. obj_detail_spxl = resp_json.get('data', {}).get('obj_order_rating_goods', {}).get('obj_detail', {}).get(
  96. 'spxl') # 商品系列
  97. obj_detail_spmc = resp_json.get('data', {}).get('obj_order_rating_goods', {}).get('obj_detail', {}).get(
  98. 'spmc') # 商品名称
  99. obj_detail_fxnf = resp_json.get('data', {}).get('obj_order_rating_goods', {}).get('obj_detail', {}).get(
  100. 'fxnf') # 发行年份
  101. obj_detail_yy = resp_json.get('data', {}).get('obj_order_rating_goods', {}).get('obj_detail', {}).get('yy') # 语言
  102. obj_detail_spbh = resp_json.get('data', {}).get('obj_order_rating_goods', {}).get('obj_detail', {}).get(
  103. 'spbh') # 商品编号
  104. info = (
  105. card_id, order_no, tag_no, images, card_create_time, card_update_time, score, corners, eoges, surface,
  106. centering,
  107. colour, repair, rating_no, obj_brand_title, obj_detail_spxl, obj_detail_spmc, obj_detail_fxnf, obj_detail_yy,
  108. obj_detail_spbh)
  109. sql = """
  110. INSERT INTO zhongjian_record (card_id, order_no, tag_no, images, card_create_time, card_update_time, score, corners, eoges, surface, centering, colour, repair, rating_no, obj_brand_title, obj_detail_spxl, obj_detail_spmc, obj_detail_fxnf, obj_detail_yy, obj_detail_spbh)
  111. VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
  112. """
  113. sql_pool.insert_one(sql, info)
  114. def control_rating_no(log, sql_pool):
  115. # rating_no_ = '519531553'
  116. for i in range(871527, 1000000):
  117. rating_no_ = f'519{i:06}'
  118. try:
  119. log.info(f"{rating_no_} is start ......................................")
  120. resp_json = get_request_one_page(log, rating_no_)
  121. if resp_json and resp_json.get('code') == 200:
  122. # print(resp_json)
  123. parse_data(resp_json, sql_pool)
  124. elif resp_json and resp_json.get('code') == 400:
  125. log.warning(f"{rating_no_} is not exist ......................................")
  126. else:
  127. log.warning(f"other warning, please check ......................................")
  128. except Exception as e:
  129. log.warning(f"{inspect.currentframe().f_code.co_name} error: {e}")
  130. continue
  131. @retry(stop=stop_after_attempt(50), wait=wait_fixed(600), after=after_log)
  132. def zhongjian_main(log):
  133. """
  134. 主函数
  135. :param log:
  136. """
  137. log.info(
  138. f'开始运行 {inspect.currentframe().f_code.co_name} 爬虫任务....................................................')
  139. # 配置 MySQL 连接池
  140. sql_pool = MySQLConnectionPool(log=log)
  141. if not sql_pool:
  142. log.error("MySQL数据库连接失败")
  143. raise Exception("MySQL数据库连接失败")
  144. try:
  145. control_rating_no(log, sql_pool)
  146. except Exception as e:
  147. log.error(f'{inspect.currentframe().f_code.co_name} error: {e}')
  148. finally:
  149. log.info(f'爬虫程序 {inspect.currentframe().f_code.co_name} 运行结束,等待下一轮的采集任务............')
  150. if __name__ == '__main__':
  151. zhongjian_main(logger)