gbca_spider.py 11 KB


  1. # -*- coding: utf-8 -*-
  2. # Author : Charley
  3. # Python : 3.10.8
  4. # Date : 2025/2/24 18:34
  5. import pytz
  6. import inspect
  7. import requests
  8. import user_agent
  9. from loguru import logger
  10. from datetime import datetime
  11. from mysq_pool import MySQLConnectionPool
  12. from tenacity import retry, stop_after_attempt, wait_fixed
  13. logger.remove()
  14. logger.add("./logs/{time:YYYYMMDD}.log", encoding='utf-8', rotation="00:00",
  15. format="[{time:YYYY-MM-DD HH:mm:ss.SSS}] {level} {message}",
  16. level="DEBUG", retention="15 day")
  17. def after_log(retry_state):
  18. """
  19. retry 回调
  20. :param retry_state: RetryCallState 对象
  21. """
  22. # 检查 args 是否存在且不为空
  23. if retry_state.args and len(retry_state.args) > 0:
  24. log = retry_state.args[0] # 获取传入的 logger
  25. else:
  26. log = logger # 使用全局 logger
  27. if retry_state.outcome.failed:
  28. log.warning(
  29. f"Function '{retry_state.fn.__name__}', Attempt {retry_state.attempt_number} Times")
  30. else:
  31. log.info(f"Function '{retry_state.fn.__name__}', Attempt {retry_state.attempt_number} succeeded")
  32. @retry(stop=stop_after_attempt(5), wait=wait_fixed(1), after=after_log)
  33. def get_proxys(log):
  34. """
  35. 获取代理
  36. :return: 代理
  37. """
  38. tunnel = "x371.kdltps.com:15818"
  39. kdl_username = "t13753103189895"
  40. kdl_password = "o0yefv6z"
  41. try:
  42. proxies = {
  43. "http": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": kdl_username, "pwd": kdl_password, "proxy": tunnel},
  44. "https": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": kdl_username, "pwd": kdl_password, "proxy": tunnel}
  45. }
  46. return proxies
  47. except Exception as e:
  48. log.error(f"Error getting proxy: {e}")
  49. raise e
  50. def save_data(sql_pool, info):
  51. """
  52. 保存数据
  53. :param sql_pool: sql连接池对象
  54. :param info: 保存的数据 -> tuple
  55. """
  56. sql = """
  57. INSERT INTO gbca_record (rating_code, front_img, back_img, company_short_name, goods_name, goods_score_name, year, publisher, brand, sub_brand, card_no, middle_score, border_score, card_angle_score, surface_score, sign_score, issue_limit, category2, company_id, create_time, update_time, order_code, card_id)
  58. VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"""
  59. sql_pool.insert_one(sql, info)
  60. def transfer_ts(timestamp_ms) -> str:
  61. """
  62. 转换时间戳 -> 1615975247000
  63. :param timestamp_ms:
  64. :return: ret_ts -> str
  65. """
  66. # 将毫秒转换为秒
  67. timestamp_s = timestamp_ms / 1000.0
  68. # 创建 UTC 时间
  69. utc_dt = datetime.fromtimestamp(timestamp_s, pytz.utc)
  70. # 需要转换到特定时区(例如 'Asia/Shanghai')
  71. shanghai_tz = pytz.timezone('Asia/Shanghai')
  72. shanghai_dt = utc_dt.astimezone(shanghai_tz)
  73. ret_ts = shanghai_dt.strftime('%Y-%m-%d %H:%M:%S')
  74. return ret_ts
  75. def parse_resp(log, resp, rating_code, sql_pool):
  76. """
  77. 解析响应
  78. :param log: logger对象
  79. :param resp: 响应
  80. :param rating_code: 评级编号
  81. :param sql_pool: sql连接池对象
  82. """
  83. if resp.get("errorCode") == 0:
  84. data = resp.get("data")
  85. img_list = data.get("imgList")
  86. if len(img_list) == 2:
  87. front_img = img_list[0].get("self")
  88. back_img = img_list[1].get("self")
  89. elif len(img_list) == 1:
  90. front_img = img_list[0].get("self")
  91. back_img = None
  92. else:
  93. log.warning(f"{inspect.currentframe().f_code.co_name} -> No img_list:{img_list}")
  94. front_img = None
  95. back_img = None
  96. company_short_name = data.get("companyShortName") # 评级公司简称
  97. goods_name = data.get("goodsName") # 名称
  98. goods_score_name = data.get("goodsScoreName") # 分数
  99. category2 = data.get("category2")
  100. company_id = data.get("companyId")
  101. create_time = data.get("createTime")
  102. create_time = transfer_ts(create_time) if create_time else None
  103. update_time = data.get("updateTime")
  104. update_time = transfer_ts(update_time) if update_time else None
  105. order_code = data.get("orderCode")
  106. card_id = data.get("id")
  107. attr_year = data.get("attr", [])
  108. attr_mapping = {
  109. '年份': 'year',
  110. '发行商': 'publisher',
  111. '卡片系列名称': 'brand',
  112. '子系列名称': 'sub_brand',
  113. '卡片编码': 'card_no',
  114. '居中分数': 'middle_score',
  115. '边框分数': 'border_score',
  116. '卡角分数': 'card_angle_score',
  117. '表面分数': 'surface_score',
  118. '签字分数': 'sign_score',
  119. '限发数': 'issue_limit'
  120. }
  121. res = {}
  122. for a in attr_year:
  123. for key, var_name in attr_mapping.items():
  124. if key in a:
  125. try:
  126. res[var_name] = a.replace(f'{key}:', '').strip()
  127. except Exception as e:
  128. log.error(f"Error parsing {key} from {a}: {e}")
  129. break
  130. else:
  131. # 循环遍历完所有键值对后都没有找到与 a 匹配的 key(即没有通过 break 提前退出),则会执行 else 子句内的代码
  132. log.warning(f"{inspect.currentframe().f_code.co_name} -> a:{a}")
  133. # 统计
  134. # countNum = data.get("countNum")
  135. # countRemark = data.get("countRemark")
  136. # if countNum:
  137. # statistics = f"{goods_score_name} 数量:{countNum}"
  138. # if countRemark:
  139. # statistics = f"{goods_score_name} 数量:{countNum}({countRemark})"
  140. # else:
  141. # statistics = f"{goods_score_name}:{countRemark}"
  142. info = (
  143. rating_code, front_img, back_img, company_short_name, goods_name, goods_score_name, res.get("year"),
  144. res.get("publisher"), res.get("brand"), res.get("sub_brand"), res.get("card_no"), res.get("middle_score"),
  145. res.get("border_score"), res.get("card_angle_score"), res.get("surface_score"), res.get("sign_score"),
  146. res.get("issue_limit"), category2, company_id, create_time, update_time, order_code, card_id)
  147. # print(info)
  148. save_data(sql_pool, info)
  149. else:
  150. log.debug(
  151. f"{inspect.currentframe().f_code.co_name} rating_code:{rating_code} -> errorCode:{resp.get('errorCode')}, msg:{resp.get('msg')}")
  152. @retry(stop=stop_after_attempt(5), wait=wait_fixed(1), after=after_log)
  153. def get_resp(log, rating_code, sql_pool):
  154. """
  155. 获取卡片信息
  156. :param log: logger对象
  157. :param rating_code: 评级编号
  158. :param sql_pool: sql连接池对象
  159. :return:
  160. """
  161. log.info(f"{inspect.currentframe().f_code.co_name} rating_code:{rating_code}")
  162. headers = {
  163. "Accept": "application/json, text/plain, */*",
  164. "Content-Type": "application/json;charset=UTF-8",
  165. "Referer": "https://www.gongbocoins.com/",
  166. "User-Agent": user_agent.generate_user_agent(),
  167. "lang": "en"
  168. }
  169. url = "https://wapi.gongbocoins.com/gbca/orderCoin/getWebsiteRatingInfo"
  170. data = {
  171. "ratingCode": rating_code
  172. }
  173. response = requests.post(url, headers=headers, json=data, proxies=get_proxys(log), timeout=10)
  174. # print(response.json())
  175. # print(response)
  176. response.raise_for_status()
  177. resp_json = response.json()
  178. if resp_json:
  179. parse_resp(log, resp_json, rating_code, sql_pool)
  180. else:
  181. log.warning(f"{inspect.currentframe().f_code.co_name} -> response:{response.status_code}")
  182. def get_811_code_list() -> list:
  183. """
  184. 获取811类 的 code_list
  185. :return: code_list
  186. """
  187. code_list = [code for code in range(8110000000, 8110150000)]
  188. return code_list
  189. def get_821_code_list() -> list:
  190. """
  191. 获取821类 的 code_list
  192. :return: code_list
  193. """
  194. code_list = [code for code in range(8210000000, 8210150000)]
  195. return code_list
  196. def get_851_code_list() -> list:
  197. """
  198. 获取851类 的 code_list
  199. :return: code_list
  200. """
  201. code_list = [code for code in range(8510013072, 8510150000)]
  202. return code_list
  203. @retry(stop=stop_after_attempt(50), wait=wait_fixed(1800), after=after_log)
  204. def gbca_main(log):
  205. """
  206. 主函数
  207. :param log: logger对象
  208. """
  209. log.info(
  210. f'开始运行 {inspect.currentframe().f_code.co_name} 爬虫任务....................................................')
  211. # 配置 MySQL 连接池
  212. sql_pool = MySQLConnectionPool(log=log)
  213. if not sql_pool:
  214. log.error("MySQL数据库连接失败")
  215. raise Exception("MySQL数据库连接失败")
  216. try:
  217. # rating_code = "8110008988"
  218. def process_code_list(code_list):
  219. for rating_code in code_list:
  220. try:
  221. get_resp(log, rating_code, sql_pool)
  222. except Exception as ce:
  223. log.error(f"{inspect.currentframe().f_code.co_name} -> error: {ce}")
  224. # code_list_811 = get_811_code_list()
  225. # process_code_list(code_list_811)
  226. #
  227. # code_list_821 = get_821_code_list()
  228. # process_code_list(code_list_821)
  229. code_list_851 = get_851_code_list()
  230. process_code_list(code_list_851)
  231. except Exception as e:
  232. log.error(f'{inspect.currentframe().f_code.co_name} error: {e}')
  233. finally:
  234. log.info(f'爬虫程序 {inspect.currentframe().f_code.co_name} 运行结束,等待下一轮的采集任务............')
# Script entry point: run the crawler with the module-level loguru logger.
if __name__ == '__main__':
    gbca_main(logger)
  237. # aa = {'msg': '操作成功', 'data': {'companyShortName': '北京公博星卡部', 'goodsNotes': None, 'recoverSign': 0, 'ratingAmountLevel': 2, 'recoverRemark': None, 'goodsDefect': '', 'ratingBoxTypePid': None, 'screenSign': 0, 'ratingInstallType': 1, 'id': 14423275, 'goodsScore': '', 'attr': ['年份:2023', '发行商:PANINI', '卡片系列名称:DONRUSS', '卡片编码:#007', '居中分数:N/G', '边框分数:N/G', '卡角分数:N/G', '表面分数:N/G'], 'goodsName': 'RONDALE MOORE', 'category2': 53, 'countNum': '2', 'goodsSize': None, 'screenRemark': None, 'updateTime': None, 'ratingResult': 1, 'goodsScoreStatus': 'AUTH.', 'companyId': 36, 'ratingCode': '8110100083', 'createTime': 1702608381650, 'orderCode': '5221702608381652', 'coinName': 'RONDALE MOORE', 'goodsDefectName': None, 'countRemark': '含代码部分', 'customerTel': '13691236280', 'goodsScoreName': 'AUTH.', 'imgList': [{'self': 'https://imgcdnwww.gongbocoins.com/202502271733/d9ddd683daf93722c62baad3961809f8/Photo%2FukluhROUC8husk4held4%2F8110100083-1.jpg%2Fself', 'list': 'https://imgcdnwww.gongbocoins.com/202502271733/0ded1c6de30e58ebb7a0afacd8f7cc1f/Photo%2FukluhROUC8husk4held4%2F8110100083-1.jpg%2Flist'}, {'self': 'https://imgcdnwww.gongbocoins.com/202502271733/a800fcca54c8c5dd6f8a581b6f0624cb/Photo%2FukluhROUC8husk4held4%2F8110100083.jpg%2Fself', 'list': 'https://imgcdnwww.gongbocoins.com/202502271733/5091de46fbad3b48a1b4862751b2d08c/Photo%2FukluhROUC8husk4held4%2F8110100083.jpg%2Flist'}]}, 'errorCode': 0}
  238. # parse_resp(logger, aa, "8110100083", None)