snk_single_card_spider.py 8.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215
  1. # -*- coding: utf-8 -*-
  2. # Author : Charley
  3. # Python : 3.10.8
  4. # Date : 2026/4/15 15:10
  5. import time
  6. import inspect
  7. import requests
  8. import schedule
  9. import user_agent
  10. from loguru import logger
  11. from mysql_pool import MySQLConnectionPool
  12. from tenacity import retry, stop_after_attempt, wait_fixed
# Configure loguru: one log file per day under ./logs, rotated at midnight,
# DEBUG level and up, old files pruned by the retention policy.
# NOTE(review): loguru duration strings are normally spelled "7 days" —
# confirm "7 day" is accepted by the retention parser.
logger.remove()
logger.add("./logs/{time:YYYYMMDD}.log", encoding='utf-8', rotation="00:00",
           format="[{time:YYYY-MM-DD HH:mm:ss.SSS}] {level} {message}",
           level="DEBUG", retention="7 day")
# Session cookie captured from a logged-in browser session (includes an
# aws-waf-token). It expires and must be refreshed manually when requests
# start failing with auth/WAF errors.
cookie = "ENSID=MTc3NjMxMDE1MXw1LThLaEQ5OVRZTDBLbktHUzhpTG9PR01ybUQwYmxyb2dqMlJETVpUN3dCVVV3aWpwbXE0UGtDYXkwN2ZlUU9ib3I2a3ZCa21xZGdDQjc2aVBVNElHWDlXNXlPUS1YZVd816ifsPbu7_r0ouHJxFdsRYT3jsCwBFEZ6IMhzHnGcyI=; _pin_unauth=dWlkPU5qVXlOak16TlRJdE1UVmxNUzAwTUdZNExXRmpZakl0WlRJMU1EazBaVEl6WVRRMw; _tt_enable_cookie=1; _ttp=01KP7TC2C47N8816E74NZ2S87G_.tt.1; aws-waf-token=85ea0abd-c7e7-44fa-a486-721005517367:BgoAddgnTIMdAAAA:sgUJ0isHGRMm9HGMWaserzc0yH/cfmcnAJs7tApXkvxu8CkSB2W2/+vEB9V4uBUqE+8uegKHQINRE2ExEMC9XRl6QLHoC16s5mOsvrptUYDuWqAnyQJcr8a6dAlUpokqmdLFzLRoiM2digCAKXmKRM5fbEQgY56lCzRpNqolUtcS/X9zZQIfJnj2GfmAjNw=; _gcl_au=1.1.140783051.1776231515.1213660920.1776231619.1776231619; _ga_T9G4FWRKGP=GS2.1.s1776309189$o1$g0$t1776309189$j60$l0$h0; _gid=GA1.2.339334579.1776309190; __lt__cid=074b5327-9f75-4356-a201-9879abb859a5; __lt__sid=1cd194ae-b7f570f4; __rtbh.uid=%7B%22eventType%22%3A%22uid%22%2C%22id%22%3A%22undefined%22%2C%22expiryDate%22%3A%222027-04-16T03%3A13%3A11.377Z%22%7D; __rtbh.lid=%7B%22eventType%22%3A%22lid%22%2C%22id%22%3A%221CbwgL9cLbT9BLfXJtkK%22%2C%22expiryDate%22%3A%222027-04-16T03%3A13%3A11.379Z%22%7D; _ga_WLFPCJHLHL=GS2.1.s1776309194$o1$g0$t1776309194$j60$l0$h0; _ga=GA1.1.342828207.1776231516; forterToken=6f95dbdd5df5486fb9bfe93652a6b6a7_1776309188237__UDF43-m4_27ck_; ttcsid_CEM1KGBC77U8BHMFF6SG=1776309193318::MmzC2Nw5ahbgUD_ovmJa.1.1776309203360.1; _dd_s=aid=c68687bf-9ca5-4040-9266-a9c8281287b7&logs=1&id=f2443d73-de49-431a-9dba-17f30b9410ac&created=1776309188187&expire=1776310738774&rum=0; _rdt_uuid=1776231515762.27c25d2a-f2b5-4370-89ca-ba2ba6d93c35; _rdt_em=:7fa565b08bc719fc95a07f3f9cbb8cfcd715b62ce82bc26739d3074a5196870c; ttcsid_CAP79SBC77U56BB6BI50=1776309194536::zh_5-OLx-MD4DmA4jALH.4.1776310041216.1; ttcsid=1776309194523::bhCq-3lisAc3SvWnrZng.4.1776310041216.0::1.845885.846484::955506.51.1579.4652::954188.255.4300; _ga_6H1EYVVN53=GS2.1.s1776308613$o5$g1$t1776310150$j60$l0$h0; _ga_3722WCREQR=GS2.1.s1776308613$o5$g1$t1776310150$j60$l0$h0"
# Shared request headers. The random user-agent is generated ONCE at import
# time, so every request in a run reuses the same UA string.
headers = {
    "accept": "application/json",
    # "referer": "https://snkrdunk.com/en/trading-cards/671489?slide=right",
    "user-agent": user_agent.generate_user_agent(),
    "cookie": cookie
}
  24. def after_log(retry_state):
  25. """
  26. retry 回调
  27. :param retry_state: RetryCallState 对象
  28. """
  29. # 检查 args 是否存在且不为空
  30. if retry_state.args and len(retry_state.args) > 0:
  31. log = retry_state.args[0] # 获取传入的 logger
  32. else:
  33. log = logger # 使用全局 logger
  34. if retry_state.outcome.failed:
  35. log.warning(
  36. f"Function '{retry_state.fn.__name__}', Attempt {retry_state.attempt_number} Times")
  37. else:
  38. log.info(f"Function '{retry_state.fn.__name__}', Attempt {retry_state.attempt_number} succeeded")
  39. @retry(stop=stop_after_attempt(5), wait=wait_fixed(1), after=after_log)
  40. def get_proxys(log):
  41. """
  42. 获取代理配置
  43. :param log: 日志对象
  44. :return: 代理字典
  45. """
  46. tunnel = "x371.kdltps.com:15818"
  47. kdl_username = "t13753103189895"
  48. kdl_password = "o0yefv6z"
  49. try:
  50. proxies = {
  51. "http": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": kdl_username, "pwd": kdl_password, "proxy": tunnel},
  52. "https": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": kdl_username, "pwd": kdl_password, "proxy": tunnel}
  53. }
  54. return proxies
  55. except Exception as e:
  56. log.error(f"Error getting proxy: {e}")
  57. raise e
  58. @retry(stop=stop_after_attempt(5), wait=wait_fixed(1), after=after_log)
  59. def get_single_page(log, page, brand):
  60. """
  61. 获取单页数据
  62. :param log: 日志对象
  63. :param page: 页码
  64. :param brand: 品牌
  65. :return: 数据列表
  66. """
  67. log.info(f"获取第 {page} 页数据,品牌为 {brand}....................................................")
  68. url = "https://snkrdunk.com/en/v1/trading-cards/used"
  69. params = {
  70. "brandId": brand,
  71. # "brandId": "pokemon",
  72. "categoryId": "25",
  73. # "page": "29",
  74. "page": page,
  75. "perPage": "20",
  76. "sortType": "latest",
  77. "isOnlyOnSale": "false"
  78. }
  79. response = requests.get(url, headers=headers, params=params, proxies=get_proxys(log), timeout=22)
  80. response.raise_for_status()
  81. resp_json = response.json()
  82. # print(resp_json)
  83. usedTradingCards = resp_json.get("usedTradingCards", [])
  84. return usedTradingCards
  85. def parse_data(log, resp_list, brand, sql_pool):
  86. """
  87. 解析数据
  88. :param log: 日志对象
  89. :param resp_list: 数据列表
  90. :param brand: 品牌
  91. :param sql_pool: 数据库连接池
  92. """
  93. try:
  94. dict_list = []
  95. for data in resp_list:
  96. used_id = data.get("id")
  97. tradingCardId = data.get("tradingCardId")
  98. card_name = data.get("tradingCardName")
  99. listing_uid = data.get("listingUID") # 进入详情页的id
  100. price = data.get("price")
  101. price = price.replace("US $", "").replace(",", "")
  102. condition = data.get("condition")
  103. front_img = data.get("thumbnailUrl")
  104. # 去除图片中的大小格式 https://cdn.snkrdunk.com/apparel_used_listings/49c34dac-27d9-4b7b-96db-72a6c70a464c/5907002.jpeg?size=m
  105. front_img = front_img.split("?")[0]
  106. is_sold = data.get("isSold")
  107. data_dict = {
  108. "brand": brand,
  109. "used_id": used_id,
  110. "trading_card_id": tradingCardId,
  111. "card_name": card_name,
  112. "listing_uid": listing_uid,
  113. "price": price,
  114. "score": condition,
  115. "front_img": front_img,
  116. "is_sold": is_sold,
  117. "category": "Trading Cards (Single Card)"
  118. }
  119. # print(data_dict)
  120. dict_list.append(data_dict)
  121. sql_pool.insert_many(table="snkrdunk_record", data_list=dict_list, ignore=True)
  122. except Exception as e:
  123. log.error(f"Error parsing data: {e}")
  124. def get_list_data(log, brand, sql_pool):
  125. """
  126. 获取列表数据
  127. :param log: 日志对象
  128. :param brand: 品牌
  129. :param sql_pool: 数据库连接池
  130. """
  131. page = 1
  132. while True:
  133. try:
  134. data_list = get_single_page(log, page, brand)
  135. if not data_list:
  136. log.info(f"No more data for brand {brand}, page {page}")
  137. break
  138. except Exception as e:
  139. log.error(f"Error getting page {page} for brand {brand}: {e}")
  140. data_list = []
  141. parse_data(log, data_list, brand, sql_pool)
  142. if len(data_list) < 20:
  143. log.info(f"No more data for brand {brand}, page {page}")
  144. break
  145. page += 1
  146. @retry(stop=stop_after_attempt(100), wait=wait_fixed(3600), after=after_log)
  147. def snk_main(log):
  148. """
  149. 主函数
  150. :param log: logger对象
  151. """
  152. log.info(
  153. f'开始运行 {inspect.currentframe().f_code.co_name} 爬虫任务....................................................')
  154. # 配置 MySQL 连接池
  155. sql_pool = MySQLConnectionPool(log=log)
  156. if not sql_pool.check_pool_health():
  157. log.error("数据库连接池异常")
  158. raise RuntimeError("数据库连接池异常")
  159. brand_list = ["pokemon", "onepiece", "yu-gi-oh"]
  160. try:
  161. for brand in brand_list:
  162. log.info(f'开始采集 {brand} 数据....................................................')
  163. try:
  164. get_list_data(log, brand, sql_pool)
  165. except Exception as e:
  166. log.error(f'采集 {brand} 数据异常: {e}')
  167. except Exception as e:
  168. log.error(f'{inspect.currentframe().f_code.co_name} error: {e}')
  169. finally:
  170. log.info(f'爬虫程序 {inspect.currentframe().f_code.co_name} 运行结束,等待下一轮的采集任务............')
  171. def schedule_task():
  172. """
  173. 爬虫模块 定时任务 的启动文件
  174. """
  175. # 立即运行一次任务
  176. # snk_main(log=logger)
  177. # 设置定时任务
  178. schedule.every().day.at("00:01").do(snk_main, log=logger)
  179. while True:
  180. schedule.run_pending()
  181. time.sleep(1)
if __name__ == '__main__':
    # Start the daily scheduler loop (blocks forever).
    schedule_task()
    # snk_main(log=logger)  # uncomment for a one-off immediate run while debugging