veriswap_spider.py

# -*- coding: utf-8 -*-
# Author : Charley
# Python : 3.10.8
# Date : 2025/12/8 19:23
import random
import time
import inspect
import requests
import schedule
import user_agent
from loguru import logger
from datetime import datetime
from mysql_pool import MySQLConnectionPool
from tenacity import retry, stop_after_attempt, wait_fixed

"""
Veriswap external spider
"""

logger.remove()
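# A new log file is started at midnight (rotation="00:00"); files older than
# a week are pruned (retention).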
logger.add("./logs/{time:YYYYMMDD}.log", encoding='utf-8', rotation="00:00",
           format="[{time:YYYY-MM-DD HH:mm:ss.SSS}] {level} {message}",
           level="DEBUG", retention="7 days")


def after_log(retry_state):
    """
    tenacity retry callback.
    :param retry_state: RetryCallState object
    """
    # Check that args exists and is non-empty
    if retry_state.args and len(retry_state.args) > 0:
        log = retry_state.args[0]  # use the logger passed in by the caller
    else:
        log = logger  # fall back to the global logger
    if retry_state.outcome.failed:
        log.warning(
            f"Function '{retry_state.fn.__name__}', attempt {retry_state.attempt_number} failed")
    else:
        log.info(f"Function '{retry_state.fn.__name__}', attempt {retry_state.attempt_number} succeeded")
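

# Usage sketch: any function decorated like the ones below reports each attempt
# through after_log, which picks up `log` from its first positional argument.
# (Illustrative only; `fetch` is a hypothetical function.)
#
#   @retry(stop=stop_after_attempt(3), wait=wait_fixed(1), after=after_log)
#   def fetch(log, url):
#       return requests.get(url, timeout=10)
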
@retry(stop=stop_after_attempt(5), wait=wait_fixed(1), after=after_log)
def get_proxys(log):
    """
    Build the proxy configuration.
    :return: proxies dict for requests
    """
    tunnel = "x371.kdltps.com:15818"
    kdl_username = "t13753103189895"
    kdl_password = "o0yefv6z"
    try:
        proxies = {
            "http": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": kdl_username, "pwd": kdl_password, "proxy": tunnel},
            "https": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": kdl_username, "pwd": kdl_password, "proxy": tunnel}
        }
        return proxies
    except Exception as e:
        log.error(f"Error getting proxy: {e}")
        raise e
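

# Note: get_proxys() is not wired into the request below; to route traffic
# through the tunnel, pass the dict to requests, e.g.:
#
#   response = requests.post(url, headers=headers, json=data,
#                            proxies=get_proxys(log), timeout=22)
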
@retry(stop=stop_after_attempt(5), wait=wait_fixed(5), after=after_log)
def get_single_page(log, page, sql_pool):
    """
    Fetch a single page of cards.
    :param log: logger object
    :param page: page number
    :param sql_pool: MySQL connection pool object
    :return: number of items returned
    """
    log.debug(f"............... Fetching page {page} ...............")
    headers = {
        "content-type": "application/json",
        "referer": "https://veriswap.com/",
        "user-agent": user_agent.generate_user_agent()
    }
    url = "https://veriswap-backend-689107296832.us-central1.run.app/cards/get/all"
    data = {
        "query": "",
        "page": page,
        "filterBy": "isHidden:=false",
        "sortBy": "updatedAt:desc"
    }
    response = requests.post(url, headers=headers, json=data, timeout=22)
    # print(response.json())
    response.raise_for_status()
    resp_json = response.json().get("items", [])
    if not resp_json:
        log.debug("No data........")
        log.debug(response.json())
        return 0
    info_list = []
    for item in resp_json:
        card_id = item.get("cardId")
        card_name = item.get("cardName")
        condition = item.get("condition")
        created_at = item.get("createdAt")  # Unix timestamp, e.g. 1747107593
        created_at = datetime.fromtimestamp(created_at).strftime('%Y-%m-%d %H:%M:%S')
        updated_at = item.get("updatedAt")
        updated_at = datetime.fromtimestamp(updated_at).strftime('%Y-%m-%d %H:%M:%S')
        era = item.get("era")
        for_sale = item.get("forSale")
        images = item.get("images", {})
        img_front = images.get("front")
        img_back = images.get("back")
        is_vaulted = item.get("isVaulted")
        parallel = item.get("parallel")
        player_name = item.get("playerName")
        price = item.get("price")
        print_run = item.get("printRun")
        set_name = item.get("set")
        sport = item.get("sport")
        card_type = item.get("type")
        user_id = item.get("userId")
        year = item.get("year")
        data_dict = {
            "card_id": card_id,
            "card_name": card_name,
            "card_condition": condition,
            "created_at": created_at,
            "updated_at": updated_at,
            "era": era,
            "for_sale": for_sale,
            "img_front": img_front,
            "img_back": img_back,
            "is_vaulted": is_vaulted,
            "parallel": parallel,
            "player_name": player_name,
            "price": price,
            "print_run": print_run,
            "set_name": set_name,
            "sport": sport,
            "card_type": card_type,
            "user_id": user_id,
            "year": year
        }
        # log.debug(data_dict)
        info_list.append(data_dict)
    # Save the data
    if info_list:
        sql_pool.insert_many(table="veriswap_card_record", data_list=info_list, ignore=True)
    return len(resp_json)
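

# Assumption: MySQLConnectionPool.insert_many(..., ignore=True) performs an
# INSERT IGNORE, so rows whose unique key already exists (e.g. a re-crawled
# card_id) are skipped instead of raising a duplicate-key error.
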
def get_all_page(log, sql_pool):
    """
    Fetch all pages of cards (pagination).
    :param log: logger object
    :param sql_pool: MySQL connection pool object
    """
    # page = 1
    page = 16903
    max_page = 20000
    while page <= max_page:
        try:
            len_list = get_single_page(log, page, sql_pool)
        except Exception as e:
            log.error(f'Request get_single_page error: {e}')
            len_list = 0
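        # Each full page appears to hold 20 items, so a shorter page (or a
        # failed request, which sets len_list to 0) marks the end of the data.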
        if len_list < 20:
            log.debug(f'Current page: {page}, items returned: {len_list}, stopping the crawl !!!')
            break
        page += 1
        # time.sleep(random.uniform(1, 2))
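

# With stop=stop_after_attempt(100) and wait=wait_fixed(3600), a failed run of
# wap_main is retried once an hour, for up to 100 attempts.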
@retry(stop=stop_after_attempt(100), wait=wait_fixed(3600), after=after_log)
def wap_main(log):
    """
    Main function.
    :param log: logger object
    """
    log.info(
        f'Starting spider task {inspect.currentframe().f_code.co_name}....................................................')
    # Set up the MySQL connection pool
    sql_pool = MySQLConnectionPool(log=log)
    if not sql_pool.check_pool_health():
        log.error("Database connection pool is unhealthy")
        raise RuntimeError("Database connection pool is unhealthy")
    try:
        try:
            log.debug('------------------- Fetching all pages of cards -------------------')
            get_all_page(log, sql_pool)
        except Exception as e:
            log.error(f'get_all_page error: {e}')
    except Exception as e:
        log.error(f'{inspect.currentframe().f_code.co_name} error: {e}')
    finally:
        log.info(f'Spider {inspect.currentframe().f_code.co_name} finished; waiting for the next crawl cycle............')


def schedule_task():
    """
    Entry point for the spider module's scheduled task.
    """
    # Run the task once immediately
    # wap_main(log=logger)
    # Set up the scheduled task
    schedule.every().day.at("00:01").do(wap_main, log=logger)
    while True:
        schedule.run_pending()
        time.sleep(1)
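

# Note: the __main__ block runs wap_main once directly; call schedule_task()
# instead to run the crawl every day at 00:01.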
if __name__ == '__main__':
    wap_main(logger)
    # get_all_page(logger, None)