# auctions90s_spider.py
  1. # -*- coding: utf-8 -*-
  2. # Author : Charley
  3. # Python : 3.12.10
  4. # Date : 2026/5/28
  5. """
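Incremental spider for 90sAuctions (runs on a semi-monthly schedule).
Workflow:
1. GET the home page and parse every auction id currently listed on the site.
2. Query `select distinct auction_id from auctions90s_record` to get the auctions already crawled.
3. Set difference = newly added auctions.
4. No new auctions -> skip list crawling for this round (step 6 still runs).
5. For each new auction: switch to it via postback -> paginate -> write to the database.
6. Backfill the detail pages of rows whose state != 1.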
  14. """
  15. import time
  16. import random
  17. import inspect
  18. import schedule
  19. from curl_cffi import requests
  20. from loguru import logger
  21. from tenacity import retry, stop_after_attempt, wait_fixed
  22. from mysql_pool import MySQLConnectionPool
  23. from auctions90s_core import (
  24. TABLE_NAME,
  25. client_identifier_list,
  26. crawl_one_auction,
  27. get_auction_list,
  28. update_details_for_pending,
  29. after_log,
  30. )
  31. logger.remove()
  32. logger.add("./logs/{time:YYYYMMDD}.log", encoding='utf-8', rotation="00:00",
  33. format="[{time:YYYY-MM-DD HH:mm:ss.SSS}] {level} {message}",
  34. level="DEBUG", retention="7 day")
  35. def get_existing_auction_ids(log, sql_pool):
  36. """
  37. 查库返回已爬过的 auction_id 集合。
  38. :param log: (loguru.Logger) 日志对象
  39. :param sql_pool: (MySQLConnectionPool) 数据库连接池
  40. :return: (set[str]) 已存在的 auction_id 集合(字符串形式,与首页解析值对齐)
  41. """
  42. rows = sql_pool.select_all(
  43. f"select distinct auction_id from {TABLE_NAME} where auction_id is not null"
  44. )
  45. ids = {str(r[0]) for r in rows} if rows else set()
  46. log.info(f"库中已存在 {len(ids)} 个 auction_id: {sorted(ids)}")
  47. return ids
  48. def diff_new_auctions(log, all_auctions, existing_ids):
  49. """
  50. 从首页解析的全部 auctions 中筛出库里没有的。
  51. :param log: (loguru.Logger) 日志对象
  52. :param all_auctions: (list[dict]) get_auction_list 返回的全部拍卖会列表
  53. :param existing_ids: (set[str]) 已存在的 auction_id 集合
  54. :return: (list[dict]) 新增待抓的 auction 列表
  55. """
  56. new_list = [a for a in all_auctions if a["id"] not in existing_ids]
  57. log.info(f"新增待抓取 auction 数: {len(new_list)} -> {[(a['id'], a['name']) for a in new_list]}")
  58. return new_list
  59. def run_incremental(log, sql_pool):
  60. """
  61. 增量抓取主流程:拉首页 → 差集 → 逐个抓新增 auction。
  62. :param log: (loguru.Logger) 日志对象
  63. :param sql_pool: (MySQLConnectionPool) 数据库连接池
  64. :return: None
  65. """
  66. impersonate = random.choice(client_identifier_list)
  67. with requests.Session() as session:
  68. try:
  69. all_auctions = get_auction_list(log, session, impersonate)
  70. except Exception as e:
  71. log.error(f"获取拍卖会列表失败: {e}")
  72. return
  73. existing_ids = get_existing_auction_ids(log, sql_pool)
  74. new_auctions = diff_new_auctions(log, all_auctions, existing_ids)
  75. if not new_auctions:
  76. log.info("本轮无新增 auction,跳过 list 抓取")
  77. return
  78. for idx, auc in enumerate(new_auctions, 1):
  79. aid, name = auc["id"], auc["name"]
  80. log.info(f"========== [{idx}/{len(new_auctions)}] 开始抓 auction={aid} ({name}) ==========")
  81. try:
  82. crawl_one_auction(log, sql_pool, session, impersonate,
  83. auction_id=aid, auction_name=name)
  84. except Exception as e:
  85. log.error(f"auction={aid} 抓取异常: {e}")
  86. continue
  87. @retry(stop=stop_after_attempt(100), wait=wait_fixed(3600), after=after_log)
  88. def nineties_main(log):
  89. """
  90. 日调度主函数:增量 list + 补详情。失败时按小时级重试(最多 100 次)。
  91. :param log: (loguru.Logger) 日志对象
  92. :return: None
  93. :raises Exception: MySQL 连接失败时抛出,由 tenacity 触发整轮重试
  94. """
  95. log.info(f'开始运行 {inspect.currentframe().f_code.co_name} 增量爬虫任务 ...')
  96. sql_pool = MySQLConnectionPool(log=log)
  97. if not sql_pool:
  98. log.error("MySQL数据库连接失败")
  99. raise Exception("MySQL数据库连接失败")
  100. try:
  101. try:
  102. run_incremental(log, sql_pool)
  103. except Exception as e:
  104. log.error(f'增量抓取失败: {e}')
  105. try:
  106. update_details_for_pending(log, sql_pool)
  107. except Exception as e:
  108. log.error(f'详情补抓失败: {e}')
  109. except Exception as e:
  110. log.error(f'{inspect.currentframe().f_code.co_name} error: {e}')
  111. finally:
  112. log.info(f'爬虫程序 {inspect.currentframe().f_code.co_name} 运行结束,等待下一轮采集 ...')
  113. def schedule_task():
  114. """
  115. 启动半月调度:脚本启动时先跑一次,之后每月 1 号和 15 号 05:00 各跑一次。
  116. :return: None(永不返回,内部死循环)
  117. """
  118. nineties_main(log=logger)
  119. def run_semimonthly():
  120. # 每月 1 号和 15 号执行(半月一次)
  121. from datetime import date
  122. if date.today().day in (1, 15):
  123. nineties_main(log=logger)
  124. schedule.every().day.at("05:00").do(run_semimonthly)
  125. while True:
  126. schedule.run_pending()
  127. time.sleep(1)
if __name__ == '__main__':
    # Script entry point: schedule_task() runs one crawl right away and then
    # stays in the scheduler loop. For a single one-off run, call
    # nineties_main(log=logger) instead.
    schedule_task()