# -*- coding: utf-8 -*-
# hug_spider.py
# Author : Charley
# Python : 3.12.10
# Date : 2026/5/19 16:56
  5. import time
  6. import inspect
  7. import requests
  8. import schedule
  9. import user_agent
  10. from loguru import logger
  11. from parsel import Selector
  12. from mysql_pool import MySQLConnectionPool
  13. from tenacity import retry, stop_after_attempt, wait_fixed
  14. """
  15. 目标网站: https://hugginsandscott.com/auction/2026/winter/
  16. """
  17. logger.remove()
  18. logger.add("./logs/{time:YYYYMMDD}.log", encoding='utf-8', rotation="00:00",
  19. format="[{time:YYYY-MM-DD HH:mm:ss.SSS}] {level} {message}",
  20. level="DEBUG", retention="7 day")
  21. headers = {
  22. "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
  23. "user-agent": user_agent.generate_user_agent()
  24. }
  25. def after_log(retry_state):
  26. """
  27. retry 回调
  28. :param retry_state: RetryCallState 对象
  29. """
  30. # 检查 args 是否存在且不为空
  31. if retry_state.args and len(retry_state.args) > 0:
  32. log = retry_state.args[0] # 获取传入的 logger
  33. else:
  34. log = logger # 使用全局 logger
  35. if retry_state.outcome.failed:
  36. log.warning(
  37. f"Function '{retry_state.fn.__name__}', Attempt {retry_state.attempt_number} Times")
  38. else:
  39. log.info(f"Function '{retry_state.fn.__name__}', Attempt {retry_state.attempt_number} succeeded")
  40. @retry(stop=stop_after_attempt(5), wait=wait_fixed(2), after=after_log)
  41. def get_proxys(log):
  42. http_proxy = "http://u1952150085001297:sJMHl4qc4bM0@proxy.123proxy.cn:36931"
  43. https_proxy = "http://u1952150085001297:sJMHl4qc4bM0@proxy.123proxy.cn:36931"
  44. try:
  45. proxySettings = {
  46. "http": http_proxy,
  47. "https": https_proxy,
  48. }
  49. return proxySettings
  50. except Exception as e:
  51. log.error(f"Error getting proxy: {e}")
  52. raise e
  53. @retry(stop=stop_after_attempt(5), wait=wait_fixed(2), after=after_log)
  54. def get_details(log, url, sql_pool, sql_id):
  55. """
  56. 获取详情数据
  57. :param log: logger对象
  58. :param url: 详情页URL
  59. :param sql_pool: MySQL连接池
  60. :param sql_id: 数据ID
  61. :return: 标题和描述
  62. """
  63. log.info(f">>>>>>>>>>>>>> 正在爬取详情数据URL: {url} <<<<<<<<<<<<<<")
  64. response = requests.get(url, headers=headers, timeout=10, proxies=get_proxys(log))
  65. response.raise_for_status()
  66. selector = Selector(response.text)
  67. year = selector.xpath('//section//div/ul/li[2]/text()').get()
  68. year = year.replace('Year: ', '') if year else None
  69. auction = selector.xpath('//section//div/ul/li[3]/text()').get()
  70. auction = auction.replace('Auction: ', '') if auction else None
  71. imgs = selector.xpath('//section//button/span/img/@src').getall()
  72. imgs = ','.join(imgs) if imgs else None
  73. # print(year, auction, imgs)
  74. # 更新数据和状态
  75. sql_pool.update_one_or_dict(
  76. table="hugginsandscott_record",
  77. data={"year": year, "auction": auction, "imgs": imgs, "state": 1},
  78. condition={"id": sql_id}
  79. )
  80. @retry(stop=stop_after_attempt(5), wait=wait_fixed(2), after=after_log)
  81. def get_single_page(log, page, auc_url, sql_pool):
  82. """
  83. 获取单页数据
  84. :param log: logger对象
  85. :param page: 页码
  86. :param auc_url: 拍卖会URL
  87. :param sql_pool: MySQL连接池
  88. :return: 该页数据条数
  89. """
  90. log.info(f"Requesting page {page} for auction:{auc_url}........................... started")
  91. # url = "https://hugginsandscott.com/auction/2026/winter/"
  92. params = {
  93. # "page": "3"
  94. "page": page
  95. }
  96. response = requests.get(auc_url, headers=headers, params=params, timeout=10, proxies=get_proxys(log))
  97. # print(response.text)
  98. response.raise_for_status()
  99. selector = Selector(response.text)
  100. tag_tr_list = selector.xpath("//table/tbody/tr")
  101. info_list = []
  102. for tag_tr in tag_tr_list:
  103. lot_no = tag_tr.xpath("./td[1]/text()").get()
  104. title = tag_tr.xpath("./td[2]/a/text()").get()
  105. title = title.strip() if title else ""
  106. detail_url = tag_tr.xpath("./td[2]/a/@href").get()
  107. detail_url = 'https://hugginsandscott.com' + detail_url if detail_url else ""
  108. category = tag_tr.xpath("./td[3]/text()").get()
  109. sold_for = tag_tr.xpath("./td[4]/text()").get()
  110. sold_for = sold_for.replace(",", "").replace("$", "") if sold_for else "0"
  111. data_dict = {
  112. "auction_url": auc_url,
  113. "lot_no": lot_no,
  114. "title": title,
  115. "detail_url": detail_url,
  116. "category": category,
  117. "sold_for": sold_for
  118. }
  119. # print(data_dict)
  120. info_list.append(data_dict)
  121. # save
  122. if info_list:
  123. sql_pool.insert_many(table="hugginsandscott_record", data_list=info_list, ignore=True)
  124. return len(info_list)
  125. def get_sold_list(log, auc_url, sql_pool):
  126. """
  127. 获取已售列表
  128. :param log: logger对象
  129. :param auc_url: 拍卖会URL
  130. :param sql_pool: MySQL连接池
  131. :return: 无
  132. """
  133. page = 1
  134. max_page = 10
  135. while page <= max_page:
  136. try:
  137. len_list = get_single_page(log, page, auc_url, sql_pool)
  138. except Exception as e:
  139. log.error(f"Error getting page {page}: {e}")
  140. continue
  141. if len_list < 10:
  142. log.warning(f"No data on page {page}, stopping further requests")
  143. break
  144. page += 1
  145. @retry(stop=stop_after_attempt(5), wait=wait_fixed(2), after=after_log)
  146. def get_all_auctions(log, sql_pool):
  147. """
  148. 获取所有拍卖会
  149. :param log: logger对象
  150. :param sql_pool: MySQL连接池
  151. :return: 无
  152. """
  153. log.info(">>>>>>>>>>>>>> 正在获取所有拍卖会 <<<<<<<<<<<<")
  154. # //div[@class="max-w-screen-xl px-4 py-8 mx-auto"]//a/@href
  155. url = 'https://hugginsandscott.com/search'
  156. response = requests.get(url, headers=headers, timeout=10, proxies=get_proxys(log))
  157. response.raise_for_status()
  158. selector = Selector(response.text)
  159. tag_a_list = selector.xpath("//div[@class='max-w-screen-xl px-4 py-8 mx-auto']//a/@href").getall()
  160. log.info(f"Total auctions: {len(tag_a_list)}")
  161. # 补全 https://hugginsandscott.com/
  162. tag_a_list = [f"https://hugginsandscott.com{tag_a}" for tag_a in tag_a_list]
  163. # 批量查询库中已存在的 URL,避免 N 次单条查询
  164. if tag_a_list:
  165. placeholders = ','.join(['%s'] * len(tag_a_list))
  166. sql = f"SELECT auction_url FROM hugginsandscott_record WHERE auction_url IN ({placeholders})"
  167. existed_rows = sql_pool.select_all(sql, tuple(tag_a_list))
  168. # 库中已存在的 URL 集合,便于 O(1) 查找
  169. existed_urls = {row[0] for row in existed_rows}
  170. # 用集合差集筛选出不在库中的新拍卖会 URL
  171. new_tag_a_list = [tag_a for tag_a in tag_a_list if tag_a not in existed_urls]
  172. else:
  173. new_tag_a_list = []
  174. log.info(f"Total auctions after filter: {len(new_tag_a_list)}")
  175. # 打印不在库中的新 URL 列表
  176. for auc_url in new_tag_a_list:
  177. try:
  178. log.info(f"Getting sold list for:{auc_url}...........................")
  179. get_sold_list(log, auc_url, sql_pool)
  180. except Exception as e:
  181. log.error(f"{inspect.currentframe().f_code.co_name} -> getting sold list for:{auc_url}: {e}")
  182. @retry(stop=stop_after_attempt(100), wait=wait_fixed(3600), after=after_log)
  183. def hug_main(log):
  184. """
  185. 主函数
  186. :param log: logger对象
  187. """
  188. log.info(
  189. f'开始运行 {inspect.currentframe().f_code.co_name} 爬虫任务....................................................')
  190. # 配置 MySQL 连接池
  191. sql_pool = MySQLConnectionPool(log=log)
  192. if not sql_pool:
  193. log.error("MySQL数据库连接失败")
  194. raise Exception("MySQL数据库连接失败")
  195. try:
  196. try:
  197. get_all_auctions(log, sql_pool)
  198. except Exception as e:
  199. log.error(f'Error getting all auctions list: {e}')
  200. # 更新详情页
  201. log.debug('Updating detail pages........................... started')
  202. # sql_result = sql_pool.select_all('select id, detail_url from hugginsandscott_record where state = 0')
  203. sql_result = sql_pool.select_all(
  204. 'select id, detail_url from hugginsandscott_record where state != 1 order by id')
  205. for row in sql_result:
  206. sql_id = row[0]
  207. detail_url = row[1]
  208. try:
  209. get_details(log, detail_url, sql_pool, sql_id)
  210. except Exception as e:
  211. log.error(f'Error getting details for {detail_url}: {e}')
  212. # 更新数据和状态
  213. sql_pool.update_one_or_dict(
  214. table="hugginsandscott_record",
  215. data={"state": 2},
  216. condition={"id": sql_id}
  217. )
  218. except Exception as e:
  219. log.error(f'{inspect.currentframe().f_code.co_name} error: {e}')
  220. finally:
  221. log.info(f'爬虫程序 {inspect.currentframe().f_code.co_name} 运行结束,等待下一轮的采集任务............')
  222. def schedule_task():
  223. """每半个月 跑一次增量"""
  224. hug_main(log=logger)
  225. def run_semimonthly():
  226. # 每月 1 号和 15 号执行(半月一次)
  227. from datetime import date
  228. if date.today().day in (1, 15):
  229. hug_main(log=logger)
  230. schedule.every().day.at("05:00").do(run_semimonthly)
  231. while True:
  232. schedule.run_pending()
  233. time.sleep(1)
  234. if __name__ == '__main__':
  235. # get_single_page(logger, 1, None)d
  236. # get_details(logger,
  237. # "https://hugginsandscott.com/auction/2026/Winter/1/1952-topps-311-mickey-mantle-high-number-psa-vg-3-mba-silver-diamond-centered",
  238. # None, 1)
  239. # hug_main(log=logger)
  240. schedule_task()
  241. # sql_pool_ = MySQLConnectionPool(log=logger)
  242. # get_all_auctions(logger, sql_pool_)