courtyard_spider.py

# -*- coding: utf-8 -*-
# Author : Charley
# Python : 3.10.8
# Date : 2025/12/2 14:08
import threading
import time
import inspect
import requests
import schedule
import user_agent
from loguru import logger
from parsel import Selector
from datetime import datetime
from mysql_pool import MySQLConnectionPool
from DrissionPage import ChromiumPage, ChromiumOptions
from tenacity import retry, stop_after_attempt, wait_fixed
  17. """
  18. 扣驾的
  19. """
  20. logger.remove()
  21. logger.add("./logs/{time:YYYYMMDD}.log", encoding='utf-8', rotation="00:00",
  22. format="[{time:YYYY-MM-DD HH:mm:ss.SSS}] {level} {message}",
  23. level="DEBUG", retention="7 day")

headers = {
    "accept": "application/json",
    "referer": "https://courtyard.io/",
    "user-agent": user_agent.generate_user_agent()  # random UA, fixed at import time
}

# Global flag marking whether the first detail run has completed
detail_first_run_completed = False

def after_log(retry_state):
    """
    tenacity retry callback.
    :param retry_state: RetryCallState object
    """
    # Use the logger passed as the first positional argument, if any
    if retry_state.args and len(retry_state.args) > 0:
        log = retry_state.args[0]
    else:
        log = logger  # fall back to the global logger
    if retry_state.outcome.failed:
        log.warning(f"Function '{retry_state.fn.__name__}', attempt {retry_state.attempt_number} failed")
    else:
        log.info(f"Function '{retry_state.fn.__name__}', attempt {retry_state.attempt_number} succeeded")

@retry(stop=stop_after_attempt(5), wait=wait_fixed(1), after=after_log)
def get_proxys(log):
    """
    Build the proxy configuration (KDL tunnel proxy).
    :return: proxies dict for requests
    """
    tunnel = "x371.kdltps.com:15818"
    kdl_username = "t13753103189895"
    kdl_password = "o0yefv6z"
    try:
        proxies = {
            "http": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": kdl_username, "pwd": kdl_password, "proxy": tunnel},
            "https": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": kdl_username, "pwd": kdl_password, "proxy": tunnel}
        }
        return proxies
    except Exception as e:
        log.error(f"Error getting proxy: {e}")
        raise
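
# Usage sketch: the returned dict plugs straight into requests; the tunnel and
# credentials are the ones hard-coded above.
#   proxies = get_proxys(logger)
#   requests.get("https://api.courtyard.io/vending-machines",
#                headers=headers, proxies=proxies, timeout=22)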

@retry(stop=stop_after_attempt(5), wait=wait_fixed(1), after=after_log)
def get_goods_list(log, sql_pool):
    """
    Fetch the product (vending machine) list.
    :param log: logger object
    :param sql_pool: MySQL connection pool object
    :return:
    """
    log.info("========================== Fetching product list ==========================")
    url = "https://api.courtyard.io/vending-machines"
    response = requests.get(url, headers=headers, timeout=22)
    response.raise_for_status()
    vending_machines = response.json().get("vendingMachines", [])
    for item in vending_machines:
        bag_id = item.get("id")
        bag_title = item.get("title")
        # Also available but unused: sealedPackAnimation, sealedPackImage
        category_title = item.get("category", {}).get("title")
        price = item.get("saleDetails", {}).get("salePriceUsd")
        data_dict = {
            "bag_id": bag_id,
            "bag_title": bag_title,
            "category": category_title,
            "price": price
        }
        try:
            get_goods_detail(log, data_dict, sql_pool)
        except Exception as e:
            log.error(f"Error processing item: {e}")
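
# For reference, a minimal sketch of the /vending-machines response shape as
# implied by the keys read above (values are illustrative; "pkmn-basic-pack"
# is the example id used in get_goods_detail):
#   {"vendingMachines": [{"id": "pkmn-basic-pack", "title": "...",
#                         "category": {"title": "..."},
#                         "saleDetails": {"salePriceUsd": 99.0}}]}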

@retry(stop=stop_after_attempt(5), wait=wait_fixed(1), after=after_log)
def get_goods_detail(log, query_dict: dict, sql_pool=None):
    """
    Fetch product details (recent pulls for one vending machine).
    :param log: logger object
    :param query_dict: dict with bag_id / bag_title / category / price
    :param sql_pool: MySQL connection pool object
    :return:
    """
    log.info("========================== Fetching product details ==========================")
    url = "https://api.courtyard.io/index/query/recent-pulls"
    params = {
        "limit": "250",
        "vendingMachineIds": query_dict["bag_id"]  # e.g. "pkmn-basic-pack"
    }
    response = requests.get(url, headers=headers, params=params, timeout=22)
    response.raise_for_status()
    pulls = response.json().get("assets", [])
    info_list = []
    for item in pulls:
        detail_title = item.get("title")
        detail_id = item.get("proof_of_integrity")
        if not detail_id:
            log.error(f"Malformed record, detail_id: {detail_id}")
            continue
        asset_pictures = item.get("asset_pictures", [])
        img_front = asset_pictures[0] if len(asset_pictures) > 0 else None
        img_back = asset_pictures[1] if len(asset_pictures) > 1 else None
        data_dict = {
            "bag_id": query_dict["bag_id"],
            "bag_title": query_dict["bag_title"],
            "category": query_dict["category"],
            "price": query_dict["price"],
            "detail_id": detail_id,
            "detail_title": detail_title,
            "img_front": img_front,
            "img_back": img_back,
        }
        info_list.append(data_dict)
    # Save the data
    if info_list:
        log.info(f"Fetched product details, {len(info_list)} records")
        sql_pool.insert_many(table="courtyard_list_record", data_list=info_list, ignore=True)
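
# Assumed schema for courtyard_list_record (mysql_pool is not shown here):
# insert_many(..., ignore=True) implies a unique key on detail_id, and
# get_sale_detail_list below expects a `state` column defaulting to 0.
# Hypothetical DDL sketch:
#   CREATE TABLE courtyard_list_record (
#       id INT AUTO_INCREMENT PRIMARY KEY,
#       bag_id VARCHAR(64), bag_title VARCHAR(255), category VARCHAR(64),
#       price DECIMAL(12, 2), detail_id CHAR(64) UNIQUE,
#       detail_title VARCHAR(255), img_front TEXT, img_back TEXT,
#       state TINYINT NOT NULL DEFAULT 0
#   );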

def convert_time_format(time_str):
    """
    Convert a time string to the standard format.
    :param time_str: raw time string, e.g. "December 3, 2025 at 4:29 PM"
    :return: standard time string, e.g. "2025-12-03 16:29:00"
    """
    if not time_str:
        return None
    try:
        dt_obj = datetime.strptime(time_str, "%B %d, %Y at %I:%M %p")
        return dt_obj.strftime("%Y-%m-%d %H:%M:%S")
    except ValueError as e:
        logger.warning(f"Time conversion failed: {time_str}, error: {e}")
        return None
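
# Example: convert_time_format("December 3, 2025 at 4:29 PM") -> "2025-12-03 16:29:00"
# Note: %B and %p are parsed under the current locale, so this assumes an
# English locale; elsewhere strptime raises ValueError and None is returned.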

@retry(stop=stop_after_attempt(5), wait=wait_fixed(1), after=after_log)
def get_sale_detail_single_page(log, page, sql_id, detail_id, sql_pool=None):
    """
    Fetch the <sale> detail for a single asset page.
    :param log: logger object
    :param page: ChromiumPage object
    :param sql_id: database row id
    :param detail_id: asset detail id
    :param sql_pool: MySQL connection pool object
    :return:
    """
    log.info(f"========================== Fetching <sale> detail, sql_id: {sql_id} ==========================")
    # e.g. https://courtyard.io/asset/a4f0bbebd858370567f1779fddf0f55630810116d80965e33940fc8ff5ac94b4
    page_url = f"https://courtyard.io/asset/{detail_id}"
    page.get(page_url)
    page.wait.load_start()
    log.debug(f'{inspect.currentframe().f_code.co_name} -> page loaded, url: {page_url}')
    html = page.html
    if not html:
        log.error(f'{inspect.currentframe().f_code.co_name} -> page failed to load...........')
        raise Exception('Page failed to load, retrying........')  # raise so the retry decorator fires
    selector = Selector(text=html)
    # Method 1 (preferred): match by text content -- take the first span containing ":"
    correlation_id = selector.xpath('//span[contains(text(), ":")]/text()').get()
    # Method 2 (fallback): locate by structure, relative to the cgccards.com link
    if not correlation_id:
        correlation_id = selector.xpath(
            '//a[contains(@href, "cgccards.com")]/preceding-sibling::span[1]/text()').get()
    # Initialize all possible fields to None
    data_dict = {"detail_id": detail_id, "correlation_id": correlation_id, "burn_from": None,
                 "burn_to": None, "burn_time": None, "sale_price": None, "sale_from": None, "sale_to": None,
                 "sale_time": None, "mint_price": None, "mint_from": None, "mint_to": None, "mint_time": None}
    # Extract each activity entry: the divs following the "Activity history" heading
    activity_div = selector.xpath('//h6[text()="Activity history"]/following-sibling::div[1]/div')
    for tag_div in activity_div:
        tag_name = tag_div.xpath('./div[1]/div/span/text()').get()
        if not tag_name:
            continue
        if tag_name == "Burn":
            data_dict["burn_from"] = tag_div.xpath('./div[2]/div[1]//h6/text()').get()
            data_dict["burn_to"] = tag_div.xpath('./div[2]/div[2]//h6/text()').get()
            data_dict["burn_time"] = tag_div.xpath('./div[2]/div[3]/span/@aria-label').get()
            # Normalize timestamps like "December 3, 2025 at 4:29 PM"
            data_dict["burn_time"] = convert_time_format(data_dict["burn_time"])
        elif tag_name == "Sale":
            sale_price = tag_div.xpath('./div[2]/span/text()').get()
            if sale_price:
                sale_price = sale_price.replace("$", "").replace(",", "")
            data_dict["sale_price"] = sale_price
            data_dict["sale_from"] = tag_div.xpath('./div[3]/div[1]//h6/text()').get()
            data_dict["sale_to"] = tag_div.xpath('./div[3]/div[2]//h6/text()').get()
            data_dict["sale_time"] = tag_div.xpath('./div[3]/div[3]/span/@aria-label').get()
            data_dict["sale_time"] = convert_time_format(data_dict["sale_time"])
        elif tag_name == "Mint":
            mint_price = tag_div.xpath('./div[2]/span/text()').get()
            if mint_price:
                mint_price = mint_price.replace("$", "").replace(",", "")
            data_dict["mint_price"] = mint_price
            data_dict["mint_from"] = tag_div.xpath('./div[3]/div[1]//h6/text()').get()
            data_dict["mint_to"] = tag_div.xpath('./div[3]/div[2]//h6/text()').get()
            data_dict["mint_time"] = tag_div.xpath('./div[3]/div[3]/span/@aria-label').get()
            data_dict["mint_time"] = convert_time_format(data_dict["mint_time"])
    # Save the data
    sql_pool.insert_one_or_dict(table="courtyard_detail_record", data=data_dict, ignore=True)
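
# Resulting row sketch (activity fields stay None when the page lacks that
# entry; the correlation label format shown is an assumption):
#   {"detail_id": "a4f0bbeb...", "correlation_id": "Cert #: 1234567",
#    "sale_price": "120.00", "sale_time": "2025-12-03 16:29:00", ...}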

@retry(stop=stop_after_attempt(5), wait=wait_fixed(1), after=after_log)
def get_sale_detail_list(log, sql_pool=None):
    """
    Fetch <sale> details for every pending asset.
    :param log: logger object
    :param sql_pool: MySQL connection pool object
    :return:
    """
    log.info("========================== Fetching <sale> detail LIST ==========================")
    options = ChromiumOptions()
    options.set_paths(local_port=9138, user_data_path=r'D:\Drissionpage_temp\courtyard_port_9138')
    # options.set_proxy("http://" + tunnel)
    # options.auto_port(True)
    options.no_imgs(True)
    # Maximize the window
    options.set_argument("--start-maximized")
    options.set_argument("--disable-gpu")
    options.set_argument("--accept-lang=en-US")
    page = ChromiumPage(options)
    try:
        sql_detail_id_list = sql_pool.select_all("SELECT id, detail_id FROM courtyard_list_record WHERE state = 0")
        for sql_id, detail_id in sql_detail_id_list:
            try:
                get_sale_detail_single_page(log, page, sql_id, detail_id, sql_pool)
                sql_pool.update_one("UPDATE courtyard_list_record SET state = 1 WHERE id = %s", (sql_id,))
            except Exception as e:
                log.error(f'get_sale_detail_single_page error: {e}')
                sql_pool.update_one("UPDATE courtyard_list_record SET state = 2 WHERE id = %s", (sql_id,))
    except Exception as e:
        log.error(f'get_response error: {e}')
        raise RuntimeError('get_response error') from e  # raising a bare string is invalid in Python 3
    finally:
        page.quit()
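
# State machine for courtyard_list_record.state: 0 = pending, 1 = scraped,
# 2 = failed. Failed rows are not retried automatically; a manual requeue
# would be (sketch):
#   UPDATE courtyard_list_record SET state = 0 WHERE state = 2;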

@retry(stop=stop_after_attempt(100), wait=wait_fixed(3600), after=after_log)
def list_main(log):
    """
    Main entry for the vending-machine list crawl.
    :param log: logger object
    """
    log.info(
        f'Starting crawler task {inspect.currentframe().f_code.co_name}....................................................')
    start = time.time()
    # Configure the MySQL connection pool
    sql_pool = MySQLConnectionPool(log=log)
    if not sql_pool.check_pool_health():
        log.error("Database connection pool unhealthy")
        raise RuntimeError("Database connection pool unhealthy")
    try:
        try:
            log.debug('------------------- Fetching product list -------------------')
            get_goods_list(log, sql_pool)
        except Exception as e:
            log.error(f'get_goods_list error: {e}')
    except Exception as e:
        log.error(f'{inspect.currentframe().f_code.co_name} error: {e}')
    finally:
        log.info(f'Crawler {inspect.currentframe().f_code.co_name} finished; waiting for the next round............')
        end = time.time()
        elapsed_time = end - start
        log.info(f'============================== Run time: {elapsed_time:.2f} s ===============================')
    return elapsed_time

@retry(stop=stop_after_attempt(100), wait=wait_fixed(3600), after=after_log)
def detail_main(log):
    """
    Main entry for the asset detail crawl.
    :param log: logger object
    """
    log.info(
        f'Starting crawler task {inspect.currentframe().f_code.co_name}....................................................')
    # Configure the MySQL connection pool
    sql_pool = MySQLConnectionPool(log=log)
    if not sql_pool.check_pool_health():
        log.error("Database connection pool unhealthy")
        raise RuntimeError("Database connection pool unhealthy")
    global detail_first_run_completed
    try:
        # Fetch the detail pages
        try:
            log.debug('------------------- Fetching product detail data -------------------')
            get_sale_detail_list(log, sql_pool)
        except Exception as e:
            log.error(f'get_sale_detail_list error: {e}')
    except Exception as e:
        log.error(f'{inspect.currentframe().f_code.co_name} error: {e}')
    finally:
        detail_first_run_completed = True
        log.info(f'Crawler {inspect.currentframe().f_code.co_name} finished; waiting for the next round............')

def control_list_mask(log):
    """
    Drive the list crawler task on a 5-minute cycle.
    :param log: logger object
    """
    while True:
        log.info(
            f'--------------------- Starting a new round of {inspect.currentframe().f_code.co_name} ---------------------')
        elapsed_time = list_main(log)
        # Compute the remaining wait for a 300-second round
        wait_time = max(0, 300 - int(elapsed_time))
        if wait_time > 0:
            log.info(f"Run took {elapsed_time:.2f} s (< 5 min); waiting {wait_time} s before the next round")
            time.sleep(wait_time)
        else:
            log.info("Run took >= 5 min; starting the next round immediately")
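
# Cadence example: if list_main takes 120 s, the loop sleeps 300 - 120 = 180 s,
# so a round starts roughly every 5 minutes; runs over 300 s restart immediately.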

def scheduled_detail_main(log):
    """Wrapper called by the scheduler; runs detail_main only after the first run has completed."""
    global detail_first_run_completed
    if detail_first_run_completed:
        detail_main(log)
    else:
        log.info("Skipping scheduled task as first run is not completed yet")

def run_threaded(job_func, *args, **kwargs):
    """
    Run the given function in a new thread, forwarding its arguments.
    :param job_func: target function to run
    :param args: positional arguments
    :param kwargs: keyword arguments
    """
    job_thread = threading.Thread(target=job_func, args=args, kwargs=kwargs)
    job_thread.start()

def schedule_task():
    """
    Set up the scheduled tasks.
    """
    # Start the control_list_mask worker thread
    list_thread = threading.Thread(target=control_list_mask, args=(logger,))
    list_thread.daemon = True  # daemon thread: ends automatically when the main program exits
    list_thread.start()
    # Start the detail_main worker thread (first run)
    detail_thread = threading.Thread(target=detail_main, args=(logger,))
    detail_thread.daemon = True
    detail_thread.start()
    # Daily scheduled task
    schedule.every().day.at("00:01").do(run_threaded, scheduled_detail_main, logger)
    while True:
        schedule.run_pending()
        time.sleep(1)
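
# Design note: wrapping scheduled_detail_main in run_threaded keeps the
# schedule loop responsive -- a long detail crawl runs in its own thread
# instead of blocking schedule.run_pending().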

if __name__ == '__main__':
    schedule_task()
    # detail_main(log=logger)  # debug entry point