clove_lucky_bag_spider.py

# -*- coding: utf-8 -*-
# Author : Charley
# Python : 3.10.8
# Date : 2025/7/31 11:33
import re
import time
import requests
import inspect
import schedule
from loguru import logger
from parsel import Selector
from tenacity import retry, stop_after_attempt, wait_fixed
from mysql_pool import MySQLConnectionPool
from DrissionPage import ChromiumPage, ChromiumOptions

logger.remove()
logger.add("./logs/{time:YYYYMMDD}.log", encoding='utf-8', rotation="00:00",
           format="[{time:YYYY-MM-DD HH:mm:ss.SSS}] {level} {message}",
           level="DEBUG", retention="7 days")


def after_log(retry_state):
    """
    Retry callback for tenacity.
    :param retry_state: RetryCallState object
    """
    # Use the logger passed as the first positional argument if present,
    # otherwise fall back to the global logger.
    if retry_state.args:
        log = retry_state.args[0]
    else:
        log = logger
    if retry_state.outcome.failed:
        log.warning(
            f"Function '{retry_state.fn.__name__}', attempt {retry_state.attempt_number} failed")
    else:
        log.info(f"Function '{retry_state.fn.__name__}', attempt {retry_state.attempt_number} succeeded")


@retry(stop=stop_after_attempt(5), wait=wait_fixed(1), after=after_log)
def get_response(log, page_url) -> str:
    """
    Fetch the page source with a local Chromium instance.
    :param log: logger object
    :param page_url: page URL
    :return: page source (HTML string)
    """
    options = ChromiumOptions().set_paths(local_port=9131, user_data_path=r'D:\Drissionpage_temp\local_port_9131')
    # options.set_argument("--disable-gpu")
    options.set_argument("-accept-lang=en-US")
    page = ChromiumPage(options)
    try:
        page.get(page_url)
        # page.wait.load_start()  # wait for the page to start loading
        # page_title = page.title.lower()
        # if "just a moment" in page_title or "请稍候" in page_title:
        #     cf_bypasser = CloudflareBypasser(page, max_retries=5, log=log)
        #     cf_bypasser.bypass()
        #     page.wait.load_start()  # wait for the page to start loading
        html = page.html
        if html:
            # print(html)
            # parse_data(html)
            return html
        else:
            log.error('Page load failed')
            raise RuntimeError('Page load failed, reloading........')
    except Exception as e:
        log.error(f'get_response error: {e}')
        raise
    finally:
        page.quit()
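

# Usage sketch (an illustration, not part of the crawl flow): get_response drives a
# local Chromium instance on debug port 9131 with the user-data path configured above,
# e.g.
#   html = get_response(logger, "https://store.clove.jp/jp/categories/pokemon?page=1")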


def get_lucky_bag_list(log, category_name, sql_pool):
    """Walk the category listing pages until a page returns fewer than 72 items."""
    page = 1
    while page <= 500:
        try:
            log.debug(
                f'--------------- {inspect.currentframe().f_code.co_name}, page {page}, category_name {category_name} start ---------------')
            len_items = get_lucky_bag_single_page(log, category_name, page, sql_pool)
        except Exception as e:
            log.error(
                f"{inspect.currentframe().f_code.co_name} Request get_lucky_bag_single_page for page:{page}, {e}")
            len_items = 0
        if len_items < 72:
            log.debug(f'--------------- page {page} has {len_items} items, break ---------------')
            break
        page += 1
        # Throttle between pages to avoid querying too frequently
        # time.sleep(random.uniform(0.5, 1))


def get_lucky_bag_single_page(log, category_name, page, sql_pool):
    log.debug(
        f"{inspect.currentframe().f_code.co_name} Request category_name:{category_name}, page:{page}................")
    # url = "https://store.clove.jp/jp/categories/pokemon?page=1"
    url = f"https://store.clove.jp/jp/categories/{category_name}?page={page}"
    # response = requests.get(url, headers=headers)
    response_text = get_response(log, url)
    # Earlier list-parsing approach, kept for reference:
    # with open("clove_list.html", "w", encoding="utf-8") as f:
    #     f.write(response.text)
    # selector = Selector(text=response.text)
    # tag_li_list = selector.xpath('//div[@class="w-full"]/ul/li')
    # for tag_li in tag_li_list:
    #     # title = tag_li.xpath('./div/a/div/div[2]/div[1]/p[1]/font/font/text()').get()
    #     title = tag_li.xpath('./div/a/div/div[2]/div[1]/p[1]//text()').get()
    #     detail_href = tag_li.xpath('./div/a/@href').get()
    #     detail_href = 'https://store.clove.jp' + detail_href if detail_href else ""
    #     # Rewrite the image URL's w and q parameters to 1200 and 75, e.g.
    #     # src="https://store.clove.jp/_next/image?url=https%3A%2F%2Fstorage.googleapis.com%2Fclove-admin-public-resources%2Fcollectibles%2Fcm4aukaqi00k8s601pg63bli1&w=1200&q=75"
    #     # src=" /_next/image?url=https%3A%2F%2Fstorage.googleapis.com%2Fclove-admin-public-resources%2Fcollectibles%2Fcm4aukaqi00k8s601pg63bli1&w=3840&q=50"
    #     image_url = tag_li.xpath('./div/a/div/div[1]//img/@src').get()
    #     image_url = 'https://store.clove.jp' + image_url if image_url else ""
    #     image_url = image_url.replace("w=3840", "w=1200").replace("q=50", "q=75") if image_url else ""
    #     card_number = tag_li.xpath('./div/a/div/div[2]/div[1]/p[1]/font/font/text()').get()
    #     price = tag_li.xpath('./div/a/div/div[2]/div[2]/div/div/p[2]//text()').getall()
    #     price = "".join(price).strip() if price else None
    #     inventory = tag_li.xpath('./div/a/div/div[2]/div[2]/p/font[2]/font/text()').get()
    #     data_dict = {
    #         "category": category_name,
    #         "title": title,
    #         "detail_href": detail_href,
    #         "image_url": image_url,
    #         "card_number": card_number,
    #         "price": price,
    #         "inventory": inventory,
    #     }
    #     print(data_dict)
    selector = Selector(text=response_text)
    len_items = parse_products_from_html(log, selector, category_name, sql_pool)
    return len_items


def parse_products_from_html(log, selector, category, sql_pool):
    """Extract product info from the listing HTML with XPath and insert it into MySQL."""
    # Find all product preview divs
    tag_div_list = selector.xpath('//div[@data-sentry-component="ProductPreview"]')
    log.debug(f"Found {len(tag_div_list)} products")
    # # Fallbacks if no ProductPreview component is found:
    # if len(tag_div_list) == 0:
    #     # Try li elements as an alternative
    #     tag_div_list = selector.xpath('//ul[@data-testid="products-list-loader"]/li')
    #     print(f"Fallback found {len(tag_div_list)} product placeholders")
    #
    #     # If still nothing, try parsing product data out of the script tags
    #     if len(tag_div_list) == 0:
    #         print("No product elements found, trying to parse data from script tags...")
    #         # Use the existing parse_products_from_script function here
    #         return
    info_list = []
    for tag_div in tag_div_list:
        # Extract product info
        title = tag_div.xpath('.//p[@data-testid="product-title"]/text()').get()
        # subtitle may hold the card number, e.g. "117/139"
        subtitle = tag_div.xpath('.//p[contains(@class, "text-ui-fg-subtle")]/text()').get()
        detail_href = tag_div.xpath('./a/@href').get()
        detail_href = 'https://store.clove.jp' + detail_href if detail_href else ""
        # Extract the price - several candidate XPath paths were tried
        # price_elements = tag_div.xpath('.//p[contains(text(), "¥")]/following-sibling::p[1]//text()').getall()
        # price_elements = tag_div.xpath('.//p[contains(@class, "font-bold") and contains(@class, "text-base")]//text()').getall()
        price_elements = tag_div.xpath(
            './/div[@data-sentry-component="PreviewPrice"]//p//text() | .//a//div[@data-sentry-component="PreviewPrice"]//text()'
        ).getall()
        price = ''.join(price_elements).strip() if price_elements else None
        # Clean the price: strip the currency symbol and commas
        if price:
            price = re.sub(r'[¥,]', '', price).strip()
        # Extract the inventory, with some error handling
        inventory_text = tag_div.xpath('.//p[contains(text(), "在庫数")]//text()').getall()
        inventory = None
        if inventory_text and len(inventory_text) > 1:
            inventory = inventory_text[1].strip()
        elif inventory_text:
            # Only one text node found: pull the digits out of it
            inventory_full_text = ''.join(inventory_text)
            inventory_match = re.search(r'\d+', inventory_full_text)
            if inventory_match:
                inventory = inventory_match.group()
        # Extract the image URL
        image_url = tag_div.xpath('.//div[@data-sentry-component="ImageOrPlaceholder"]//img/@src').get()
        image_url = 'https://store.clove.jp' + image_url if image_url else ""
        image_url = image_url.replace("w=3840", "w=1200").replace("q=50", "q=75") if image_url else ""
        data_dict = {
            "title": title,
            "subtitle": subtitle,  # card number info such as "117/139"
            "detail_href": detail_href,
            "image_url": image_url,
            "price": price,
            "inventory": inventory,  # stock
            "category": category
        }
        # print(data_dict)
        info_list.append(data_dict)
    # Insert into the database
    if info_list:
        try:
            sql_pool.insert_many(table="clove_lucky_bag_list_record", data_list=info_list, ignore=True)
            # sql = "INSERT IGNORE INTO clove_lucky_bag_list_record (pid, category, title, price, publish_status, quantity, remaining, image, sub_image, open_at) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
            # sql_pool.insert_all(sql, info_list)
        except Exception as e:
            log.warning(f"{inspect.currentframe().f_code.co_name}, {str(e)[:500]}")
    return len(tag_div_list)
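

# A minimal sketch of the clove_lucky_bag_list_record table this spider writes to.
# The columns below are inferred from the dicts and SQL used in this file, not taken
# from an actual DDL, so names, types, and the unique key are assumptions:
#
#   CREATE TABLE IF NOT EXISTS clove_lucky_bag_list_record (
#       id               INT AUTO_INCREMENT PRIMARY KEY,
#       category         VARCHAR(64),
#       title            VARCHAR(255),
#       subtitle         VARCHAR(64),
#       detail_href      VARCHAR(512),
#       image_url        VARCHAR(1024),
#       price            VARCHAR(32),
#       inventory        VARCHAR(32),
#       first_inventory  VARCHAR(32),  first_price  VARCHAR(32),
#       second_inventory VARCHAR(32),  second_price VARCHAR(32),
#       third_inventory  VARCHAR(32),  third_price  VARCHAR(32),
#       state            TINYINT DEFAULT 0,
#       UNIQUE KEY uk_detail_href (detail_href)
#   );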


def get_detail(log, sql_id, detail_url, sql_pool):
    """Fetch a product detail page and update the price/inventory tiers for the row."""
    headers = {
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
        "referer": "https://store.clove.jp/jp/categories/lorcana",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/137.0.0.0 Safari/537.36"
    }
    # url = "https://store.clove.jp/jp/products/cm8cv711v00b4s60197g7tikc"
    response = requests.get(detail_url, headers=headers)
    # print(response.status_code)
    selector = Selector(text=response.text)
    # Each of the first three offers in the "grid gap-y-3" block carries its own inventory and price
    # tag_div_list = selector.xpath('//div[@class="grid gap-y-3"]/div')
    # for tag_div in tag_div_list:
    first_inventory = selector.xpath('//div[@class="grid gap-y-3"]/div[1]/div[1]/p/text()').getall()
    first_inventory = ''.join(first_inventory).strip() if first_inventory else None
    first_inventory = first_inventory.replace('在庫: ', '').replace('点', '') if first_inventory else None
    first_price = selector.xpath('//div[@class="grid gap-y-3"]/div[1]//span[@data-testid="product-price"]/text()').get()
    second_inventory = selector.xpath('//div[@class="grid gap-y-3"]/div[2]/div[1]/p/text()').getall()
    second_inventory = ''.join(second_inventory).strip() if second_inventory else None
    second_inventory = second_inventory.replace('在庫: ', '').replace('点', '') if second_inventory else None
    second_price = selector.xpath(
        '//div[@class="grid gap-y-3"]/div[2]//span[@data-testid="product-price"]/text()').get()
    third_inventory = selector.xpath('//div[@class="grid gap-y-3"]/div[3]/div[1]/p/text()').getall()
    third_inventory = ''.join(third_inventory).strip() if third_inventory else None
    third_inventory = third_inventory.replace('在庫: ', '').replace('点', '') if third_inventory else None
    third_price = selector.xpath('//div[@class="grid gap-y-3"]/div[3]//span[@data-testid="product-price"]/text()').get()
    data_dict = {
        "first_inventory": first_inventory,
        "first_price": first_price,
        "second_inventory": second_inventory,
        "second_price": second_price,
        "third_inventory": third_inventory,
        "third_price": third_price
    }
    # print(data_dict)
    try:
        sql_pool.update_one_or_dict(table="clove_lucky_bag_list_record", data=data_dict, condition={"id": sql_id})
        # Mark the row as processed (state = 1)
        # sql_pool.update_one(table="clove_lucky_bag_list_record", data={"state": 1}, condition={"pid": sql_id})
        sql_pool.update_one(f"update clove_lucky_bag_list_record set state=1 where id={sql_id}")
    except Exception as e:
        log.warning(f"{inspect.currentframe().f_code.co_name}, {str(e)[:500]}")
        # Mark the row as failed (state = 3)
        # sql_pool.update_one_or_dict(table="clove_lucky_bag_list_record", data={"state": 3},
        #                             condition={"id": sql_id})
        sql_pool.update_one(f"update clove_lucky_bag_list_record set state=3 where id={sql_id}")


@retry(stop=stop_after_attempt(100), wait=wait_fixed(3600), after=after_log)
def lucky_main(log):
    """
    Main entry point for one crawl run.
    :param log: logger object
    """
    log.info(
        f'Starting the {inspect.currentframe().f_code.co_name} spider task.................................................')
    # Set up the MySQL connection pool
    sql_pool = MySQLConnectionPool(log=log)
    if not sql_pool.check_pool_health():
        log.error("Database connection pool is unhealthy")
        raise RuntimeError("Database connection pool is unhealthy")
    try:
        category_list = ["pokemon", "onepiece", "duel-masters", "lorcana", "fab"]
        for category in category_list:
            try:
                get_lucky_bag_list(log, category, sql_pool)
            except Exception as e2:
                log.error(f"Request get_lucky_bag_list error: {e2}")
        # Fetch detail pages for rows that have not been processed yet
        sql_result = sql_pool.select_all("select id, detail_href from clove_lucky_bag_list_record where state=0")
        if sql_result and not isinstance(sql_result, Exception):
            for row in sql_result:
                try:
                    pid, detail_href = row
                    log.debug(f"{inspect.currentframe().f_code.co_name} fetching details for pid: {pid}..............")
                    get_detail(log, pid, detail_href, sql_pool)
                except Exception as e:
                    log.error(f"Request get_detail error: {e}")
    except Exception as e:
        log.error(f'{inspect.currentframe().f_code.co_name} error: {e}')
    finally:
        log.info(f'Spider {inspect.currentframe().f_code.co_name} finished; waiting for the next crawl round............')


def schedule_task():
    """
    Entry point for running the spider module on a schedule.
    """
    # Run the task once immediately
    lucky_main(log=logger)
    # Then schedule it to run every day at 00:01
    schedule.every().day.at("00:01").do(lucky_main, log=logger)
    while True:
        schedule.run_pending()
        time.sleep(1)


if __name__ == '__main__':
    # get_lucky_bag_single_page(logger, 'pokemon')
    # lucky_main(log=logger)
    # get_detail(log=logger)
    schedule_task()