# jp_pokemon_card_spider.py
# -*- coding: utf-8 -*-
# Author : Charley
# Python : 3.10.8
# Date : 2025/8/19 15:58
  5. import re
  6. import json
  7. import inspect
  8. import requests
  9. import user_agent
  10. from loguru import logger
  11. from parsel import Selector
  12. from mysql_pool import MySQLConnectionPool
  13. from tenacity import retry, stop_after_attempt, wait_fixed
# logger.remove()
# logger.add("./logs/{time:YYYYMMDD}.log", encoding='utf-8', rotation="00:00",
#            format="[{time:YYYY-MM-DD HH:mm:ss.SSS}] {level} {message}",
#            level="DEBUG", retention="7 day")
  18. crawler_language = "jp"
  19. class JPPokemonCardSpider:
  20. # 类变量
  21. BASE_URL = "https://www.pokemon-card.com"
  22. def __init__(self, log=None):
  23. self.log = log or logger
  24. self.headers = {
  25. "accept": "application/json, text/javascript, */*; q=0.01",
  26. "user-agent": user_agent.generate_user_agent()
  27. }
  28. @staticmethod
  29. def _after_log(retry_state):
  30. """
  31. retry 回调 - 静态方法
  32. :param retry_state: RetryCallState 对象
  33. """
  34. # 检查 args 是否存在且不为空
  35. if retry_state.args and len(retry_state.args) > 0:
  36. log = retry_state.args[0] # 获取传入的 logger
  37. else:
  38. log = logger # 使用全局 logger
  39. if retry_state.outcome.failed:
  40. log.warning(
  41. f"Function '{retry_state.fn.__name__}', Attempt {retry_state.attempt_number} Times")
  42. else:
  43. log.info(f"Function '{retry_state.fn.__name__}', Attempt {retry_state.attempt_number} succeeded")
  44. @staticmethod
  45. def _parse_html_json(html_content, log):
  46. """
  47. 解析网页源码, 获取json数据 - 静态方法
  48. :param html_content: 网页源码
  49. :param log: logger对象
  50. :return: json数据
  51. """
  52. log.debug(f"{inspect.currentframe().f_code.co_name} start.....................")
  53. # 查找PTC.master.uiData的开始位置
  54. start_marker = 'PTC.master.uiData = '
  55. start_pos = html_content.find(start_marker)
  56. if start_pos == -1:
  57. log.debug("PTC.master.uiData not found")
  58. return None
  59. # 从开始标记后的位置查找第一个 {
  60. start_pos = html_content.find('{', start_pos + len(start_marker))
  61. if start_pos == -1:
  62. log.debug("Opening brace not found")
  63. return None
  64. # 查找匹配的大括号
  65. brace_count = 1
  66. pos = start_pos + 1
  67. while pos < len(html_content) and brace_count > 0:
  68. if html_content[pos] == '{':
  69. brace_count += 1
  70. elif html_content[pos] == '}':
  71. brace_count -= 1
  72. pos += 1
  73. if brace_count == 0:
  74. # 提取完整的JavaScript对象
  75. js_object_str = html_content[start_pos:pos]
  76. # 转换为有效的JSON
  77. try:
  78. # 添加引号到键,移除注释等
  79. json_str = js_object_str
  80. # 为键添加引号
  81. json_str = re.sub(r'([{,])\s*([a-zA-Z_][a-zA-Z0-9_]*)\s*:', r'\1 "\2":', json_str)
  82. # 移除注释
  83. json_str = re.sub(r'//.*?(\n|$)', r'\1', json_str)
  84. json_str = re.sub(r'/\*.*?\*/', '', json_str, flags=re.DOTALL)
  85. # 移除尾随逗号
  86. json_str = re.sub(r',(\s*[}\]])', r'\1', json_str)
  87. ui_data = json.loads(json_str)
  88. return ui_data
  89. except json.JSONDecodeError as e:
  90. log.error(f"{inspect.currentframe().f_code.co_name}Failed to decode JSON: {e}")
  91. else:
  92. log.warning(f"{inspect.currentframe().f_code.co_name} Could not find complete JavaScript object")
  93. return None
  94. @retry(stop=stop_after_attempt(5), wait=wait_fixed(1), after=_after_log)
  95. def get_classification_list(self, sql_pool):
  96. """
  97. 获取分类列表
  98. :param sql_pool: MySQL连接池对象
  99. """
  100. self.log.debug(f"{inspect.currentframe().f_code.co_name} start.....................")
  101. url = f"{self.BASE_URL}/card-search/index.php"
  102. params = {
  103. "keyword": "",
  104. "se_ta": "",
  105. "regulation_sidebar_form": "all",
  106. "pg": "",
  107. "illust": "",
  108. "sm_and_keyword": "true"
  109. }
  110. response = requests.get(url, headers=self.headers, params=params, timeout=10)
  111. json_ui_data = self._parse_html_json(response.text, self.log)
  112. pg_list = json_ui_data.get("pg", {}).get("list", [])
  113. if pg_list:
  114. for pg in pg_list[1:]:
  115. pg_value = pg.get("value")
  116. pg_label = pg.get("label")
  117. self.log.info(f"pg_list -> pg_value:{pg_value}, pg_label: {pg_label}")
  118. self.get_pokemon_card_list(pg_value, pg_label, sql_pool)
  119. else:
  120. self.log.debug(
  121. f"{inspect.currentframe().f_code.co_name} NOt found pg_list !!! get_classification_list end.....................")
  122. @retry(stop=stop_after_attempt(5), wait=wait_fixed(1), after=_after_log)
  123. def _get_pokemon_card_single_page(self, pg_value, page=1):
  124. """
  125. 获取指定分类和页码的卡片列表
  126. """
  127. self.log.debug(f"{inspect.currentframe().f_code.co_name} start.....................")
  128. url = f"{self.BASE_URL}/card-search/resultAPI.php"
  129. params = {
  130. "keyword": "",
  131. "se_ta": "",
  132. "regulation_sidebar_form": "all",
  133. "pg": pg_value,
  134. "illust": "",
  135. "sm_and_keyword": "true",
  136. "page": str(page)
  137. }
  138. response = requests.get(url, headers=self.headers, params=params, timeout=10)
  139. response.raise_for_status()
  140. resp_json = response.json()
  141. return resp_json
  142. def _parse_pokemon_card_list(self, card_list, pg_value, pg_label, sql_pool):
  143. """
  144. 解析卡片列表,获取卡片信息
  145. """
  146. self.log.debug(f"{inspect.currentframe().f_code.co_name} start.....................")
  147. card_info_list = []
  148. for card in card_list:
  149. card_id = card.get("cardID")
  150. card_name = card.get("cardNameViewText")
  151. if '<span' in card_name:
  152. card_name = card_name.replace('<span class="pcg pcg-prismstar"></span>', '').replace(
  153. '<span class="pcg pcg-megamark"></span>', '')
  154. card_thumb = card.get("cardThumbFile")
  155. card_thumb = f'{self.BASE_URL}{card_thumb}'
  156. data_dict = {
  157. "card_id": card_id,
  158. "card_name": card_name,
  159. "img": card_thumb,
  160. "pg_value": pg_value,
  161. "pg_label": pg_label,
  162. "crawler_language": crawler_language
  163. }
  164. card_info_list.append(data_dict)
  165. if card_info_list:
  166. sql_pool.insert_many(table="pokemon_card_record", data_list=card_info_list, ignore=True)
  167. def get_pokemon_card_list(self, pg_value='', pg_label='', sql_pool=None):
  168. """
  169. 获取指定分类下的所有卡片列表 翻页
  170. """
  171. self.log.debug(f"{inspect.currentframe().f_code.co_name} start.....................")
  172. page = 1
  173. max_page = 1
  174. while page <= max_page:
  175. self.log.debug(f"正在获取第 {page} 页数据, pg_label: {pg_label} .........")
  176. page_data = self._get_pokemon_card_single_page(pg_value, page)
  177. if page_data.get("result") == 1:
  178. # 更新max_page(仅在第一页时需要更新)
  179. if page == 1:
  180. max_page = page_data.get("maxPage", 1)
  181. self.log.info(f"分类 {pg_label} 共有 {max_page} 页数据")
  182. cardList = page_data.get("cardList", [])
  183. if not cardList:
  184. self.log.warning(f"{inspect.currentframe().f_code.co_name} NOt found cardList !!!")
  185. break
  186. try:
  187. self._parse_pokemon_card_list(cardList, pg_value, pg_label, sql_pool)
  188. except Exception as e:
  189. self.log.error(f"{inspect.currentframe().f_code.co_name} parse_pokemon_card_list error: {e}")
  190. if len(cardList) < 39:
  191. self.log.debug(f"{inspect.currentframe().f_code.co_name} 获取的卡片数量小于39 !!! 停止翻页")
  192. break
  193. if page >= max_page:
  194. self.log.debug(
  195. f"{inspect.currentframe().f_code.co_name} -> page: {page}, max_page: {max_page}, 停止翻页")
  196. break
  197. if page >= 10:
  198. self.log.debug(
  199. f"{inspect.currentframe().f_code.co_name} -> page: {page}, page >= 10, 停止翻页.......")
  200. break
  201. page += 1
  202. else:
  203. self.log.warning(f"获取第 {page} 页数据失败: {page_data.get('errMsg')}")
  204. break
  205. def get_details(self, item_id, sql_pool):
  206. """
  207. 获取商品详情
  208. """
  209. self.log.debug(f"{inspect.currentframe().f_code.co_name} start, item_id: {item_id}.....................")
  210. url = f'{self.BASE_URL}/card-search/details.php/card/{item_id}'
  211. response = requests.get(url, headers=self.headers, timeout=10)
  212. response.raise_for_status()
  213. selector = Selector(response.text)
  214. card_no_list = selector.xpath('//div[@class="subtext Text-fjalla"]/text()').getall()
  215. card_no = ''.join(card_no_list)
  216. card_no = card_no.strip().replace('\xa0', '') if card_no else None
  217. tag_ic_rare = selector.xpath(
  218. '//div[@class="subtext Text-fjalla"]/img[not(contains(@class, "img-regulation"))]/@src').get()
  219. ic_rare_sp = tag_ic_rare.split('/')[-1].split('.')[0] if tag_ic_rare else None
  220. if ic_rare_sp and '_' in ic_rare_sp:
  221. ic_rare = ic_rare_sp.split('_')[-1]
  222. else:
  223. ic_rare = ic_rare_sp
  224. data_dict = {
  225. "card_no": card_no,
  226. "rarity": ic_rare
  227. }
  228. sql_pool.update_one_or_dict(
  229. table="pokemon_card_record",
  230. data=data_dict,
  231. condition={"card_id": item_id}
  232. )
  233. @retry(stop=stop_after_attempt(100), wait=wait_fixed(3600), after=_after_log)
  234. def run(self):
  235. """
  236. 主函数
  237. """
  238. self.log.info(
  239. f'开始运行 {inspect.currentframe().f_code.co_name} 爬虫任务....................................................')
  240. # 配置 MySQL 连接池
  241. sql_pool = MySQLConnectionPool(log=self.log)
  242. if not sql_pool.check_pool_health():
  243. self.log.error("数据库连接池异常")
  244. raise RuntimeError("数据库连接池异常")
  245. try:
  246. # 1. 获取已售出商品列表 按系列获取 先获取这个!!!
  247. self.log.debug(f"........... 开始获取已售出商品列表 按系列获取 ..........")
  248. try:
  249. self.get_classification_list(sql_pool)
  250. except Exception as e:
  251. self.log.error(f"Request get_classification_list error: {e}")
  252. # 2. 获取商品列表 所有 去重
  253. self.log.debug(f"........... 获取商品列表 所有 去重 ..........")
  254. try:
  255. self.get_pokemon_card_list(sql_pool=sql_pool)
  256. except Exception as e:
  257. self.log.error(f"Request get_pokemon_card_list error: {e}")
  258. # 获取商品详情
  259. self.log.debug(f"........... 获取商品详情 ..........")
  260. sql_ietm_id_list = sql_pool.select_all(
  261. f"SELECT card_id FROM pokemon_card_record WHERE card_no IS NULL AND crawler_language='{crawler_language}'")
  262. sql_ietm_id_list = [item_id[0] for item_id in sql_ietm_id_list]
  263. for item_id in sql_ietm_id_list:
  264. try:
  265. self.get_details(item_id, sql_pool)
  266. except Exception as e:
  267. self.log.error(f"Request get_details error: {e}")
  268. except Exception as e:
  269. self.log.error(f'{inspect.currentframe().f_code.co_name} error: {e}')
  270. finally:
  271. self.log.info(f'爬虫程序 {inspect.currentframe().f_code.co_name} 运行结束,等待下一轮的采集任务............')
# def schedule_task():
#     """
#     Entry point that runs the spider as a scheduled job.
#     """
#     # Create the spider instance
#     spider = JPPokemonCardSpider()
#
#     # Run the task once immediately
#     # spider.run()
#
#     # Register the daily schedule
#     schedule.every().day.at("01:06").do(spider.run)
#
#     while True:
#         schedule.run_pending()
#         time.sleep(1)
  288. if __name__ == '__main__':
  289. # schedule_task()
  290. spider = JPPokemonCardSpider()
  291. spider.run()