# -*- coding: utf-8 -*-
# Author : Charley
# Python : 3.10.8
# Date : 2025/8/19 15:58
import re
import json
import inspect
import requests
import user_agent
from loguru import logger
from parsel import Selector
from mysql_pool import MySQLConnectionPool
from tenacity import retry, stop_after_attempt, wait_fixed
# logger.remove()
# logger.add("./logs/{time:YYYYMMDD}.log", encoding='utf-8', rotation="00:00",
# format="[{time:YYYY-MM-DD HH:mm:ss.SSS}] {level} {message}",
# level="DEBUG", retention="7 day")
# Language tag stored with every scraped record (this spider targets the JP card site).
crawler_language = "jp"
class JPPokemonCardSpider:
    """Scraper for the official Japanese Pokemon card search site (pokemon-card.com).

    Crawls the category list, pages through each category's card list,
    persists rows into MySQL, then enriches each card with detail-page data.
    """

    # Class constant: site root shared by every request.
    BASE_URL = "https://www.pokemon-card.com"

    def __init__(self, log=None):
        """
        :param log: optional logger; defaults to the module-level loguru logger.
        """
        self.log = log or logger
        self.headers = {
            "accept": "application/json, text/javascript, */*; q=0.01",
            "user-agent": user_agent.generate_user_agent()
        }

    @staticmethod
    def _after_log(retry_state):
        """
        tenacity retry callback - static method.

        :param retry_state: RetryCallState object
        """
        # All callables decorated in this class are instance methods, so
        # retry_state.args[0] is the spider instance, NOT a logger. The
        # original used args[0] directly, which would have called .warning()
        # on the spider and raised AttributeError; resolve the real logger.
        log = logger  # global fallback
        if retry_state.args:
            first = retry_state.args[0]
            if hasattr(first, "warning"):
                # A logger-like object was passed directly.
                log = first
            elif hasattr(first, "log"):
                # Bound method: `first` is the spider instance (`self`).
                log = first.log
        if retry_state.outcome.failed:
            log.warning(
                f"Function '{retry_state.fn.__name__}', Attempt {retry_state.attempt_number} Times")
        else:
            log.info(f"Function '{retry_state.fn.__name__}', Attempt {retry_state.attempt_number} succeeded")

    @staticmethod
    def _parse_html_json(html_content, log):
        """
        Extract the `PTC.master.uiData` JavaScript object embedded in the page
        source and convert it to JSON - static method.

        :param html_content: raw HTML of the search page
        :param log: logger object
        :return: parsed dict, or None when the object cannot be found/decoded
        """
        log.debug(f"{inspect.currentframe().f_code.co_name} start.....................")
        # Locate the start of the PTC.master.uiData assignment.
        start_marker = 'PTC.master.uiData = '
        start_pos = html_content.find(start_marker)
        if start_pos == -1:
            log.debug("PTC.master.uiData not found")
            return None
        # Find the first '{' after the marker.
        start_pos = html_content.find('{', start_pos + len(start_marker))
        if start_pos == -1:
            log.debug("Opening brace not found")
            return None
        # Scan forward to the matching closing brace (brace counting).
        brace_count = 1
        pos = start_pos + 1
        while pos < len(html_content) and brace_count > 0:
            if html_content[pos] == '{':
                brace_count += 1
            elif html_content[pos] == '}':
                brace_count -= 1
            pos += 1
        if brace_count == 0:
            # Extract the complete JavaScript object literal.
            js_object_str = html_content[start_pos:pos]
            # Convert the JS literal into valid JSON.
            try:
                json_str = js_object_str
                # Quote bare object keys.
                json_str = re.sub(r'([{,])\s*([a-zA-Z_][a-zA-Z0-9_]*)\s*:', r'\1 "\2":', json_str)
                # Strip // line comments and /* ... */ block comments.
                json_str = re.sub(r'//.*?(\n|$)', r'\1', json_str)
                json_str = re.sub(r'/\*.*?\*/', '', json_str, flags=re.DOTALL)
                # Remove trailing commas before a closing } or ].
                json_str = re.sub(r',(\s*[}\]])', r'\1', json_str)
                ui_data = json.loads(json_str)
                return ui_data
            except json.JSONDecodeError as e:
                log.error(f"{inspect.currentframe().f_code.co_name}Failed to decode JSON: {e}")
        else:
            log.warning(f"{inspect.currentframe().f_code.co_name} Could not find complete JavaScript object")
        return None

    @retry(stop=stop_after_attempt(5), wait=wait_fixed(1), after=_after_log)
    def get_classification_list(self, sql_pool):
        """
        Fetch the category ("pg") list from the search page, then crawl the
        card list of every category.

        :param sql_pool: MySQL connection pool object
        """
        self.log.debug(f"{inspect.currentframe().f_code.co_name} start.....................")
        url = f"{self.BASE_URL}/card-search/index.php"
        params = {
            "keyword": "",
            "se_ta": "",
            "regulation_sidebar_form": "all",
            "pg": "",
            "illust": "",
            "sm_and_keyword": "true"
        }
        response = requests.get(url, headers=self.headers, params=params, timeout=10)
        json_ui_data = self._parse_html_json(response.text, self.log)
        # NOTE: when parsing fails _parse_html_json returns None and the .get()
        # below raises AttributeError, which triggers the @retry decorator.
        pg_list = json_ui_data.get("pg", {}).get("list", [])
        if pg_list:
            # Skips the first entry (presumably an "all"/placeholder option --
            # TODO confirm against the live payload).
            for pg in pg_list[1:]:
                pg_value = pg.get("value")
                pg_label = pg.get("label")
                self.log.info(f"pg_list -> pg_value:{pg_value}, pg_label: {pg_label}")
                self.get_pokemon_card_list(pg_value, pg_label, sql_pool)
        else:
            self.log.debug(
                f"{inspect.currentframe().f_code.co_name} NOt found pg_list !!! get_classification_list end.....................")

    @retry(stop=stop_after_attempt(5), wait=wait_fixed(1), after=_after_log)
    def _get_pokemon_card_single_page(self, pg_value, page=1):
        """
        Fetch one result page for the given category and page number.

        :param pg_value: category value ("pg" query parameter)
        :param page: 1-based page number
        :return: decoded JSON payload from resultAPI.php
        """
        self.log.debug(f"{inspect.currentframe().f_code.co_name} start.....................")
        url = f"{self.BASE_URL}/card-search/resultAPI.php"
        params = {
            "keyword": "",
            "se_ta": "",
            "regulation_sidebar_form": "all",
            "pg": pg_value,
            "illust": "",
            "sm_and_keyword": "true",
            "page": str(page)
        }
        response = requests.get(url, headers=self.headers, params=params, timeout=10)
        response.raise_for_status()
        resp_json = response.json()
        return resp_json

    def _parse_pokemon_card_list(self, card_list, pg_value, pg_label, sql_pool):
        """
        Parse one page of card entries and bulk-insert them into MySQL.

        :param card_list: list of card dicts from resultAPI.php
        :param pg_value: category value the cards belong to
        :param pg_label: human-readable category label
        :param sql_pool: MySQL connection pool object
        """
        self.log.debug(f"{inspect.currentframe().f_code.co_name} start.....................")
        card_info_list = []
        for card in card_list:
            card_id = card.get("cardID")
            card_name = card.get("cardNameViewText")
            # NOTE(review): the original cleanup chain on card_name arrived
            # garbled in this source (`if '', '').replace('', '')` -- a syntax
            # error). The lost literals were likely non-ASCII substrings to be
            # stripped from the name; restored here as a guarded no-op replace
            # chain -- TODO confirm the intended substrings.
            if card_name:
                card_name = card_name.replace('', '').replace('', '')
            card_thumb = card.get("cardThumbFile")
            # Thumbnail path is site-relative; prefix with the site root.
            card_thumb = f'{self.BASE_URL}{card_thumb}'
            data_dict = {
                "card_id": card_id,
                "card_name": card_name,
                "img": card_thumb,
                "pg_value": pg_value,
                "pg_label": pg_label,
                "crawler_language": crawler_language
            }
            card_info_list.append(data_dict)
        if card_info_list:
            # ignore=True so re-crawled cards do not raise duplicate-key errors.
            sql_pool.insert_many(table="pokemon_card_record", data_list=card_info_list, ignore=True)

    def get_pokemon_card_list(self, pg_value='', pg_label='', sql_pool=None):
        """
        Page through every card of one category (or the whole site when
        pg_value is empty) and persist each page.

        :param pg_value: category value; '' crawls without a category filter
        :param pg_label: category label, used only for logging
        :param sql_pool: MySQL connection pool object
        """
        self.log.debug(f"{inspect.currentframe().f_code.co_name} start.....................")
        page = 1
        max_page = 1
        while page <= max_page:
            self.log.debug(f"正在获取第 {page} 页数据, pg_label: {pg_label} .........")
            page_data = self._get_pokemon_card_single_page(pg_value, page)
            if page_data.get("result") == 1:
                # max_page only needs to be read from the first page.
                if page == 1:
                    max_page = page_data.get("maxPage", 1)
                    self.log.info(f"分类 {pg_label} 共有 {max_page} 页数据")
                card_list = page_data.get("cardList", [])
                if not card_list:
                    self.log.warning(f"{inspect.currentframe().f_code.co_name} NOt found cardList !!!")
                    break
                try:
                    self._parse_pokemon_card_list(card_list, pg_value, pg_label, sql_pool)
                except Exception as e:
                    self.log.error(f"{inspect.currentframe().f_code.co_name} parse_pokemon_card_list error: {e}")
                # A short page marks the end (presumably 39 cards fill a page
                # -- TODO confirm against the API).
                if len(card_list) < 39:
                    self.log.debug(f"{inspect.currentframe().f_code.co_name} 获取的卡片数量小于39 !!! 停止翻页")
                    break
                if page >= max_page:
                    self.log.debug(
                        f"{inspect.currentframe().f_code.co_name} -> page: {page}, max_page: {max_page}, 停止翻页")
                    break
                # Hard safety cap: never crawl more than 10 pages per category.
                if page >= 10:
                    self.log.debug(
                        f"{inspect.currentframe().f_code.co_name} -> page: {page}, page >= 10, 停止翻页.......")
                    break
                page += 1
            else:
                self.log.warning(f"获取第 {page} 页数据失败: {page_data.get('errMsg')}")
                break

    def get_details(self, item_id, sql_pool):
        """
        Fetch one card's detail page and update its card_no and rarity.

        :param item_id: cardID of the card to enrich
        :param sql_pool: MySQL connection pool object
        """
        self.log.debug(f"{inspect.currentframe().f_code.co_name} start, item_id: {item_id}.....................")
        url = f'{self.BASE_URL}/card-search/details.php/card/{item_id}'
        response = requests.get(url, headers=self.headers, timeout=10)
        response.raise_for_status()
        selector = Selector(response.text)
        card_no_list = selector.xpath('//div[@class="subtext Text-fjalla"]/text()').getall()
        card_no = ''.join(card_no_list)
        # Strip padding and non-breaking spaces from the collector number.
        card_no = card_no.strip().replace('\xa0', '') if card_no else None
        # Rarity icon <img>; the regulation-mark image is excluded by class.
        tag_ic_rare = selector.xpath(
            '//div[@class="subtext Text-fjalla"]/img[not(contains(@class, "img-regulation"))]/@src').get()
        # Filename stem of the icon; the part after the last '_' is taken as
        # the rarity code (presumed from the URL pattern -- confirm).
        ic_rare_sp = tag_ic_rare.split('/')[-1].split('.')[0] if tag_ic_rare else None
        if ic_rare_sp and '_' in ic_rare_sp:
            ic_rare = ic_rare_sp.split('_')[-1]
        else:
            ic_rare = ic_rare_sp
        data_dict = {
            "card_no": card_no,
            "rarity": ic_rare
        }
        sql_pool.update_one_or_dict(
            table="pokemon_card_record",
            data=data_dict,
            condition={"card_id": item_id}
        )

    @retry(stop=stop_after_attempt(100), wait=wait_fixed(3600), after=_after_log)
    def run(self):
        """
        Main entry point: crawl categories, then the unfiltered card list,
        then per-card details. If pool setup fails, @retry re-runs hourly.
        """
        self.log.info(
            f'开始运行 {inspect.currentframe().f_code.co_name} 爬虫任务....................................................')
        # Configure the MySQL connection pool.
        sql_pool = MySQLConnectionPool(log=self.log)
        if not sql_pool.check_pool_health():
            self.log.error("数据库连接池异常")
            # Raising here lets the @retry decorator re-run the task later.
            raise RuntimeError("数据库连接池异常")
        try:
            # 1. Crawl the card lists per category (series) first.
            self.log.debug(f"........... 开始获取已售出商品列表 按系列获取 ..........")
            try:
                self.get_classification_list(sql_pool)
            except Exception as e:
                self.log.error(f"Request get_classification_list error: {e}")
            # 2. Crawl the full (category-less, de-duplicated) card list.
            self.log.debug(f"........... 获取商品列表 所有 去重 ..........")
            try:
                self.get_pokemon_card_list(sql_pool=sql_pool)
            except Exception as e:
                self.log.error(f"Request get_pokemon_card_list error: {e}")
            # 3. Fetch details for every card that has no card_no yet.
            self.log.debug(f"........... 获取商品详情 ..........")
            # crawler_language is a module-level constant, not user input, so
            # the f-string interpolation below is not an injection risk.
            item_id_rows = sql_pool.select_all(
                f"SELECT card_id FROM pokemon_card_record WHERE card_no IS NULL AND crawler_language='{crawler_language}'")
            item_id_list = [item_id[0] for item_id in item_id_rows]
            for item_id in item_id_list:
                try:
                    self.get_details(item_id, sql_pool)
                except Exception as e:
                    self.log.error(f"Request get_details error: {e}")
        except Exception as e:
            self.log.error(f'{inspect.currentframe().f_code.co_name} error: {e}')
        finally:
            self.log.info(f'爬虫程序 {inspect.currentframe().f_code.co_name} 运行结束,等待下一轮的采集任务............')
# def schedule_task():
#     """
#     Startup entry for running the spider as a scheduled task.
#     """
#     # Create the spider instance
#     spider = JPPokemonCardSpider()
#
#     # Run the task once immediately
#     # spider.run()
#
#     # Register the daily scheduled task
#     schedule.every().day.at("01:06").do(spider.run)
#
#     while True:
#         schedule.run_pending()
#         time.sleep(1)


if __name__ == '__main__':
    # schedule_task()
    JPPokemonCardSpider().run()