Quellcode durchsuchen

refactor(pokemon_tcg_spider): 删除冗余旧版爬虫代码,优化现有爬虫逻辑

- 移除 fan_pokemon_card_spider_111.py、jp_pokemon_card_spider111.py、jp_pokemon_card_spider222.py 文件,清理不再使用的爬虫代码
- 更新 jian_pokemon_card_spider.py,增加系列过滤逻辑,过滤“周边”系列数据
- 优化查询商品详情时的时间条件,只对当天创建(gmt_create_time >= 当天 00:00:00)的分类记录查询商品详情,减少无效请求
- 修正 jp_pokemon_card_spider.py 中获取卡片名称时的特殊字符处理
- 取消注释并修正 jp_pokemon_card_spider.py 内获取商品列表的部分业务逻辑,确保流程正常执行
charley vor 1 Monat
Ursprung
Commit
fc43171ab9

+ 0 - 286
pokemon_tcg_spider/fan_pokemon_card_spider_111.py

@@ -1,286 +0,0 @@
-# -*- coding: utf-8 -*-
-# Author : Charley
-# Python : 3.10.8
-# Date   : 2025/8/25 14:38
-import inspect
-from datetime import datetime
-import requests
-import user_agent
-from loguru import logger
-from parsel import Selector
-from tenacity import retry, stop_after_attempt, wait_fixed
-from mysql_pool import MySQLConnectionPool
-
-crawler_language = "繁中"
-
-headers = {
-    # "referer": "https://asia.pokemon-card.com/tw/card-search/list/",
-    "user-agent": user_agent.generate_user_agent()
-}
-
-logger.remove()
-logger.add("./logs/fan_{time:YYYYMMDD}.log", encoding='utf-8', rotation="00:00",
-           format="[{time:YYYY-MM-DD HH:mm:ss.SSS}] {level} {message}",
-           level="DEBUG", retention="7 day")
-
-"""
-expansion_series -> 
-"""
-
-
-def after_log(retry_state):
-    """
-    retry 回调
-    :param retry_state: RetryCallState 对象
-    """
-    # 检查 args 是否存在且不为空
-    if retry_state.args and len(retry_state.args) > 0:
-        log = retry_state.args[0]  # 获取传入的 logger
-    else:
-        log = logger  # 使用全局 logger
-
-    if retry_state.outcome.failed:
-        log.warning(
-            f"Function '{retry_state.fn.__name__}', Attempt {retry_state.attempt_number} Times")
-    else:
-        log.info(f"Function '{retry_state.fn.__name__}', Attempt {retry_state.attempt_number} succeeded")
-
-
-@retry(stop=stop_after_attempt(5), wait=wait_fixed(1), after=after_log)
-def get_proxys(log):
-    """
-    获取代理
-    :return: 代理
-    """
-    tunnel = "x371.kdltps.com:15818"
-    kdl_username = "t13753103189895"
-    kdl_password = "o0yefv6z"
-    try:
-        proxies = {
-            "http": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": kdl_username, "pwd": kdl_password, "proxy": tunnel},
-            "https": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": kdl_username, "pwd": kdl_password, "proxy": tunnel}
-        }
-        return proxies
-    except Exception as e:
-        log.error(f"Error getting proxy: {e}")
-        raise e
-
-
-@retry(stop=stop_after_attempt(5), wait=wait_fixed(1), after=after_log)
-def get_category_list(log, sql_pool):
-    log.debug(f'Request get_category_list.........................')
-    url = 'https://asia.pokemon-card.com/tw/card-search/'
-    page = 1
-    max_page = 100
-    while page <= max_page:
-        params = {
-            # "pageNo": "2"
-            "pageNo": page
-        }
-        response = requests.get(url, headers=headers, params=params, timeout=10)
-        response.raise_for_status()
-        selector = Selector(response.text)
-        tag_li_list = selector.xpath('//ul[@class="expansionList"]/li')
-
-        info_list = []
-        for tag_li in tag_li_list:
-            expansionLink = tag_li.xpath('./a/@href').get()
-            expansion_link = f'https://asia.pokemon-card.com{expansionLink}' if expansionLink else None
-            expansion_img = tag_li.xpath('./a//img/@src').get()
-            expansion_series = tag_li.xpath('./a//div[@class="seriesBlock"]/span/text()').get()
-            expansion_title = tag_li.xpath('./a//div[@class="titleBlock"]/h3/text()').get()
-            expansion_title = expansion_title.strip() if expansion_title else None
-
-            expansion_release_time = tag_li.xpath('./a//div[@class="titleBlock"]/time/@datetime').get()
-
-            data_dict = {
-                "expansion_link": expansion_link,
-                "expansion_img": expansion_img,
-                "expansion_series": expansion_series,
-                "expansion_title": expansion_title,
-                "expansion_release_time": expansion_release_time,
-                "crawler_language": crawler_language
-            }
-            # print(data_dict)
-            info_list.append(data_dict)
-
-        if info_list:
-            sql_pool.insert_many(table="pokemon_fanz_category_copy1", data_list=info_list, ignore=True)
-
-        if not tag_li_list:
-            log.debug(f'not tag_li_list!!! page: {page}!!!!!!!!!!')
-            break
-
-        if len(tag_li_list) < 20:
-            log.debug(
-                f'--------------- page {page} has {len(tag_li_list)} items, [len(tag_li_list) < 20] ->->-> break ---------------')
-            break
-
-        page += 1
-
-
-# -----------------------------------------------------------------------------------------------------------------------
-
-@retry(stop=stop_after_attempt(5), wait=wait_fixed(1), after=after_log)
-def get_list_single_page(log, sql_pool, page, cate_tuple):
-    # expansion_link,expansion_series,expansion_title,expansion_release_time
-    url = cate_tuple[0]
-    expansion_series = cate_tuple[1]
-    expansion_title = cate_tuple[2]
-    expansion_release_time = cate_tuple[3]
-    # expansion_release_time 为'03-28-2025'格式  转换为正常的 年月日
-    date_obj = datetime.strptime(expansion_release_time, '%m-%d-%Y')
-    expansion_release_time = date_obj.strftime('%Y-%m-%d')
-    log.debug(f'Request get_list_single_page for page: {page}')
-    # url = "https://asia.pokemon-card.com/tw/card-search/list/"
-    params = {
-        # "pageNo": "2"
-        "pageNo": page
-    }
-    # response = requests.get(url, headers=headers, params=params, timeout=10, proxies=get_proxys(log))
-    response = requests.get(url, headers=headers, params=params, timeout=10)
-    # print(response.text)
-    response.raise_for_status()
-
-    selector = Selector(response.text)
-    tag_li_list = selector.xpath('//*[@id="searchForm"]//ul/li')
-
-    info_list = []
-    for tag_li in tag_li_list:
-        detail_url_str = tag_li.xpath('./a/@href').get()
-        card_id = detail_url_str.split('/')[-2]
-        log.debug(f'card_id: {card_id}')
-        detail_url = f"https://asia.pokemon-card.com{detail_url_str}"
-
-        img = tag_li.xpath('.//img/@src').get()
-        if not img:
-            img = tag_li.xpath('.//img/@data-original').get()
-
-        data_dict = {
-            "card_id": card_id,
-            "major_category_name": expansion_series,
-            "pg_label": expansion_title,
-            "sales_date": expansion_release_time,
-            "detail_url": detail_url,
-            "img": img,
-            "crawler_language": crawler_language
-        }
-        # print(data_dict)
-        info_list.append(data_dict)
-
-    if info_list:
-        sql_pool.insert_many(table="pokemon_card_record_copy1", data_list=info_list, ignore=True)
-
-    return len(tag_li_list)
-
-
-def get_data_list(log, sql_pool, cate_tuple):
-    page = 1
-    max_page = 600
-    while page <= max_page:
-        try:
-            log.debug(
-                f'--------------- {inspect.currentframe().f_code.co_name}, page {page}, start ---------------')
-            len_items = get_list_single_page(log, sql_pool, page, cate_tuple)
-        except Exception as e:
-            log.error(
-                f"{inspect.currentframe().f_code.co_name} Request get_list_single_page for page:{page}, {e}")
-            len_items = 0
-
-        if len_items < 20:
-            log.debug(f'--------------- page {page} has {len_items} items, break ---------------')
-            break
-
-        if page > 50:
-            log.debug(f'--------------- page {page} has {len_items} items, [page > 50] ->->-> break ---------------')
-            break
-
-        page += 1
-
-
-# ----------------------------------------------------------------------------------------------------------------------
-@retry(stop=stop_after_attempt(5), wait=wait_fixed(1), after=after_log)
-def get_details(log, sql_id_detail_url: tuple, sql_pool):
-    log.debug(f'Request get_details for sql_id_detail_url: {sql_id_detail_url}')
-    # url = "https://asia.pokemon-card.com/tw/card-search/detail/13958/"
-    url = sql_id_detail_url[1]
-    # response = requests.get(url, headers=headers, timeout=10, proxies=get_proxys(log))
-    response = requests.get(url, headers=headers, timeout=10)
-    # print(response.text)
-    response.raise_for_status()
-
-    selector = Selector(response.text)
-    card_name = selector.xpath('//div[@class="wrapper"]/header/h1/text()').getall()
-    card_name = ''.join(card_name) if card_name else None
-    card_name = card_name.strip() if card_name else None
-
-    card_no = selector.xpath('//div[@class="wrapper"]//span[@class="collectorNumber"]/text()').get()
-    card_no = card_no.strip() if card_no else None
-    data_dict = {
-        "card_name": card_name,
-        "card_no": card_no
-    }
-    # print(data_dict)
-
-    sql_pool.update_one_or_dict(
-        table="pokemon_card_record_copy1",
-        data=data_dict,
-        condition={"id": sql_id_detail_url[0]}
-    )
-
-
-@retry(stop=stop_after_attempt(100), wait=wait_fixed(3600), after=after_log)
-def fz_pokemon_main(log):
-    """
-    主函数
-    """
-    log.info(f'开始运行 {inspect.currentframe().f_code.co_name} 爬虫任务.............................................')
-
-    # 配置 MySQL 连接池
-    sql_pool = MySQLConnectionPool(log=log)
-    if not sql_pool.check_pool_health():
-        log.error("数据库连接池异常")
-        raise RuntimeError("数据库连接池异常")
-
-    try:
-        # 获取分类列表
-        # log.debug(".......... 获取分类列表 ..........")
-        # try:
-        #     get_category_list(logger, sql_pool)
-        # except Exception as e:
-        #     log.error(f"{inspect.currentframe().f_code.co_name} Request get_category_list error: {e}")
-
-        # 获取每个分类下的  产品列表
-        sql_cate_list = sql_pool.select_all(
-            f"SELECT expansion_link,expansion_series,expansion_title,expansion_release_time FROM pokemon_fanz_category WHERE crawler_language='{crawler_language}'")
-        # sql_cate_list = [x[0] for x in sql_cate_list]
-        for cate_tuple in sql_cate_list:
-            try:
-                # 获取商品列表 https://asia.pokemon-card.com/tw/card-search/list/?expansionCodes=M1L
-                log.debug(f'Request get_data_list for cate: {cate_tuple}')
-                get_data_list(logger, sql_pool, cate_tuple)
-            except Exception as e:
-                log.error(f"{inspect.currentframe().f_code.co_name} Request get_data_list error: {e}")
-
-        # 获取商品详情
-        log.debug(f"........... 获取商品详情 ..........")
-        sql_ietm_id_list = sql_pool.select_all(
-            f"SELECT id, detail_url FROM pokemon_card_record WHERE card_name IS NULL AND crawler_language='{crawler_language}'")
-        for item_id in sql_ietm_id_list:
-            try:
-                get_details(log, item_id, sql_pool)
-            except Exception as e:
-                log.error(f"Request get_details error: {e}")
-
-    except Exception as e:
-        log.error(f'{inspect.currentframe().f_code.co_name} error: {e}')
-    finally:
-        log.info(f'爬虫程序 {inspect.currentframe().f_code.co_name} 运行结束,等待下一轮的采集任务............')
-
-
-if __name__ == '__main__':
-    # get_list_single_page(logger, None, 1, 'https://asia.pokemon-card.com/tw/card-search/list/?expansionCodes=M1L')
-    # get_details(logger, (None, None), None)
-    # get_category_list(logger)
-    # '超級進化', '特典卡 超級進化', '08-07-2025')
-    fz_pokemon_main(logger)

+ 8 - 2
pokemon_tcg_spider/jian_pokemon_card_spider.py

@@ -2,10 +2,10 @@
 # Author : Charley
 # Python : 3.10.8
 # Date   : 2025/8/26 10:47
+import datetime
 import inspect
 import random
 import time
-
 import requests
 import user_agent
 from loguru import logger
@@ -121,6 +121,11 @@ def parse_parent_list(log, list_data, sql_pool):
         parent_id = item.get("id")
         expansion_img = item.get("imageUrl")
         expansion_series = item.get("name")
+
+        # 20260327 增加系列过滤 '周边'
+        if "周边" in expansion_series:
+            log.debug(f"{inspect.currentframe().f_code.co_name}过滤系列: {expansion_series}")
+            continue
         # data_dict = {
         #     "parent_id": parent_id,
         #     "expansion_series": expansion_series,
@@ -625,8 +630,9 @@ def jz_pokemon_main(log):
         #     par = {"banCardFlag": "0", "commodityIds": "279", "commoditySelectedList": [
         #         {"id": "279", "commodityName": "收集啦151 惊", "commodityCode": "151C3", "salesDate": "2025-07-18"}],
         #            "pageNum": str(page), "pageSize": "50"}
+        # 2026-03-27 18:11:00  gmt_create_time字段 查询今天的数据
         sql_ietm_id_list = sql_pool.select_all(
-            f"SELECT DISTINCT child_id,child_name,commodity_code,sales_date,expansion_series FROM pokemon_jianz_category WHERE crawler_language='{crawler_language}'")
+            f"SELECT DISTINCT child_id,child_name,commodity_code,sales_date,expansion_series FROM pokemon_jianz_category WHERE crawler_language='{crawler_language}' AND gmt_create_time >= '{datetime.datetime.now().strftime('%Y-%m-%d 00:00:00')}'")
         # sql_ietm_id_list = [item_id[0] for item_id in sql_ietm_id_list]
         log.debug(f"获取商品详情长度为: {len(sql_ietm_id_list)}")
         for item_tuple in sql_ietm_id_list:

+ 10 - 6
pokemon_tcg_spider/jp_pokemon_card_spider.py

@@ -172,6 +172,10 @@ class JPPokemonCardSpider:
         for card in card_list:
             card_id = card.get("cardID")
             card_name = card.get("cardNameViewText")
+            if '<span' in card_name:
+                card_name = card_name.replace('<span class="pcg pcg-prismstar"></span>', '').replace(
+                    '<span class="pcg pcg-megamark"></span>', '')
+
             card_thumb = card.get("cardThumbFile")
             card_thumb = f'{self.BASE_URL}{card_thumb}'
             data_dict = {
@@ -292,12 +296,12 @@ class JPPokemonCardSpider:
             except Exception as e:
                 self.log.error(f"Request get_classification_list error: {e}")
 
-            # # 2. 获取商品列表  所有 去重 补漏
-            # self.log.debug(f"........... 获取商品列表  所有 去重 ..........")
-            # try:
-            #     self.get_pokemon_card_list(sql_pool=sql_pool)
-            # except Exception as e:
-            #     self.log.error(f"Request get_pokemon_card_list error: {e}")
+            # 2. 获取商品列表  所有 去重
+            self.log.debug(f"........... 获取商品列表  所有 去重 ..........")
+            try:
+                self.get_pokemon_card_list(sql_pool=sql_pool)
+            except Exception as e:
+                self.log.error(f"Request get_pokemon_card_list error: {e}")
 
             # 获取商品详情
             self.log.debug(f"........... 获取商品详情 ..........")

+ 0 - 339
pokemon_tcg_spider/jp_pokemon_card_spider111.py

@@ -1,339 +0,0 @@
-# -*- coding: utf-8 -*-
-# Author : Charley
-# Python : 3.10.8
-# Date   : 2025/8/19 15:58
-import re
-import json
-import inspect
-import requests
-import user_agent
-from loguru import logger
-from parsel import Selector
-from mysql_pool import MySQLConnectionPool
-from tenacity import retry, stop_after_attempt, wait_fixed
-
-# logger.remove()
-# logger.add("./logs/{time:YYYYMMDD}.log", encoding='utf-8', rotation="00:00",
-#            format="[{time:YYYY-MM-DD HH:mm:ss.SSS}] {level} {message}",
-#            level="DEBUG", retention="7 day")
-
-crawler_language = "jp"
-
-
-class JPPokemonCardSpider:
-    # 类变量
-    BASE_URL = "https://www.pokemon-card.com"
-
-    def __init__(self, log=None):
-        self.log = log or logger
-        self.headers = {
-            "accept": "application/json, text/javascript, */*; q=0.01",
-            "user-agent": user_agent.generate_user_agent()
-        }
-
-    @staticmethod
-    def _after_log(retry_state):
-        """
-        retry 回调 - 静态方法
-        :param retry_state: RetryCallState 对象
-        """
-        # 检查 args 是否存在且不为空
-        if retry_state.args and len(retry_state.args) > 0:
-            log = retry_state.args[0]  # 获取传入的 logger
-        else:
-            log = logger  # 使用全局 logger
-
-        if retry_state.outcome.failed:
-            log.warning(
-                f"Function '{retry_state.fn.__name__}', Attempt {retry_state.attempt_number} Times")
-        else:
-            log.info(f"Function '{retry_state.fn.__name__}', Attempt {retry_state.attempt_number} succeeded")
-
-    @staticmethod
-    def _parse_html_json(html_content, log):
-        """
-        解析网页源码, 获取json数据 - 静态方法
-        :param html_content: 网页源码
-        :param log: logger对象
-        :return: json数据
-        """
-        log.debug(f"{inspect.currentframe().f_code.co_name} start.....................")
-        # 查找PTC.master.uiData的开始位置
-        start_marker = 'PTC.master.uiData = '
-        start_pos = html_content.find(start_marker)
-
-        if start_pos == -1:
-            log.debug("PTC.master.uiData not found")
-            return None
-
-        # 从开始标记后的位置查找第一个 {
-        start_pos = html_content.find('{', start_pos + len(start_marker))
-
-        if start_pos == -1:
-            log.debug("Opening brace not found")
-            return None
-
-        # 查找匹配的大括号
-        brace_count = 1
-        pos = start_pos + 1
-
-        while pos < len(html_content) and brace_count > 0:
-            if html_content[pos] == '{':
-                brace_count += 1
-            elif html_content[pos] == '}':
-                brace_count -= 1
-            pos += 1
-
-        if brace_count == 0:
-            # 提取完整的JavaScript对象
-            js_object_str = html_content[start_pos:pos]
-            # 转换为有效的JSON
-            try:
-                # 添加引号到键,移除注释等
-                json_str = js_object_str
-
-                # 为键添加引号
-                json_str = re.sub(r'([{,])\s*([a-zA-Z_][a-zA-Z0-9_]*)\s*:', r'\1 "\2":', json_str)
-
-                # 移除注释
-                json_str = re.sub(r'//.*?(\n|$)', r'\1', json_str)
-                json_str = re.sub(r'/\*.*?\*/', '', json_str, flags=re.DOTALL)
-
-                # 移除尾随逗号
-                json_str = re.sub(r',(\s*[}\]])', r'\1', json_str)
-
-                ui_data = json.loads(json_str)
-                return ui_data
-
-            except json.JSONDecodeError as e:
-                log.error(f"{inspect.currentframe().f_code.co_name}Failed to decode JSON: {e}")
-        else:
-            log.warning(f"{inspect.currentframe().f_code.co_name} Could not find complete JavaScript object")
-
-        return None
-
-    @retry(stop=stop_after_attempt(5), wait=wait_fixed(1), after=_after_log)
-    def get_classification_list(self, sql_pool):
-        """
-        获取分类列表
-        :param sql_pool: MySQL连接池对象
-        """
-        self.log.debug(f"{inspect.currentframe().f_code.co_name} start.....................")
-        url = f"{self.BASE_URL}/card-search/index.php"
-        params = {
-            "keyword": "",
-            "se_ta": "",
-            "regulation_sidebar_form": "all",
-            "pg": "",
-            "illust": "",
-            "sm_and_keyword": "true"
-        }
-        response = requests.get(url, headers=self.headers, params=params, timeout=10)
-        json_ui_data = self._parse_html_json(response.text, self.log)
-        pg_list = json_ui_data.get("pg", {}).get("list", [])
-        if pg_list:
-            for pg in pg_list[1:]:
-                pg_value = pg.get("value")
-                pg_label = pg.get("label")
-                self.log.info(f"pg_list -> pg_value:{pg_value}, pg_label: {pg_label}")
-                self.get_pokemon_card_list(pg_value, pg_label, sql_pool)
-        else:
-            self.log.debug(
-                f"{inspect.currentframe().f_code.co_name} NOt found pg_list !!! get_classification_list end.....................")
-
-    @retry(stop=stop_after_attempt(5), wait=wait_fixed(1), after=_after_log)
-    def _get_pokemon_card_single_page(self, pg_value, page=1):
-        """
-        获取指定分类和页码的卡片列表
-        """
-        self.log.debug(f"{inspect.currentframe().f_code.co_name} start.....................")
-        url = f"{self.BASE_URL}/card-search/resultAPI.php"
-        params = {
-            "keyword": "",
-            "se_ta": "",
-            "regulation_sidebar_form": "all",
-            "pg": pg_value,
-            "illust": "",
-            "sm_and_keyword": "true",
-            "page": str(page)
-        }
-        response = requests.get(url, headers=self.headers, params=params, timeout=10)
-        response.raise_for_status()
-
-        resp_json = response.json()
-        return resp_json
-
-    def _parse_pokemon_card_list(self, card_list, pg_value, pg_label, sql_pool):
-        """
-        解析卡片列表,获取卡片信息
-        """
-        self.log.debug(f"{inspect.currentframe().f_code.co_name} start.....................")
-        card_info_list = []
-        data_list = []
-        for card in card_list:
-            card_id = card.get("cardID")
-
-            data_dict = {"pg_value": pg_value,
-                         "pg_label": pg_label}
-            data_list.append(data_dict)
-
-            condition_dict = {
-                "card_id": card_id,
-                "crawler_language": crawler_language
-            }
-            card_info_list.append(condition_dict)
-
-        if card_info_list:
-            sql_pool.update_many(table="pokemon_card_record", data_list=card_info_list, condition_list=card_info_list)
-
-    def get_pokemon_card_list(self, pg_value='', pg_label='', sql_pool=None):
-        """
-        获取指定分类下的所有卡片列表  翻页
-        """
-        self.log.debug(f"{inspect.currentframe().f_code.co_name} start.....................")
-
-        page = 1
-        max_page = 1
-
-        while page <= max_page:
-            self.log.debug(f"正在获取第 {page} 页数据, pg_label: {pg_label} .........")
-
-            page_data = self._get_pokemon_card_single_page(pg_value, page)
-
-            if page_data.get("result") == 1:
-                # 更新max_page(仅在第一页时需要更新)
-                if page == 1:
-                    max_page = page_data.get("maxPage", 1)
-                    self.log.info(f"分类 {pg_label} 共有 {max_page} 页数据")
-
-                cardList = page_data.get("cardList", [])
-                if not cardList:
-                    self.log.warning(f"{inspect.currentframe().f_code.co_name} NOt found cardList !!!")
-                    break
-
-                try:
-                    self._parse_pokemon_card_list(cardList, pg_value, pg_label, sql_pool)
-                except Exception as e:
-                    self.log.error(f"{inspect.currentframe().f_code.co_name} parse_pokemon_card_list error: {e}")
-
-                if len(cardList) < 39:
-                    self.log.debug(f"{inspect.currentframe().f_code.co_name} 获取的卡片数量小于39 !!! 停止翻页")
-                    break
-
-                if page >= max_page:
-                    self.log.debug(
-                        f"{inspect.currentframe().f_code.co_name} -> page: {page}, max_page: {max_page}, 停止翻页")
-                    break
-
-                if page >= 10:
-                    self.log.debug(
-                        f"{inspect.currentframe().f_code.co_name} -> page: {page}, page >= 10, 停止翻页.......")
-                    break
-
-                page += 1
-            else:
-                self.log.warning(f"获取第 {page} 页数据失败: {page_data.get('errMsg')}")
-                break
-
-    def get_details(self, item_id, sql_pool):
-        """
-        获取商品详情
-        """
-        self.log.debug(f"{inspect.currentframe().f_code.co_name} start, item_id: {item_id}.....................")
-        url = f'{self.BASE_URL}/card-search/details.php/card/{item_id}'
-        response = requests.get(url, headers=self.headers, timeout=10)
-        response.raise_for_status()
-
-        selector = Selector(response.text)
-        card_no_list = selector.xpath('//div[@class="subtext Text-fjalla"]/text()').getall()
-        card_no = ''.join(card_no_list)
-        card_no = card_no.strip().replace('\xa0', '') if card_no else None
-
-        tag_ic_rare = selector.xpath(
-            '//div[@class="subtext Text-fjalla"]/img[not(contains(@class, "img-regulation"))]/@src').get()
-
-        ic_rare_sp = tag_ic_rare.split('/')[-1].split('.')[0] if tag_ic_rare else None
-        if ic_rare_sp and '_' in ic_rare_sp:
-            ic_rare = ic_rare_sp.split('_')[-1]
-        else:
-            ic_rare = ic_rare_sp
-
-        data_dict = {
-            "card_no": card_no,
-            "rarity": ic_rare
-        }
-
-        sql_pool.update_one_or_dict(
-            table="pokemon_card_record",
-            data=data_dict,
-            condition={"card_id": item_id}
-        )
-
-    @retry(stop=stop_after_attempt(100), wait=wait_fixed(3600), after=_after_log)
-    def run(self):
-        """
-        主函数
-        """
-        self.log.info(
-            f'开始运行 {inspect.currentframe().f_code.co_name} 爬虫任务....................................................')
-
-        # 配置 MySQL 连接池
-        sql_pool = MySQLConnectionPool(log=self.log)
-        if not sql_pool.check_pool_health():
-            self.log.error("数据库连接池异常")
-            raise RuntimeError("数据库连接池异常")
-
-        try:
-            # 1. 获取已售出商品列表  按系列获取  先获取这个!!!
-            self.log.debug(f"........... 开始获取已售出商品列表  按系列获取 ..........")
-            try:
-                self.get_classification_list(sql_pool)
-            except Exception as e:
-                self.log.error(f"Request get_classification_list error: {e}")
-
-            # # 2. 获取商品列表  所有 去重
-            # self.log.debug(f"........... 获取商品列表  所有 去重 ..........")
-            # try:
-            #     self.get_pokemon_card_list(sql_pool=sql_pool)
-            # except Exception as e:
-            #     self.log.error(f"Request get_pokemon_card_list error: {e}")
-
-            # 获取商品详情
-            # self.log.debug(f"........... 获取商品详情 ..........")
-            # sql_ietm_id_list = sql_pool.select_all(
-            #     f"SELECT card_id FROM pokemon_card_record WHERE card_no IS NULL AND crawler_language='{crawler_language}'")
-            # sql_ietm_id_list = [item_id[0] for item_id in sql_ietm_id_list]
-            # for item_id in sql_ietm_id_list:
-            #     try:
-            #         self.get_details(item_id, sql_pool)
-            #     except Exception as e:
-            #         self.log.error(f"Request get_details error: {e}")
-
-        except Exception as e:
-            self.log.error(f'{inspect.currentframe().f_code.co_name} error: {e}')
-        finally:
-            self.log.info(f'爬虫程序 {inspect.currentframe().f_code.co_name} 运行结束,等待下一轮的采集任务............')
-
-
-# def schedule_task():
-#     """
-#     爬虫模块 定时任务 的启动文件
-#     """
-#     # 创建爬虫实例
-#     spider = JPPokemonCardSpider()
-#
-#     # 立即运行一次任务
-#     # spider.run()
-#
-#     # 设置定时任务
-#     schedule.every().day.at("01:06").do(spider.run)
-#
-#     while True:
-#         schedule.run_pending()
-#         time.sleep(1)
-
-
-if __name__ == '__main__':
-    # schedule_task()
-    spider = JPPokemonCardSpider()
-    spider.run()

+ 0 - 324
pokemon_tcg_spider/jp_pokemon_card_spider222.py

@@ -1,324 +0,0 @@
-# -*- coding: utf-8 -*-
-# Author : Charley
-# Python : 3.10.8
-# Date   : 2025/8/19 15:58
-import re
-import json
-import inspect
-import requests
-import user_agent
-from loguru import logger
-from parsel import Selector
-from mysql_pool import MySQLConnectionPool
-from tenacity import retry, stop_after_attempt, wait_fixed
-
-# logger.remove()
-# logger.add("./logs/{time:YYYYMMDD}.log", encoding='utf-8', rotation="00:00",
-#            format="[{time:YYYY-MM-DD HH:mm:ss.SSS}] {level} {message}",
-#            level="DEBUG", retention="7 day")
-
-crawler_language = "jp"
-pokemon_products = {
-    "M-P": "ポケモンカードゲーム MEGAプロモカード",
-    "SV-P": "ポケモンカードゲームスカーレット&バイオレット プロモカード",
-    "950": "ハイクラスパック 「MEGAドリームex」",
-    "949": "拡張パック「インフェルノX」",
-    "947": "スターターセットMEGA メガゲンガーex",
-    "948": "スターターセットMEGA メガディアンシーex",
-    "946": "プレミアムトレーナーボックスMEGA",
-    "944": "拡張パック「メガブレイブ」",
-    "945": "拡張パック「メガシンフォニア」",
-    "942": "拡張パック「ブラックボルト」",
-    "943": "拡張パック「ホワイトフレア」",
-    "941": "拡張パック「ロケット団の栄光」",
-    "940": "強化拡張パック「熱風のアリーナ」",
-    "939": "スターターセットex ダイゴのダンバル&メタグロスex",
-    "938": "スターターセットex マリィのモルペコ&オーロンゲex",
-    "935": "拡張パック「バトルパートナーズ」",
-    "936": "デッキビルドBOX「バトルパートナーズ」",
-    "934": "ハイクラスパック「テラスタルフェスex」",
-    "925": "スタートデッキGenerations ピカチュウex・カビゴンex",
-    "926": "スタートデッキGenerations ルギアex・バンギラスex",
-    "927": "スタートデッキGenerations カイオーガex・バシャーモex",
-    "928": "スタートデッキGenerations ディアルガex・ルカリオex",
-    "929": "スタートデッキGenerations レシラムex・モロバレルex",
-    "930": "スタートデッキGenerations ゼルネアスex・オンバーンex",
-    "931": "スタートデッキGenerations カプ・コケコex・ミミッキュex",
-    "932": "スタートデッキGenerations ザシアンex・マホイップex",
-    "933": "スタートデッキGenerations コライドンex・パルデアドオーex",
-    "923": "拡張パック「超電ブレイカー」",
-    "922": "強化拡張パック「楽園ドラゴーナ」",
-    "921": "スターターセット テラスタイプ:ステラ ニンフィアex",
-    "920": "スターターセット テラスタイプ:ステラ ソウブレイズex",
-    "918": "拡張パック「ステラミラクル」",
-    "919": "デッキビルドBOX ステラミラクル",
-    "917": "強化拡張パック「ナイトワンダラー」",
-    "10917": "スペシャルジャンボカードセット オーガポン",
-    "915": "バトルマスターデッキテラスタルリザードンex",
-    "916": "バトルマスターデッキパオジアンex",
-    "914": "拡張パック「変幻の仮面」",
-    "913": "強化拡張パック「クリムゾンヘイズ」",
-    "911": "バトルアカデミー",
-    "912": "いつでもどこでも バトルアカデミー",
-    "906": "拡張パック「ワイルドフォース」",
-    "907": "拡張パック「サイバージャッジ」",
-    "908": "スターターデッキ&ビルドセット「古代のコライドンex」",
-    "909": "スターターデッキ&ビルドセット「未来のミライドンex」",
-    "905": "ハイクラスパック「シャイニートレジャーex」",
-    "904": "スペシャルデッキセットex フシギバナ・リザードン・カメックス",
-    "901": "拡張パック「古代の咆哮」",
-    "902": "拡張パック「未来の一閃」",
-    "897": "強化拡張パック「レイジングサーフ」",
-    "898": "スターターセット テラスタル ミュウツーex",
-    "899": "スターターセット テラスタル ラウドボーンex",
-    "894": "拡張パック「黒炎の支配者」",
-    "895": "デッキビルドBOX 黒炎の支配者",
-    "896": "ポケモンワールドチャンピオンシップス2023横浜 記念デッキ「ピカチュウ」",
-    "884": "exスタートデッキ 草 ジュナイパー",
-    "885": "exスタートデッキ 炎 ビクティニ",
-    "886": "exスタートデッキ 水 ゲッコウガ",
-    "887": "exスタートデッキ 雷 ミライドン",
-    "888": "exスタートデッキ 超 ピクシー",
-    "889": "exスタートデッキ 闘 コライドン",
-    "890": "exスタートデッキ 悪 ヘルガー",
-    "891": "exスタートデッキ 鋼 メルメタル",
-    "892": "exスタートデッキ テラスタル カイリュー",
-    "893": "exスタートデッキ テラスタル ヨクバリス",
-    "882": "強化拡張パック「ポケモンカード151(イチゴーイチ)」",
-    "881": "exスペシャルセット",
-    "879": "拡張パック「スノーハザード」",
-    "880": "拡張パック「クレイバースト」",
-    "878": "スターターセットex ピカチュウex&パーモット",
-    "877": "強化拡張パック「トリプレットビート」",
-    "870": "拡張パック「スカーレットex」",
-    "871": "拡張パック「バイオレットex」",
-    "872": "スターターセットex ニャオハ&ルカリオex",
-    "873": "スターターセットex ホゲータ&デンリュウex",
-    "874": "スターターセットex クワッス&ミミッキュex",
-    "875": "プレミアムトレーナーボックスex"
-}
-
-
-class JPPokemonCardSpider:
-    # 类变量
-    BASE_URL = "https://www.pokemon-card.com"
-
-    def __init__(self, log=None):
-        self.log = log or logger
-        self.headers = {
-            "accept": "application/json, text/javascript, */*; q=0.01",
-            "user-agent": user_agent.generate_user_agent()
-        }
-
-    @staticmethod
-    def _after_log(retry_state):
-        """
-        retry 回调 - 静态方法
-        :param retry_state: RetryCallState 对象
-        """
-        # 检查 args 是否存在且不为空
-        if retry_state.args and len(retry_state.args) > 0:
-            log = retry_state.args[0]  # 获取传入的 logger
-        else:
-            log = logger  # 使用全局 logger
-
-        if retry_state.outcome.failed:
-            log.warning(
-                f"Function '{retry_state.fn.__name__}', Attempt {retry_state.attempt_number} Times")
-        else:
-            log.info(f"Function '{retry_state.fn.__name__}', Attempt {retry_state.attempt_number} succeeded")
-
-
-    @retry(stop=stop_after_attempt(5), wait=wait_fixed(1), after=_after_log)
-    def _get_pokemon_card_single_page(self, pg_value, page=1):
-        """
-        获取指定分类和页码的卡片列表
-        """
-        self.log.debug(f"{inspect.currentframe().f_code.co_name} start.....................")
-        url = f"{self.BASE_URL}/card-search/resultAPI.php"
-        params = {
-            "keyword": "",
-            "se_ta": "",
-            "regulation_sidebar_form": "all",
-            "pg": pg_value,
-            "illust": "",
-            "sm_and_keyword": "true",
-            "page": str(page)
-        }
-        response = requests.get(url, headers=self.headers, params=params, timeout=10)
-        response.raise_for_status()
-
-        resp_json = response.json()
-        return resp_json
-
-    def _parse_pokemon_card_list(self, card_list, pg_value, pg_label, sql_pool):
-        """
-        解析卡片列表,获取卡片信息
-        """
-        self.log.debug(f"{inspect.currentframe().f_code.co_name} start.....................")
-        card_info_list = []
-        for card in card_list:
-            card_id = card.get("cardID")
-            card_name = card.get("cardNameViewText")
-            card_thumb = card.get("cardThumbFile")
-            card_thumb = f'{self.BASE_URL}{card_thumb}'
-            data_dict = {
-                "card_id": card_id,
-                "card_name": card_name,
-                "img": card_thumb,
-                "pg_value": pg_value,
-                "pg_label": pg_label,
-                "crawler_language": crawler_language
-            }
-            card_info_list.append(data_dict)
-
-        if card_info_list:
-            sql_pool.insert_many(table="pokemon_card_record_copy1", data_list=card_info_list, ignore=True)
-
-    def get_pokemon_card_list(self, pg_value='', pg_label='', sql_pool=None):
-        """
-        获取指定分类下的所有卡片列表  翻页
-        """
-        self.log.debug(f"{inspect.currentframe().f_code.co_name} start.....................")
-
-        page = 1
-        max_page = 1
-
-        while page <= max_page:
-            self.log.debug(f"正在获取第 {page} 页数据, pg_label: {pg_label} .........")
-
-            page_data = self._get_pokemon_card_single_page(pg_value, page)
-
-            if page_data.get("result") == 1:
-                # 更新max_page(仅在第一页时需要更新)
-                if page == 1:
-                    max_page = page_data.get("maxPage", 1)
-                    self.log.info(f"分类 {pg_label} 共有 {max_page} 页数据")
-
-                cardList = page_data.get("cardList", [])
-                if not cardList:
-                    self.log.warning(f"{inspect.currentframe().f_code.co_name} NOt found cardList !!!")
-                    break
-
-                try:
-                    self._parse_pokemon_card_list(cardList, pg_value, pg_label, sql_pool)
-                except Exception as e:
-                    self.log.error(f"{inspect.currentframe().f_code.co_name} parse_pokemon_card_list error: {e}")
-
-                if len(cardList) < 39:
-                    self.log.debug(f"{inspect.currentframe().f_code.co_name} 获取的卡片数量小于39 !!! 停止翻页")
-                    break
-
-                if page >= max_page:
-                    self.log.debug(
-                        f"{inspect.currentframe().f_code.co_name} -> page: {page}, max_page: {max_page}, 停止翻页")
-                    break
-
-                if page >= 10:
-                    self.log.debug(
-                        f"{inspect.currentframe().f_code.co_name} -> page: {page}, page >= 10, 停止翻页.......")
-                    break
-
-                page += 1
-            else:
-                self.log.warning(f"获取第 {page} 页数据失败: {page_data.get('errMsg')}")
-                break
-
-    def get_details(self, item_id, sql_pool):
-        """
-        获取商品详情
-        """
-        self.log.debug(f"{inspect.currentframe().f_code.co_name} start, item_id: {item_id}.....................")
-        url = f'{self.BASE_URL}/card-search/details.php/card/{item_id}'
-        response = requests.get(url, headers=self.headers, timeout=10)
-        response.raise_for_status()
-
-        selector = Selector(response.text)
-        card_no_list = selector.xpath('//div[@class="subtext Text-fjalla"]/text()').getall()
-        card_no = ''.join(card_no_list)
-        card_no = card_no.strip().replace('\xa0', '') if card_no else None
-
-        tag_ic_rare = selector.xpath(
-            '//div[@class="subtext Text-fjalla"]/img[not(contains(@class, "img-regulation"))]/@src').get()
-
-        ic_rare_sp = tag_ic_rare.split('/')[-1].split('.')[0] if tag_ic_rare else None
-        if ic_rare_sp and '_' in ic_rare_sp:
-            ic_rare = ic_rare_sp.split('_')[-1]
-        else:
-            ic_rare = ic_rare_sp
-
-        data_dict = {
-            "card_no": card_no,
-            "rarity": ic_rare
-        }
-
-        sql_pool.update_one_or_dict(
-            table="pokemon_card_record_copy1",
-            data=data_dict,
-            condition={"card_id": item_id}
-        )
-
-    @retry(stop=stop_after_attempt(100), wait=wait_fixed(3600), after=_after_log)
-    def run(self):
-        """
-        主函数
-        """
-        self.log.info(
-            f'开始运行 {inspect.currentframe().f_code.co_name} 爬虫任务....................................................')
-
-        # 配置 MySQL 连接池
-        sql_pool = MySQLConnectionPool(log=self.log)
-        if not sql_pool.check_pool_health():
-            self.log.error("数据库连接池异常")
-            raise RuntimeError("数据库连接池异常")
-
-        try:
-            self.log.debug(f"........... 开始获取已售出商品列表  按系列获取 ..........")
-            for pg_value, pg_label in pokemon_products.items():
-                # print(pg_value, pg_label)
-                try:
-                    self.get_pokemon_card_list(pg_value, pg_label, sql_pool)
-                except Exception as e:
-                    self.log.error(f"Request get_pokemon_card_list error: {e}")
-
-
-
-            # 获取商品详情
-            # self.log.debug(f"........... 获取商品详情 ..........")
-            # sql_ietm_id_list = sql_pool.select_all(
-            #     f"SELECT card_id FROM pokemon_card_record WHERE card_no IS NULL AND crawler_language='{crawler_language}'")
-            # sql_ietm_id_list = [item_id[0] for item_id in sql_ietm_id_list]
-            # for item_id in sql_ietm_id_list:
-            #     try:
-            #         self.get_details(item_id, sql_pool)
-            #     except Exception as e:
-            #         self.log.error(f"Request get_details error: {e}")
-
-        except Exception as e:
-            self.log.error(f'{inspect.currentframe().f_code.co_name} error: {e}')
-        finally:
-            self.log.info(f'爬虫程序 {inspect.currentframe().f_code.co_name} 运行结束,等待下一轮的采集任务............')
-
-
-# def schedule_task():
-#     """
-#     爬虫模块 定时任务 的启动文件
-#     """
-#     # 创建爬虫实例
-#     spider = JPPokemonCardSpider()
-#
-#     # 立即运行一次任务
-#     # spider.run()
-#
-#     # 设置定时任务
-#     schedule.every().day.at("01:06").do(spider.run)
-#
-#     while True:
-#         schedule.run_pending()
-#         time.sleep(1)
-
-
-if __name__ == '__main__':
-    # schedule_task()
-    spider = JPPokemonCardSpider()
-    spider.run()

+ 42 - 37
pokemon_tcg_spider/mysql_pool.py

@@ -66,7 +66,7 @@ class MySQLConnectionPool:
         except Exception as e:
             if commit:
                 conn.rollback()
-            self.log.error(f"Error executing query: {e}, Query: {query}, Args: {args}")
+            self.log.exception(f"Error executing query: {e}, Query: {query}, Args: {args}")
             raise e
 
     def select_one(self, query, args=None):
@@ -172,15 +172,16 @@ class MySQLConnectionPool:
                 # print("插入失败:重复条目", e)
                 return -1  # 返回 -1 表示重复条目被跳过
             else:
-                self.log.error(f"数据库完整性错误: {e}")
+                self.log.exception(f"数据库完整性错误: {e}")
                 # print("插入失败:完整性错误", e)
-                raise e
+                raise
         except Exception as e:
-            self.log.error(f"未知错误: {e}", exc_info=True)
+            # self.log.error(f"未知错误: {str(e)}", exc_info=True)
+            self.log.exception(f"未知错误: {e}")  # 记录完整异常信息
             # print("插入失败:未知错误", e)
-            raise e
+            raise
 
-    def insert_many(self, table=None, data_list=None, query=None, args_list=None, batch_size=500, commit=True, ignore=False):
+    def insert_many(self, table=None, data_list=None, query=None, args_list=None, batch_size=1000, commit=True, ignore=False):
         """
         批量插入(支持字典列表或原始SQL)
         :param table: 表名(字典插入时必需)
@@ -210,41 +211,34 @@ class MySQLConnectionPool:
         total = 0
         for i in range(0, len(args_list), batch_size):
             batch = args_list[i:i + batch_size]
-            conn = None
             try:
-                conn = self.pool.connection()
-                with conn.cursor() as cursor:
-                    cursor.executemany(query, batch)
-                    if commit:
-                        conn.commit()
-                    total += cursor.rowcount
-            except pymysql.Error as e:
-                if conn:
-                    try:
+                with self.pool.connection() as conn:
+                    with conn.cursor() as cursor:
+                        cursor.executemany(query, batch)
                         if commit:
-                            conn.rollback()
-                    except:
-                        pass
+                            conn.commit()
+                        total += cursor.rowcount
+            except pymysql.Error as e:
                 if "Duplicate entry" in str(e):
-                    raise e
+                    # self.log.warning(f"检测到重复条目,开始逐条插入。错误详情: {e}")
+                    raise  e
+                    # rowcount = 0
+                    # for args in batch:
+                    #     try:
+                    #         self.insert_one_or_dict(table=table, data=dict(zip(data_list[0].keys(), args)),
+                    #                                 commit=commit)
+                    #         rowcount += 1
+                    #     except pymysql.err.IntegrityError as e2:
+                    #         if "Duplicate entry" in str(e2):
+                    #             self.log.warning(f"跳过重复条目: {args}")
+                    #         else:
+                    #             self.log.error(f"插入失败: {e2}, 参数: {args}")
+                    # total += rowcount
                 else:
-                    self.log.error(f"数据库错误: {e}")
+                    self.log.exception(f"数据库错误: {e}")
+                    if commit:
+                        conn.rollback()
                     raise e
-            except Exception as e:
-                if conn:
-                    try:
-                        if commit:
-                            conn.rollback()
-                    except:
-                        pass
-                self.log.error(f"数据库错误: {e}")
-                raise e
-            finally:
-                if conn:
-                    try:
-                        conn.close()
-                    except:
-                        pass
                 # 重新抛出异常,供外部捕获
                 # 降级为单条插入
                 # for args in batch:
@@ -257,7 +251,7 @@ class MySQLConnectionPool:
         self.log.info(f"sql insert_many, Table: {table}, Total Rows: {total}")
         return total
 
-    def insert_many_two(self, table=None, data_list=None, query=None, args_list=None, batch_size=500, commit=True):
+    def insert_many_two(self, table=None, data_list=None, query=None, args_list=None, batch_size=1000, commit=True):
         """
         批量插入(支持字典列表或原始SQL)
         :param table: 表名(字典插入时必需)
@@ -567,3 +561,14 @@ class MySQLConnectionPool:
             raise ValueError(f"Invalid SQL identifier: {name}")
         return name
 
+
+if __name__ == '__main__':
+    sql_pool = MySQLConnectionPool()
+    data_dic = {'card_type_id': 111, 'card_type_name': '补充包 继承的意志【OPC-13】', 'card_type_position': 964,
+                'card_id': 5284, 'card_name': '蒙奇·D·路飞', 'card_number': 'OP13-001', 'card_rarity': 'L',
+                'card_img': 'https://source.windoent.com/OnePiecePc/Picture/1757929283612OP13-001.png',
+                'card_life': '4', 'card_attribute': '打', 'card_power': '5000', 'card_attack': '-',
+                'card_color': '红/绿', 'subscript': 4, 'card_features': '超新星/草帽一伙',
+                'card_text_desc': '【咚!!×1】【对方的攻击时】我方处于活跃状态的咚!!不多于5张的场合,可以将我方任意张数的咚!!转为休息状态。每有1张转为休息状态的咚!!,本次战斗中,此领袖或我方最多1张拥有《草帽一伙》特征的角色力量+2000。',
+                'card_offer_type': '补充包 继承的意志【OPC-13】', 'crawler_language': '简中'}
+    sql_pool.insert_one_or_dict(table="one_piece_record", data=data_dic)

+ 15 - 12
pokemon_tcg_spider/start_pokemon_spider.py

@@ -6,8 +6,10 @@ import time
 import schedule
 import threading
 from loguru import logger
-from jp_pokemon_card_spider import JPPokemonCardSpider
-from us_pokemon_card_spider import us_pokemon_main
+
+from jian_pokemon_card_spider import jz_pokemon_main
+from tcg_jp_pokemon_spider import jp_pokemon_main
+from tcg_us_pokemon_spider import us_pokemon_main
 from fan_pokemon_card_spider import fz_pokemon_main
 
 logger.remove()
@@ -36,20 +38,21 @@ def schedule_task():
     change_card_main
     """
     # 立即运行一次任务
-    jp_spider = JPPokemonCardSpider(log=logger)
-    run_threaded(jp_spider.run)
+    # jp_spider = JPPokemonCardSpider(log=logger)
+    # run_threaded(jp_spider.run)
     #
-    run_threaded(us_pokemon_main(log=logger))
-    run_threaded(fz_pokemon_main(log=logger))
+    # run_threaded(us_pokemon_main(log=logger))
+    # run_threaded(fz_pokemon_main(log=logger))
 
     # 设置定时任务
-    schedule.every().day.at("02:01").do(run_threaded, JPPokemonCardSpider(log=logger).run)
-    schedule.every().day.at("01:01").do(run_threaded, us_pokemon_main, log=logger)
+    # schedule.every().day.at("02:01").do(run_threaded, JPPokemonCardSpider(log=logger).run)
+    # schedule.every().day.at("01:01").do(run_threaded, us_pokemon_main, log=logger)
 
-    schedule.every().tuesday.at("02:01").do(run_threaded, fz_pokemon_main, log=logger)
-    #
-    # schedule.every().day.at("03:01").do(run_threaded, hoopi_mall_card_main, log=logger)
-    # schedule.every().day.at("04:01").do(run_threaded, hoopi_mall_group_main, log=logger)
+    # 20260327重启pokemon任务, jp和us的爬虫任务, 改为从tcg网站获取
+    schedule.every().day.at("02:01").do(run_threaded, jp_pokemon_main, log=logger)
+    schedule.every().day.at("01:01").do(run_threaded, us_pokemon_main, log=logger)
+    schedule.every().tuesday.at("03:01").do(run_threaded, fz_pokemon_main, log=logger)
+    schedule.every().tuesday.at("04:01").do(run_threaded, jz_pokemon_main, log=logger)
 
     while True:
         schedule.run_pending()

+ 5 - 1
pokemon_tcg_spider/us_pokemon_card_spider.py

@@ -181,7 +181,11 @@ def get_details(log, sql_id_detail_url: tuple, sql_pool):
 
     selector = Selector(response.text)
     card_name = selector.xpath('//div[@class="card-description"]/div/h1/text()').get()
-    card_name = card_name.strip() if card_name else ''
+    card_name = card_name.strip() if card_name else None
+
+    if not card_name:
+        log.error(f"获取商品详情失败, 商品ID: {sql_id}, 商品详情: {detail_url}")
+        raise Exception("获取商品详情失败")
 
     card_no_rarity = selector.xpath('//div[@class="stats-footer"]/span/text()').get()
     data_dict = {