1 nedēļu atpakaļ · 0e7f3d4842
--- a/wheatland_spider/YamlLoader.py
+++ b/wheatland_spider/YamlLoader.py
@@ -0,0 +1,98 @@
 
				+# -*- coding: utf-8 -*-
			
 
				+# Author : Charley
			
 
				+# Python : 3.10.8
			
 
				+# Date   : 2025/12/22 10:44
			
 
				+import os, re
			
 
				+import yaml
			
 
				+
			
 
				+regex = re.compile(r'^\$\{(?P<ENV>[A-Z_\-]+:)?(?P<VAL>[\w.]+)}$')
			
 
				+
			
 
				+
			
 
				+class YamlConfig:
			
 
				+    def __init__(self, config):
			
 
				+        self.config = config
			
 
				+
			
 
				+    def get(self, key: str):
			
 
				+        return YamlConfig(self.config.get(key))
			
 
				+
			
 
				+    def getValueAsString(self, key: str):
			
 
				+        try:
			
 
				+            match = regex.match(self.config[key])
			
 
				+            group = match.groupdict()
			
 
				+            if group['ENV'] is not None:
			
 
				+                env = group['ENV'][:-1]
			
 
				+                return os.getenv(env, group['VAL'])
			
 
				+            return None
			
 
				+        except:
			
 
				+            return self.config[key]
			
 
				+
			
 
				+    def getValueAsInt(self, key: str):
			
 
				+        try:
			
 
				+            match = regex.match(self.config[key])
			
 
				+            group = match.groupdict()
			
 
				+            if group['ENV'] is not None:
			
 
				+                env = group['ENV'][:-1]
			
 
				+                return int(os.getenv(env, group['VAL']))
			
 
				+            return 0
			
 
				+        except:
			
 
				+            return int(self.config[key])
			
 
				+
			
 
				+    def getValueAsBool(self, key: str):
			
 
				+        try:
			
 
				+            match = regex.match(self.config[key])
			
 
				+            group = match.groupdict()
			
 
				+            if group['ENV'] is not None:
			
 
				+                env = group['ENV'][:-1]
			
 
				+                return bool(os.getenv(env, group['VAL']))
			
 
				+            return False
			
 
				+        except:
			
 
				+            return bool(self.config[key])
			
 
				+
			
 
				+
			
 
				+def _resolve_path(path: str) -> str:
			
 
				+    """
			
 
				+    解析 yaml 文件路径，按优先级查找：
			
 
				+      1) 绝对路径或 cwd 下存在 → 直接用（保留旧行为，向后兼容）
			
 
				+      2) 调用方主脚本所在目录 → 兜底，方便打包后从任意 cwd 启动
			
 
				+    :param path: (str) 用户传入的路径，默认 'application.yml'
			
 
				+    :return: (str) 实际可读取的完整路径；找不到则返回原 path 让 open() 抛错
			
 
				+    """
			
 
				+    # 1) 旧行为：cwd 或绝对路径
			
 
				+    if os.path.exists(path):
			
 
				+        return path
			
 
				+
			
 
				+    # 2) 主脚本目录（__main__.__file__）
			
 
				+    try:
			
 
				+        import __main__
			
 
				+        main_file = getattr(__main__, '__file__', None)
			
 
				+        if main_file:
			
 
				+            candidate = os.path.join(os.path.dirname(os.path.abspath(main_file)), path)
			
 
				+            if os.path.exists(candidate):
			
 
				+                return candidate
			
 
				+    except Exception:
			
 
				+        pass
			
 
				+
			
 
				+    return path
			
 
				+
			
 
				+
			
 
				+def readYaml(path: str = 'application.yml', profile: str = None) -> YamlConfig:
			
 
				+    """
			
 
				+    读取 yaml 配置。
			
 
				+    :param path: (str) yaml 文件路径，默认 'application.yml'。
			
 
				+                       优先 cwd / 绝对路径（保留旧行为），找不到再 fallback 到主脚本所在目录。
			
 
				+    :param profile: (str) 可选环境后缀，如 'dev' 会额外加载 'application-dev.yml' 并 update
			
 
				+    :return: (YamlConfig) 配置访问对象
			
 
				+    :raises FileNotFoundError: cwd 和主脚本目录都找不到时抛出
			
 
				+    """
			
 
				+    real_path = _resolve_path(path)
			
 
				+    with open(real_path, encoding='utf-8') as fd:
			
 
				+        conf = yaml.load(fd, Loader=yaml.FullLoader)
			
 
				+
			
 
				+    if profile is not None:
			
 
				+        result = real_path.rsplit('.', 1)
			
 
				+        profiledYaml = f'{result[0]}-{profile}.{result[1]}'
			
 
				+        if os.path.exists(profiledYaml):
			
 
				+            with open(profiledYaml, encoding='utf-8') as fd:
			
 
				+                conf.update(yaml.load(fd, Loader=yaml.FullLoader))
			
 
				+
			
 
				+    return YamlConfig(conf)
			
--- a/wheatland_spider/application.yml
+++ b/wheatland_spider/application.yml
@@ -0,0 +1,6 @@
 
				+mysql:
			
 
				+  host: ${MYSQL_HOST:100.64.0.21}
			
 
				+  port: ${MYSQL_PROT:3306}
			
 
				+  username: ${MYSQL_USERNAME:crawler}
			
 
				+  password: ${MYSQL_PASSWORD:Pass2022}
			
 
				+  db: ${MYSQL_DATABASE:crawler}
			
--- a/wheatland_spider/mysql_pool.py
+++ b/wheatland_spider/mysql_pool.py
@@ -0,0 +1,671 @@
 
				+# -*- coding: utf-8 -*-
			
 
				+# Author : Charley
			
 
				+# Python : 3.10.8
			
 
				+# Date   : 2025/3/25 14:14
			
 
				+import re
			
 
				+import pymysql
			
 
				+import YamlLoader
			
 
				+from loguru import logger
			
 
				+from dbutils.pooled_db import PooledDB
			
 
				+
			
 
				+# 获取yaml配置
			
 
				+yaml = YamlLoader.readYaml()
			
 
				+mysqlYaml = yaml.get("mysql")
			
 
				+sql_host = mysqlYaml.getValueAsString("host")
			
 
				+sql_port = mysqlYaml.getValueAsInt("port")
			
 
				+sql_user = mysqlYaml.getValueAsString("username")
			
 
				+sql_password = mysqlYaml.getValueAsString("password")
			
 
				+sql_db = mysqlYaml.getValueAsString("db")
			
 
				+
			
 
				+
			
 
				+class MySQLConnectionPool:
			
 
				+    """
			
 
				+    MySQL连接池
			
 
				+    """
			
 
				+
			
 
				+    def __init__(self, mincached=1, maxcached=2, maxconnections=3, log=None):
			
 
				+        """
			
 
				+        初始化连接池
			
 
				+        :param mincached: 初始化时，链接池中至少创建的链接，0表示不创建
			
 
				+        :param maxcached: 池中空闲连接的最大数目（0 或 None 表示池大小不受限制）
			
 
				+        :param maxconnections: 允许的最大连接数（0 或 None 表示任意数量的连接）
			
 
				+        :param log: 自定义日志记录器
			
 
				+        """
			
 
				+        # 使用 loguru 的 logger，如果传入了其他 logger，则使用传入的 logger
			
 
				+        self.log = log or logger
			
 
				+        self.pool = PooledDB(
			
 
				+            creator=pymysql,
			
 
				+            mincached=mincached,
			
 
				+            maxcached=maxcached,
			
 
				+            maxconnections=maxconnections,
			
 
				+            blocking=True,  # 连接池中如果没有可用连接后，是否阻塞等待。True，等待；False，不等待然后报错
			
 
				+            host=sql_host,
			
 
				+            port=sql_port,
			
 
				+            user=sql_user,
			
 
				+            password=sql_password,
			
 
				+            database=sql_db,
			
 
				+            ping=2,  # 每次执行前检查连接有效性，防止使用已断开的连接
			
 
				+            connect_timeout=5,  # 连接超时时间（秒）
			
 
				+            # read_timeout=30,  # 读取超时时间（秒）
			
 
				+            write_timeout=30  # 写入超时时间（秒）
			
 
				+        )
			
 
				+
			
 
				+    # def _execute(self, query, args=None, commit=False):
			
 
				+    #     """
			
 
				+    #     执行SQL
			
 
				+    #     :param query: SQL语句
			
 
				+    #     :param args: SQL参数
			
 
				+    #     :param commit: 是否提交事务
			
 
				+    #     :return: 查询结果
			
 
				+    #     """
			
 
				+    #     try:
			
 
				+    #         with self.pool.connection() as conn:
			
 
				+    #             with conn.cursor() as cursor:
			
 
				+    #                 cursor.execute(query, args)
			
 
				+    #                 if commit:
			
 
				+    #                     conn.commit()
			
 
				+    #                 self.log.debug(f"sql _execute, Query: {query}, Rows: {cursor.rowcount}")
			
 
				+    #                 return cursor
			
 
				+    #     except Exception as e:
			
 
				+    #         if commit and conn:
			
 
				+    #             conn.rollback()
			
 
				+    #         self.log.exception(f"Error executing query: {e}, Query: {query}, Args: {args}")
			
 
				+    #         raise e
			
 
				+
			
 
				+    def _execute(self, query, args=None, commit=False):
			
 
				+        """
			
 
				+        执行SQL（带断连重试）
			
 
				+        :param query: SQL语句
			
 
				+        :param args: SQL参数
			
 
				+        :param commit: 是否提交事务
			
 
				+        :return: 查询结果
			
 
				+        """
			
 
				+        conn = None
			
 
				+        for attempt in range(2):  # 最多重试1次
			
 
				+            try:
			
 
				+                with self.pool.connection() as conn:
			
 
				+                    with conn.cursor() as cursor:
			
 
				+                        cursor.execute(query, args)
			
 
				+                        if commit:
			
 
				+                            conn.commit()
			
 
				+                        self.log.debug(f"sql _execute, Query: {query}, Rows: {cursor.rowcount}")
			
 
				+                        return cursor
			
 
				+            except pymysql.err.InterfaceError as e:
			
 
				+                # 连接已断开，重试一次
			
 
				+                if attempt == 0:
			
 
				+                    self.log.warning(f"数据库连接断开，正在重试... Error: {e}")
			
 
				+                    continue
			
 
				+                self.log.error(f"重试后仍失败: {e}, Query: {query}")
			
 
				+                raise e
			
 
				+            except pymysql.err.IntegrityError:
			
 
				+                # 完整性错误（如重复条目）交由上层处理，避免在此打印完整堆栈污染日志
			
 
				+                if commit and conn:
			
 
				+                    try:
			
 
				+                        conn.rollback()
			
 
				+                    except Exception:
			
 
				+                        pass
			
 
				+                raise
			
 
				+            except Exception as e:
			
 
				+                if commit and conn:
			
 
				+                    try:
			
 
				+                        conn.rollback()
			
 
				+                    except Exception:
			
 
				+                        pass
			
 
				+                self.log.exception(f"Error executing query: {e}, Query: {query}, Args: {args}")
			
 
				+                raise e
			
 
				+
			
 
				+    def select_one(self, query, args=None):
			
 
				+        """
			
 
				+        执行查询，返回单个结果
			
 
				+        :param query: 查询语句
			
 
				+        :param args: 查询参数
			
 
				+        :return: 查询结果
			
 
				+        """
			
 
				+        cursor = self._execute(query, args)
			
 
				+        return cursor.fetchone()
			
 
				+
			
 
				+    def select_all(self, query, args=None):
			
 
				+        """
			
 
				+        执行查询，返回所有结果
			
 
				+        :param query: 查询语句
			
 
				+        :param args: 查询参数
			
 
				+        :return: 查询结果
			
 
				+        """
			
 
				+        cursor = self._execute(query, args)
			
 
				+        return cursor.fetchall()
			
 
				+
			
 
				+    def insert_one(self, query, args):
			
 
				+        """
			
 
				+        执行单条插入语句
			
 
				+        :param query: 插入语句
			
 
				+        :param args: 插入参数
			
 
				+        """
			
 
				+        self.log.info('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>data insert_one 入库中>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>')
			
 
				+        cursor = self._execute(query, args, commit=True)
			
 
				+        return cursor.lastrowid  # 返回插入的ID
			
 
				+
			
 
				+    def insert_all(self, query, args_list):
			
 
				+        """
			
 
				+        执行批量插入语句,如果失败则逐条插入
			
 
				+        :param query: 插入语句
			
 
				+        :param args_list: 插入参数列表
			
 
				+        """
			
 
				+        conn = None
			
 
				+        cursor = None
			
 
				+        try:
			
 
				+            conn = self.pool.connection()
			
 
				+            cursor = conn.cursor()
			
 
				+            cursor.executemany(query, args_list)
			
 
				+            conn.commit()
			
 
				+            self.log.debug(f"sql insert_all, SQL: {query[:100]}..., Rows: {cursor.rowcount}")
			
 
				+            self.log.info('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>data insert_all 入库中>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>')
			
 
				+        except pymysql.err.IntegrityError as e:
			
 
				+            if "Duplicate entry" in str(e):
			
 
				+                conn.rollback()
			
 
				+                self.log.warning(f"批量插入遇到重复,开始逐条插入。错误: {e}")
			
 
				+                rowcount = 0
			
 
				+                for args in args_list:
			
 
				+                    try:
			
 
				+                        self.insert_one(query, args)
			
 
				+                        rowcount += 1
			
 
				+                    except pymysql.err.IntegrityError as e2:
			
 
				+                        if "Duplicate entry" in str(e2):
			
 
				+                            self.log.debug(f"跳过重复条目: {e2}")
			
 
				+                        else:
			
 
				+                            self.log.error(f"插入失败: {e2}")
			
 
				+                    except Exception as e2:
			
 
				+                        self.log.error(f"插入失败: {e2}")
			
 
				+                self.log.info(f"逐条插入完成: {rowcount}/{len(args_list)}条")
			
 
				+            else:
			
 
				+                conn.rollback()
			
 
				+                self.log.exception(f"数据库完整性错误: {e}")
			
 
				+                raise e
			
 
				+        except Exception as e:
			
 
				+            conn.rollback()
			
 
				+            self.log.exception(f"批量插入失败: {e}")
			
 
				+            raise e
			
 
				+        finally:
			
 
				+            if cursor:
			
 
				+                cursor.close()
			
 
				+            if conn:
			
 
				+                conn.close()
			
 
				+
			
 
				+    def insert_one_or_dict(self, table=None, data=None, query=None, args=None, commit=True, ignore=False):
			
 
				+        """
			
 
				+        单条插入（支持字典或原始SQL）
			
 
				+        :param table: 表名（字典插入时必需）
			
 
				+        :param data: 字典数据 {列名: 值}
			
 
				+        :param query: 直接SQL语句（与data二选一）
			
 
				+        :param args: SQL参数（query使用时必需）
			
 
				+        :param commit: 是否自动提交
			
 
				+        :param ignore: 是否使用ignore
			
 
				+        :return: 最后插入ID
			
 
				+        """
			
 
				+        if data is not None:
			
 
				+            if not isinstance(data, dict):
			
 
				+                raise ValueError("Data must be a dictionary")
			
 
				+
			
 
				+            keys = ', '.join([self._safe_identifier(k) for k in data.keys()])
			
 
				+            values = ', '.join(['%s'] * len(data))
			
 
				+
			
 
				+            # 构建 INSERT IGNORE 语句
			
 
				+            ignore_clause = "IGNORE" if ignore else ""
			
 
				+            query = f"INSERT {ignore_clause} INTO {self._safe_identifier(table)} ({keys}) VALUES ({values})"
			
 
				+            args = tuple(data.values())
			
 
				+        elif query is None:
			
 
				+            raise ValueError("Either data or query must be provided")
			
 
				+
			
 
				+        try:
			
 
				+            cursor = self._execute(query, args, commit)
			
 
				+            self.log.info(f"sql insert_one_or_dict, Table: {table}, Rows: {cursor.rowcount}")
			
 
				+            self.log.info('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>data insert_one_or_dict 入库中>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>')
			
 
				+            return cursor.lastrowid
			
 
				+        except pymysql.err.IntegrityError as e:
			
 
				+            if "Duplicate entry" in str(e):
			
 
				+                # 重复条目用 warning 简短输出，不打印堆栈
			
 
				+                self.log.warning(f"插入跳过-重复条目 Table: {table}, {e.args[1] if len(e.args) > 1 else e}")
			
 
				+                return -1  # 返回 -1 表示重复条目被跳过
			
 
				+            else:
			
 
				+                self.log.error(f"数据库完整性错误 Table: {table}, Error: {e}")
			
 
				+                raise
			
 
				+        except Exception as e:
			
 
				+            self.log.error(f"insert_one_or_dict 失败 Table: {table}, Error: {e}")
			
 
				+            raise
			
 
				+
			
 
				+    def insert_many(self, table=None, data_list=None, query=None, args_list=None, batch_size=1000, commit=True,
			
 
				+                    ignore=False):
			
 
				+        """
			
 
				+        批量插入（支持字典列表或原始SQL）
			
 
				+        :param table: 表名（字典插入时必需）
			
 
				+        :param data_list: 字典列表 [{列名: 值}]
			
 
				+        :param query: 直接SQL语句（与data_list二选一）
			
 
				+        :param args_list: SQL参数列表（query使用时必需）
			
 
				+        :param batch_size: 分批大小
			
 
				+        :param commit: 是否自动提交
			
 
				+        :param ignore: 是否使用ignore
			
 
				+        :return: 影响行数
			
 
				+        """
			
 
				+        if data_list is not None:
			
 
				+            if not data_list or not isinstance(data_list[0], dict):
			
 
				+                raise ValueError("Data_list must be a non-empty list of dictionaries")
			
 
				+
			
 
				+            keys = ', '.join([self._safe_identifier(k) for k in data_list[0].keys()])
			
 
				+            values = ', '.join(['%s'] * len(data_list[0]))
			
 
				+
			
 
				+            # 构建 INSERT IGNORE 语句
			
 
				+            ignore_clause = "IGNORE" if ignore else ""
			
 
				+            query = f"INSERT {ignore_clause} INTO {self._safe_identifier(table)} ({keys}) VALUES ({values})"
			
 
				+            args_list = [tuple(d.values()) for d in data_list]
			
 
				+        elif query is None:
			
 
				+            raise ValueError("Either data_list or query must be provided")
			
 
				+
			
 
				+        total = 0
			
 
				+        for i in range(0, len(args_list), batch_size):
			
 
				+            batch = args_list[i:i + batch_size]
			
 
				+            try:
			
 
				+                with self.pool.connection() as conn:
			
 
				+                    with conn.cursor() as cursor:
			
 
				+                        cursor.executemany(query, batch)
			
 
				+                        if commit:
			
 
				+                            conn.commit()
			
 
				+                        total += cursor.rowcount
			
 
				+            except pymysql.err.IntegrityError as e:
			
 
				+                # 处理唯一索引冲突
			
 
				+                if "Duplicate entry" in str(e):
			
 
				+                    if ignore:
			
 
				+                        # 如果使用了 INSERT IGNORE,理论上不会进这里,但以防万一
			
 
				+                        self.log.warning(f"批量插入遇到重复条目(ignore模式): {e}")
			
 
				+                    else:
			
 
				+                        # 没有使用 IGNORE,降级为逐条插入
			
 
				+                        self.log.warning(f"批量插入遇到重复条目,开始逐条插入。错误: {e}")
			
 
				+                        if commit:
			
 
				+                            conn.rollback()
			
 
				+                        
			
 
				+                        rowcount = 0
			
 
				+                        for j, args in enumerate(batch):
			
 
				+                            try:
			
 
				+                                if data_list:
			
 
				+                                    # 字典模式
			
 
				+                                    self.insert_one_or_dict(
			
 
				+                                        table=table,
			
 
				+                                        data=dict(zip(data_list[0].keys(), args)),
			
 
				+                                        commit=commit,
			
 
				+                                        ignore=False  # 单条插入时手动捕获重复
			
 
				+                                    )
			
 
				+                                else:
			
 
				+                                    # 原始SQL模式
			
 
				+                                    self.insert_one(query, args)
			
 
				+                                rowcount += 1
			
 
				+                            except pymysql.err.IntegrityError as e2:
			
 
				+                                if "Duplicate entry" in str(e2):
			
 
				+                                    self.log.debug(f"跳过重复条目[{i+j+1}]: {e2}")
			
 
				+                                else:
			
 
				+                                    self.log.error(f"插入失败[{i+j+1}]: {e2}")
			
 
				+                            except Exception as e2:
			
 
				+                                self.log.error(f"插入失败[{i+j+1}]: {e2}")
			
 
				+                        total += rowcount
			
 
				+                        self.log.info(f"批次逐条插入完成: 成功{rowcount}/{len(batch)}条")
			
 
				+                else:
			
 
				+                    # 其他完整性错误
			
 
				+                    self.log.exception(f"数据库完整性错误: {e}")
			
 
				+                    if commit:
			
 
				+                        conn.rollback()
			
 
				+                    raise e
			
 
				+            except Exception as e:
			
 
				+                # 其他数据库错误
			
 
				+                self.log.exception(f"批量插入失败: {e}")
			
 
				+                if commit:
			
 
				+                    conn.rollback()
			
 
				+                raise e
			
 
				+        if table:
			
 
				+            self.log.info(f"sql insert_many, Table: {table}, Total Rows: {total}")
			
 
				+        else:
			
 
				+            self.log.info(f"sql insert_many, Query: {query}, Total Rows: {total}")
			
 
				+        return total
			
 
				+
			
 
				+    def insert_many_two(self, table=None, data_list=None, query=None, args_list=None, batch_size=1000, commit=True,
			
 
				+                        ignore=False):
			
 
				+        """
			
 
				+        批量插入(支持字典列表或原始SQL) - 备用方法
			
 
				+        :param table: 表名(字典插入时必需)
			
 
				+        :param data_list: 字典列表 [{列名: 值}]
			
 
				+        :param query: 直接SQL语句(与data_list二选一)
			
 
				+        :param args_list: SQL参数列表(query使用时必需)
			
 
				+        :param batch_size: 分批大小
			
 
				+        :param commit: 是否自动提交
			
 
				+        :param ignore: 是否使用INSERT IGNORE
			
 
				+        :return: 影响行数
			
 
				+        """
			
 
				+        if data_list is not None:
			
 
				+            if not data_list or not isinstance(data_list[0], dict):
			
 
				+                raise ValueError("Data_list must be a non-empty list of dictionaries")
			
 
				+            keys = ', '.join([self._safe_identifier(k) for k in data_list[0].keys()])
			
 
				+            values = ', '.join(['%s'] * len(data_list[0]))
			
 
				+            ignore_clause = "IGNORE" if ignore else ""
			
 
				+            query = f"INSERT {ignore_clause} INTO {self._safe_identifier(table)} ({keys}) VALUES ({values})"
			
 
				+            args_list = [tuple(d.values()) for d in data_list]
			
 
				+        elif query is None:
			
 
				+            raise ValueError("Either data_list or query must be provided")
			
 
				+    
			
 
				+        total = 0
			
 
				+        for i in range(0, len(args_list), batch_size):
			
 
				+            batch = args_list[i:i + batch_size]
			
 
				+            try:
			
 
				+                with self.pool.connection() as conn:
			
 
				+                    with conn.cursor() as cursor:
			
 
				+                        cursor.executemany(query, batch)
			
 
				+                        if commit:
			
 
				+                            conn.commit()
			
 
				+                        total += cursor.rowcount
			
 
				+            except pymysql.err.IntegrityError as e:
			
 
				+                if "Duplicate entry" in str(e) and not ignore:
			
 
				+                    self.log.warning(f"批量插入遇到重复,降级为逐条插入: {e}")
			
 
				+                    if commit:
			
 
				+                        conn.rollback()
			
 
				+                    rowcount = 0
			
 
				+                    for args in batch:
			
 
				+                        try:
			
 
				+                            self.insert_one(query, args)
			
 
				+                            rowcount += 1
			
 
				+                        except pymysql.err.IntegrityError as e2:
			
 
				+                            if "Duplicate entry" in str(e2):
			
 
				+                                self.log.debug(f"跳过重复条目: {e2}")
			
 
				+                            else:
			
 
				+                                self.log.error(f"插入失败: {e2}")
			
 
				+                        except Exception as e2:
			
 
				+                            self.log.error(f"插入失败: {e2}")
			
 
				+                    total += rowcount
			
 
				+                else:
			
 
				+                    self.log.exception(f"数据库完整性错误: {e}")
			
 
				+                    if commit:
			
 
				+                        conn.rollback()
			
 
				+                    raise e
			
 
				+            except Exception as e:
			
 
				+                self.log.exception(f"批量插入失败: {e}")
			
 
				+                if commit:
			
 
				+                    conn.rollback()
			
 
				+                raise e
			
 
				+        self.log.info(f"sql insert_many_two, Table: {table}, Total Rows: {total}")
			
 
				+        return total
			
 
				+
			
 
				+    def insert_too_many(self, query, args_list, batch_size=1000):
			
 
				+        """
			
 
				+        执行批量插入语句，分片提交, 单次插入大于十万+时可用, 如果失败则降级为逐条插入
			
 
				+        :param query: 插入语句
			
 
				+        :param args_list: 插入参数列表
			
 
				+        :param batch_size: 每次插入的条数
			
 
				+        """
			
 
				+        self.log.info(f"sql insert_too_many, Query: {query}, Total Rows: {len(args_list)}")
			
 
				+        for i in range(0, len(args_list), batch_size):
			
 
				+            batch = args_list[i:i + batch_size]
			
 
				+            try:
			
 
				+                with self.pool.connection() as conn:
			
 
				+                    with conn.cursor() as cursor:
			
 
				+                        cursor.executemany(query, batch)
			
 
				+                        conn.commit()
			
 
				+                        self.log.debug(f"insert_too_many -> Total Rows: {len(batch)}")
			
 
				+            except Exception as e:
			
 
				+                self.log.error(f"insert_too_many error. Trying single insert. Error: {e}")
			
 
				+                # 当前批次降级为单条插入
			
 
				+                for args in batch:
			
 
				+                    self.insert_one(query, args)
			
 
				+
			
 
				+    def update_one(self, query, args):
			
 
				+        """
			
 
				+        执行单条更新语句
			
 
				+        :param query: 更新语句
			
 
				+        :param args: 更新参数
			
 
				+        """
			
 
				+        self.log.info('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>data update_one 更新中>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>')
			
 
				+        return self._execute(query, args, commit=True)
			
 
				+
			
 
				+    def update_all(self, query, args_list):
			
 
				+        """
			
 
				+        执行批量更新语句，如果失败则逐条更新
			
 
				+        :param query: 更新语句
			
 
				+        :param args_list: 更新参数列表
			
 
				+        """
			
 
				+        conn = None
			
 
				+        cursor = None
			
 
				+        try:
			
 
				+            conn = self.pool.connection()
			
 
				+            cursor = conn.cursor()
			
 
				+            cursor.executemany(query, args_list)
			
 
				+            conn.commit()
			
 
				+            self.log.debug(f"sql update_all, SQL: {query}, Rows: {len(args_list)}")
			
 
				+            self.log.info('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>data update_all 更新中>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>')
			
 
				+        except Exception as e:
			
 
				+            conn.rollback()
			
 
				+            self.log.error(f"Error executing query: {e}")
			
 
				+            # 如果批量更新失败，则逐条更新
			
 
				+            rowcount = 0
			
 
				+            for args in args_list:
			
 
				+                self.update_one(query, args)
			
 
				+                rowcount += 1
			
 
				+            self.log.debug(f'Batch update failed. Updated {rowcount} rows individually.')
			
 
				+        finally:
			
 
				+            if cursor:
			
 
				+                cursor.close()
			
 
				+            if conn:
			
 
				+                conn.close()
			
 
				+
			
 
				+    def update_one_or_dict(self, table=None, data=None, condition=None, query=None, args=None, commit=True):
			
 
				+        """
			
 
				+        单条更新（支持字典或原始SQL）
			
 
				+        :param table: 表名（字典模式必需）
			
 
				+        :param data: 字典数据 {列名: 值}（与 query 二选一）
			
 
				+        :param condition: 更新条件，支持以下格式：
			
 
				+            - 字典: {"id": 1} → "WHERE id = %s"
			
 
				+            - 字符串: "id = 1" → "WHERE id = 1"（需自行确保安全）
			
 
				+            - 元组: ("id = %s", [1]) → "WHERE id = %s"（参数化查询）
			
 
				+        :param query: 直接SQL语句（与 data 二选一）
			
 
				+        :param args: SQL参数（query 模式下必需）
			
 
				+        :param commit: 是否自动提交
			
 
				+        :return: 影响行数
			
 
				+        :raises: ValueError 参数校验失败时抛出
			
 
				+        """
			
 
				+        # 参数校验
			
 
				+        if data is not None:
			
 
				+            if not isinstance(data, dict):
			
 
				+                raise ValueError("Data must be a dictionary")
			
 
				+            if table is None:
			
 
				+                raise ValueError("Table name is required for dictionary update")
			
 
				+            if condition is None:
			
 
				+                raise ValueError("Condition is required for dictionary update")
			
 
				+
			
 
				+            # 构建 SET 子句
			
 
				+            set_clause = ", ".join([f"{self._safe_identifier(k)} = %s" for k in data.keys()])
			
 
				+            set_values = list(data.values())
			
 
				+
			
 
				+            # 解析条件
			
 
				+            condition_clause, condition_args = self._parse_condition(condition)
			
 
				+            query = f"UPDATE {self._safe_identifier(table)} SET {set_clause} WHERE {condition_clause}"
			
 
				+            args = set_values + condition_args
			
 
				+
			
 
				+        elif query is None:
			
 
				+            raise ValueError("Either data or query must be provided")
			
 
				+
			
 
				+        # 执行更新
			
 
				+        cursor = self._execute(query, args, commit)
			
 
				+        # self.log.debug(
			
 
				+        #     f"Updated table={table}, rows={cursor.rowcount}, query={query[:100]}...",
			
 
				+        #     extra={"table": table, "rows": cursor.rowcount}
			
 
				+        # )
			
 
				+        return cursor.rowcount
			
 
				+
			
 
				+    def _parse_condition(self, condition):
			
 
				+        """
			
 
				+        解析条件为 (clause, args) 格式
			
 
				+        :param condition: 字典/字符串/元组
			
 
				+        :return: (str, list) SQL 子句和参数列表
			
 
				+        """
			
 
				+        if isinstance(condition, dict):
			
 
				+            clause = " AND ".join([f"{self._safe_identifier(k)} = %s" for k in condition.keys()])
			
 
				+            args = list(condition.values())
			
 
				+        elif isinstance(condition, str):
			
 
				+            clause = condition  # 注意：需调用方确保安全
			
 
				+            args = []
			
 
				+        elif isinstance(condition, (tuple, list)) and len(condition) == 2:
			
 
				+            clause, args = condition[0], condition[1]
			
 
				+            if not isinstance(args, (list, tuple)):
			
 
				+                args = [args]
			
 
				+        else:
			
 
				+            raise ValueError("Condition must be dict/str/(clause, args)")
			
 
				+        return clause, args
			
 
				+
			
 
				+    def update_many(self, table=None, data_list=None, condition_list=None, query=None, args_list=None, batch_size=500,
			
 
				+                    commit=True):
			
 
				+        """
			
 
				+        批量更新（支持字典列表或原始SQL）
			
 
				+        :param table: 表名（字典插入时必需）
			
 
				+        :param data_list: 字典列表 [{列名: 值}]
			
 
				+        :param condition_list: 条件列表（必须为字典，与data_list等长）
			
 
				+        :param query: 直接SQL语句（与data_list二选一）
			
 
				+        :param args_list: SQL参数列表（query使用时必需）
			
 
				+        :param batch_size: 分批大小
			
 
				+        :param commit: 是否自动提交
			
 
				+        :return: 影响行数
			
 
				+        """
			
 
				+        if data_list is not None:
			
 
				+            if not data_list or not isinstance(data_list[0], dict):
			
 
				+                raise ValueError("Data_list must be a non-empty list of dictionaries")
			
 
				+            if condition_list is None or len(data_list) != len(condition_list):
			
 
				+                raise ValueError("Condition_list must be provided and match the length of data_list")
			
 
				+            if not all(isinstance(cond, dict) for cond in condition_list):
			
 
				+                raise ValueError("All elements in condition_list must be dictionaries")
			
 
				+
			
 
				+            # 获取第一个数据项和条件项的键
			
 
				+            first_data_keys = set(data_list[0].keys())
			
 
				+            first_cond_keys = set(condition_list[0].keys())
			
 
				+
			
 
				+            # 构造基础SQL
			
 
				+            set_clause = ', '.join([self._safe_identifier(k) + ' = %s' for k in data_list[0].keys()])
			
 
				+            condition_clause = ' AND '.join([self._safe_identifier(k) + ' = %s' for k in condition_list[0].keys()])
			
 
				+            base_query = f"UPDATE {self._safe_identifier(table)} SET {set_clause} WHERE {condition_clause}"
			
 
				+            total = 0
			
 
				+
			
 
				+            # 分批次处理
			
 
				+            for i in range(0, len(data_list), batch_size):
			
 
				+                batch_data = data_list[i:i + batch_size]
			
 
				+                batch_conds = condition_list[i:i + batch_size]
			
 
				+                batch_args = []
			
 
				+
			
 
				+                # 检查当前批次的结构是否一致
			
 
				+                can_batch = True
			
 
				+                for data, cond in zip(batch_data, batch_conds):
			
 
				+                    data_keys = set(data.keys())
			
 
				+                    cond_keys = set(cond.keys())
			
 
				+                    if data_keys != first_data_keys or cond_keys != first_cond_keys:
			
 
				+                        can_batch = False
			
 
				+                        break
			
 
				+                    batch_args.append(tuple(data.values()) + tuple(cond.values()))
			
 
				+
			
 
				+                if not can_batch:
			
 
				+                    # 结构不一致，转为单条更新
			
 
				+                    for data, cond in zip(batch_data, batch_conds):
			
 
				+                        self.update_one_or_dict(table=table, data=data, condition=cond, commit=commit)
			
 
				+                        total += 1
			
 
				+                    continue
			
 
				+
			
 
				+                # 执行批量更新
			
 
				+                try:
			
 
				+                    with self.pool.connection() as conn:
			
 
				+                        with conn.cursor() as cursor:
			
 
				+                            cursor.executemany(base_query, batch_args)
			
 
				+                            if commit:
			
 
				+                                conn.commit()
			
 
				+                            total += cursor.rowcount
			
 
				+                            self.log.debug(f"Batch update succeeded. Rows: {cursor.rowcount}")
			
 
				+                except Exception as e:
			
 
				+                    if commit:
			
 
				+                        conn.rollback()
			
 
				+                    self.log.error(f"Batch update failed: {e}")
			
 
				+                    # 降级为单条更新
			
 
				+                    for args, data, cond in zip(batch_args, batch_data, batch_conds):
			
 
				+                        try:
			
 
				+                            self._execute(base_query, args, commit=commit)
			
 
				+                            total += 1
			
 
				+                        except Exception as e2:
			
 
				+                            self.log.error(f"Single update failed: {e2}, Data: {data}, Condition: {cond}")
			
 
				+            self.log.info(f"Total updated rows: {total}")
			
 
				+            return total
			
 
				+        elif query is not None:
			
 
				+            # 处理原始SQL和参数列表
			
 
				+            if args_list is None:
			
 
				+                raise ValueError("args_list must be provided when using query")
			
 
				+
			
 
				+            total = 0
			
 
				+            for i in range(0, len(args_list), batch_size):
			
 
				+                batch_args = args_list[i:i + batch_size]
			
 
				+                try:
			
 
				+                    with self.pool.connection() as conn:
			
 
				+                        with conn.cursor() as cursor:
			
 
				+                            cursor.executemany(query, batch_args)
			
 
				+                            if commit:
			
 
				+                                conn.commit()
			
 
				+                            total += cursor.rowcount
			
 
				+                            self.log.debug(f"Batch update succeeded. Rows: {cursor.rowcount}")
			
 
				+                except Exception as e:
			
 
				+                    if commit:
			
 
				+                        conn.rollback()
			
 
				+                    self.log.error(f"Batch update failed: {e}")
			
 
				+                    # 降级为单条更新
			
 
				+                    for args in batch_args:
			
 
				+                        try:
			
 
				+                            self._execute(query, args, commit=commit)
			
 
				+                            total += 1
			
 
				+                        except Exception as e2:
			
 
				+                            self.log.error(f"Single update failed: {e2}, Args: {args}")
			
 
				+            self.log.info(f"Total updated rows: {total}")
			
 
				+            return total
			
 
				+        else:
			
 
				+            raise ValueError("Either data_list or query must be provided")
			
 
				+
			
 
				+    def check_pool_health(self):
			
 
				+        """
			
 
				+        检查连接池中有效连接数
			
 
				+
			
 
				+        # 使用示例
			
 
				+        # 配置 MySQL 连接池
			
 
				+        sql_pool = MySQLConnectionPool(log=log)
			
 
				+        if not sql_pool.check_pool_health():
			
 
				+            log.error("数据库连接池异常")
			
 
				+            raise RuntimeError("数据库连接池异常")
			
 
				+        """
			
 
				+        try:
			
 
				+            with self.pool.connection() as conn:
			
 
				+                conn.ping(reconnect=True)
			
 
				+                return True
			
 
				+        except Exception as e:
			
 
				+            self.log.error(f"Connection pool health check failed: {e}")
			
 
				+            return False
			
 
				+
			
 
				+    def close(self):
			
 
				+        """
			
 
				+        关闭连接池，释放所有连接
			
 
				+        """
			
 
				+        try:
			
 
				+            if hasattr(self, 'pool') and self.pool:
			
 
				+                self.pool.close()
			
 
				+                self.log.info("数据库连接池已关闭")
			
 
				+        except Exception as e:
			
 
				+            self.log.error(f"关闭连接池失败: {e}")
			
 
				+
			
 
				+    @staticmethod
			
 
				+    def _safe_identifier(name):
			
 
				+        """SQL标识符安全校验"""
			
 
				+        if not re.match(r'^[a-zA-Z_][a-zA-Z0-9_]*$', name):
			
 
				+            raise ValueError(f"Invalid SQL identifier: {name}")
			
 
				+        return name
			
 
				+
			
 
				+
			
 
				+if __name__ == '__main__':
			
 
				+    sql_pool = MySQLConnectionPool()
			
 
				+    data_dic = {'card_type_id': 111, 'card_type_name': '补充包 继承的意志【OPC-13】', 'card_type_position': 964,
			
 
				+                'card_id': 5284, 'card_name': '蒙奇·D·路飞', 'card_number': 'OP13-001', 'card_rarity': 'L',
			
 
				+                'card_img': 'https://source.windoent.com/OnePiecePc/Picture/1757929283612OP13-001.png',
			
 
				+                'card_life': '4', 'card_attribute': '打', 'card_power': '5000', 'card_attack': '-',
			
 
				+                'card_color': '红/绿', 'subscript': 4, 'card_features': '超新星/草帽一伙',
			
 
				+                'card_text_desc': '【咚!!×1】【对方的攻击时】我方处于活跃状态的咚!!不多于5张的场合，可以将我方任意张数的咚!!转为休息状态。每有1张转为休息状态的咚!!，本次战斗中，此领袖或我方最多1张拥有《草帽一伙》特征的角色力量+2000。',
			
 
				+                'card_offer_type': '补充包 继承的意志【OPC-13】', 'crawler_language': '简中'}
			
 
				+    sql_pool.insert_one_or_dict(table="one_piece_record", data=data_dic)
			
--- a/wheatland_spider/requirements.txt
+++ b/wheatland_spider/requirements.txt
@@ -0,0 +1,9 @@
 
				+-i https://mirrors.aliyun.com/pypi/simple/
			
 
				+curl_cffi==0.15.1b1
			
 
				+DBUtils==3.1.2
			
 
				+loguru==0.7.3
			
 
				+parsel==1.11.0
			
 
				+PyMySQL==1.1.2
			
 
				+PyYAML==6.0.3
			
 
				+schedule==1.2.2
			
 
				+tenacity==9.1.4
			
--- a/wheatland_spider/wheatland_core.py
+++ b/wheatland_spider/wheatland_core.py
@@ -0,0 +1,401 @@
 
				+# -*- coding: utf-8 -*-
			
 
				+# Author : Charley
			
 
				+# Python : 3.12.10
			
 
				+# Date   : 2026/5/29
			
 
				+"""
			
 
				+Wheatland 公用模块：HTTP 配置、ASP.NET postback 切换 auction、列表解析、详情图解析。
			
 
				+被 wheatland_history.py / wheatland_spider.py 复用。
			
 
				+
			
 
				+目标网站: https://wheatlandauctionservices.com/auctionresults.aspx
			
 
				+逻辑要点:
			
 
				+    1. ASP.NET WebForms (.aspx)，每次 POST 都要带 __VIEWSTATE / __VIEWSTATEGENERATOR / __EVENTVALIDATION。
			
 
				+    2. 顶部 "Select Auction" 下拉框 onchange 触发 __doPostBack(AuctionDDL,'')，POST 回原 URL。
			
 
				+    3. 切换 auction 后 #SearchGrid 表格直接渲染该场次全部 lot（实测无翻页）。
			
 
				+    4. 详情页 LotDetail.aspx?inventoryid=xxx 的 #ThumbPanel 内即为多图链接。
			
 
				+"""
			
 
				+import random
			
 
				+from loguru import logger
			
 
				+from parsel import Selector
			
 
				+from curl_cffi import requests
			
 
				+from curl_cffi.requests import BrowserType
			
 
				+from urllib.parse import urljoin, urlparse, parse_qs
			
 
				+from tenacity import retry, stop_after_attempt, wait_fixed
			
 
				+
			
 
				+BASE_URL = "https://wheatlandauctionservices.com/auctionresults.aspx"
			
 
				+
			
 
				+# 直接用库内置的所有浏览器指纹
			
 
				+client_identifier_list = [b.value for b in BrowserType]
			
 
				+
			
 
				+headers = {
			
 
				+    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
			
 
				+    "accept-language": "en-US,en;q=0.9",
			
 
				+    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
			
 
				+                  "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
			
 
				+}
			
 
				+
			
 
				+# ASP.NET 控件 name（前缀固定）
			
 
				+AUCTION_DDL_NAME = "ctl00$ContentPlaceHolder$AuctionSelector$AuctionDDL"
			
 
				+SEARCH_TB_NAME = "ctl00$ContentPlaceHolder$SearchTB"
			
 
				+SEARCH_BY_DDL_NAME = "ctl00$ContentPlaceHolder$SearchByDDL"
			
 
				+
			
 
				+
			
 
				+def after_log(retry_state):
			
 
				+    """tenacity retry 回调，统一打印重试日志。
			
 
				+
			
 
				+    Args:
			
 
				+        retry_state: tenacity.RetryCallState，retry 框架自动传入。
			
 
				+    """
			
 
				+    if retry_state.args and len(retry_state.args) > 0:
			
 
				+        log = retry_state.args[0]
			
 
				+    else:
			
 
				+        log = logger
			
 
				+
			
 
				+    if retry_state.outcome.failed:
			
 
				+        log.warning(f"Function '{retry_state.fn.__name__}', Attempt {retry_state.attempt_number} Times")
			
 
				+    else:
			
 
				+        log.info(f"Function '{retry_state.fn.__name__}', Attempt {retry_state.attempt_number} succeeded")
			
 
				+
			
 
				+
			
 
				+@retry(stop=stop_after_attempt(5), wait=wait_fixed(2), after=after_log)
			
 
				+def get_proxys(log):
			
 
				+    """获取代理字典，全部请求方法的 proxies 参数都走这里。
			
 
				+
			
 
				+    Args:
			
 
				+        log: logger 对象。
			
 
				+
			
 
				+    Returns:
			
 
				+        dict: requests 风格 {"http": ..., "https": ...} 代理字典；不需要代理时返回 None。
			
 
				+
			
 
				+    Raises:
			
 
				+        Exception: 透传内部异常以便 tenacity 触发重试。
			
 
				+    """
			
 
				+    http_proxy = "http://u1952150085001297:sJMHl4qc4bM0@proxy.123proxy.cn:36931"
			
 
				+    https_proxy = "http://u1952150085001297:sJMHl4qc4bM0@proxy.123proxy.cn:36931"
			
 
				+    try:
			
 
				+        return {"http": http_proxy, "https": https_proxy}
			
 
				+    except Exception as e:
			
 
				+        log.error(f"Error getting proxy: {e}")
			
 
				+        raise e
			
 
				+
			
 
				+
			
 
				+def _pick_hidden(selector, field_name):
			
 
				+    """从页面提取 ASP.NET 隐藏字段（__VIEWSTATE 等）的 value。
			
 
				+
			
 
				+    Args:
			
 
				+        selector (Selector): parsel.Selector 对象。
			
 
				+        field_name (str): 隐藏字段的 name，如 __VIEWSTATE。
			
 
				+
			
 
				+    Returns:
			
 
				+        str: 隐藏字段的值，未找到返回空字符串。
			
 
				+    """
			
 
				+    return selector.xpath(f'//input[@name="{field_name}"]/@value').get() or ""
			
 
				+
			
 
				+
			
 
				+def extract_state(selector):
			
 
				+    """抽取 ASP.NET WebForms 三个隐藏字段。
			
 
				+
			
 
				+    Args:
			
 
				+        selector (Selector): 当前页面 parsel 解析对象。
			
 
				+
			
 
				+    Returns:
			
 
				+        dict: 含 __VIEWSTATE / __VIEWSTATEGENERATOR / __EVENTVALIDATION 三个键的字典。
			
 
				+
			
 
				+    Raises:
			
 
				+        ValueError: 当页面缺失任一隐藏字段时抛出（说明响应异常）。
			
 
				+    """
			
 
				+    state = {}
			
 
				+    for name in ("__VIEWSTATE", "__VIEWSTATEGENERATOR", "__EVENTVALIDATION"):
			
 
				+        value = _pick_hidden(selector, name)
			
 
				+        if not value:
			
 
				+            raise ValueError(f"页面缺失隐藏字段: {name}")
			
 
				+        state[name] = value
			
 
				+    return state
			
 
				+
			
 
				+
			
 
				+def parse_auction_options(selector):
			
 
				+    """从首页解析 Select Auction 下拉框，得到所有可选场次。
			
 
				+
			
 
				+    过滤规则: 跳过 value=-1 的 "All"，以及名字里包含 "Test" 的测试场次。
			
 
				+
			
 
				+    Args:
			
 
				+        selector (Selector): 首页 GET 响应的 parsel 解析对象。
			
 
				+
			
 
				+    Returns:
			
 
				+        list[dict]: 每个元素为 {"id": "49", "name": "April 2026 ..."}。
			
 
				+
			
 
				+    Raises:
			
 
				+        ValueError: 当页面找不到 #AuctionDDL 下拉框时抛出。
			
 
				+    """
			
 
				+    select = selector.css('select#AuctionDDL')
			
 
				+    if not select:
			
 
				+        raise ValueError("找不到 #AuctionDDL 下拉框")
			
 
				+
			
 
				+    result = []
			
 
				+    for opt in select.css('option'):
			
 
				+        aid = (opt.attrib.get("value") or "").strip()
			
 
				+        name = opt.xpath('normalize-space(.)').get() or ""
			
 
				+        # 跳过 "All"（value=-1）、空 value、以及测试场次（名字含 "Test"）
			
 
				+        if not aid or aid == "-1":
			
 
				+            continue
			
 
				+        if "test" in name.lower():
			
 
				+            continue
			
 
				+        result.append({"id": aid, "name": name})
			
 
				+    return result
			
 
				+
			
 
				+
			
 
				+@retry(stop=stop_after_attempt(3), wait=wait_fixed(2), after=after_log)
			
 
				+def get_auction_list(log, session, impersonate):
			
 
				+    """GET 首页，解析出全部 auction 列表（已过滤 All / Test）。
			
 
				+
			
 
				+    Args:
			
 
				+        log: logger 对象。
			
 
				+        session (requests.Session): curl_cffi 会话对象。
			
 
				+        impersonate (str): 浏览器指纹标识，与 setup 时一致。
			
 
				+
			
 
				+    Returns:
			
 
				+        list[dict]: [{"id": "49", "name": "April 2026 ..."}, ...]。
			
 
				+    """
			
 
				+    log.info("获取全部 auction 列表")
			
 
				+    resp = session.get(BASE_URL, headers=headers, impersonate=impersonate,
			
 
				+                       proxies=get_proxys(log), timeout=15)
			
 
				+    resp.raise_for_status()
			
 
				+    sel = Selector(resp.text)
			
 
				+    auctions = parse_auction_options(sel)
			
 
				+    log.info(f"共解析到 {len(auctions)} 个 auction：{[(a['id'], a['name']) for a in auctions[:3]]}...")
			
 
				+    return auctions
			
 
				+
			
 
				+
			
 
				+def parse_inventory_id(href):
			
 
				+    """从 LotDetail.aspx?inventoryid=xxx 中提取 inventoryid。
			
 
				+
			
 
				+    Args:
			
 
				+        href (str): <a> 标签的 href 属性，例如 "LotDetail.aspx?inventoryid=35980"。
			
 
				+
			
 
				+    Returns:
			
 
				+        str: inventoryid 字符串，未匹配到时返回空串。
			
 
				+    """
			
 
				+    if not href:
			
 
				+        return ""
			
 
				+    qs = parse_qs(urlparse(href).query)
			
 
				+    return qs.get("inventoryid", [""])[0]
			
 
				+
			
 
				+
			
 
				+def parse_search_grid(selector, auction_id, auction_name):
			
 
				+    """解析 #SearchGrid 表格所有 lot 行，每行组装成 dict。
			
 
				+
			
 
				+    表头固定为: Auction Name | Lot Number | Title | Min Bid | Final Price | Status。
			
 
				+
			
 
				+    Args:
			
 
				+        selector (Selector): POST 响应的 parsel 解析对象。
			
 
				+        auction_id (str): 当前 auction 的 id，回填到行数据。
			
 
				+        auction_name (str): 当前 auction 的显示名，回填到行数据。
			
 
				+
			
 
				+    Returns:
			
 
				+        list[dict]: 每行 lot 一条 dict；若表格不存在则返回空列表。
			
 
				+    """
			
 
				+    grid = selector.css('table#SearchGrid')
			
 
				+    if not grid:
			
 
				+        return []
			
 
				+
			
 
				+    rows = []
			
 
				+    # 直系 tr，跳过第一行表头（class="color_c"）
			
 
				+    trs = grid.xpath('./tr | ./tbody/tr')[1:]
			
 
				+    for tr in trs:
			
 
				+        tds = tr.xpath('./td')
			
 
				+        if len(tds) < 6:
			
 
				+            continue  # 异常行跳过
			
 
				+
			
 
				+        # 第 2 列 Lot Number 含 <a href='LotDetail.aspx?inventoryid=xxx'>
			
 
				+        lot_href = tds[1].css('a::attr(href)').get() or ""
			
 
				+        inventory_id = parse_inventory_id(lot_href)
			
 
				+        detail_url = urljoin(BASE_URL, lot_href) if lot_href else ""
			
 
				+
			
 
				+        rows.append({
			
 
				+            "auction_id": auction_id,  # 场次 id
			
 
				+            "auction_name": auction_name,  # 场次名
			
 
				+            "lot_number": tds[1].xpath('normalize-space(.)').get() or "",  # 第 2 列 Lot Number
			
 
				+            "inventory_id": inventory_id,  # 从 LotDetail 链接抽出
			
 
				+            "title": tds[2].xpath('normalize-space(.)').get() or "",  # 第 3 列 Title
			
 
				+            "min_bid": tds[3].xpath('normalize-space(.)').get() or "",  # 第 4 列 Min Bid
			
 
				+            "final_price": tds[4].xpath('normalize-space(.)').get() or "",  # 第 5 列 Final Price
			
 
				+            "status": tds[5].xpath('normalize-space(.)').get() or "",  # 第 6 列 Status
			
 
				+            "detail_url": detail_url,  # 详情绝对 URL
			
 
				+        })
			
 
				+    return rows
			
 
				+
			
 
				+
			
 
				+@retry(stop=stop_after_attempt(5), wait=wait_fixed(2), after=after_log)
			
 
				+def fetch_auction_lots(log, session, impersonate, auction_id, auction_name):
			
 
				+    """通过 __doPostBack 切换到指定 auction 并解析整张列表。
			
 
				+
			
 
				+    单次 POST 即返回该 auction 全部 lot（无翻页）。
			
 
				+
			
 
				+    Args:
			
 
				+        log: logger 对象。
			
 
				+        session (requests.Session): curl_cffi 会话对象。
			
 
				+        impersonate (str): 浏览器指纹标识。
			
 
				+        auction_id (str): 目标 auction 的 option value。
			
 
				+        auction_name (str): 目标 auction 的显示名。
			
 
				+
			
 
				+    Returns:
			
 
				+        list[dict]: parse_search_grid 解析出的 lot 列表。
			
 
				+    """
			
 
				+    log.info(f"切换并抓取 auction={auction_id} ({auction_name})")
			
 
				+    proxies = get_proxys(log)
			
 
				+
			
 
				+    # 1) 首次 GET 拿 VIEWSTATE
			
 
				+    resp = session.get(BASE_URL, headers=headers, impersonate=impersonate,
			
 
				+                       proxies=proxies, timeout=15)
			
 
				+    resp.raise_for_status()
			
 
				+    sel = Selector(resp.text)
			
 
				+    state = extract_state(sel)
			
 
				+
			
 
				+    # 2) POST 切换 auction（等价于网页上 onchange 触发的 __doPostBack）
			
 
				+    form_data = {
			
 
				+        "__EVENTTARGET": AUCTION_DDL_NAME,  # 触发 postback 的控件
			
 
				+        "__EVENTARGUMENT": "",
			
 
				+        "__LASTFOCUS": "",
			
 
				+        "__VIEWSTATE": state["__VIEWSTATE"],
			
 
				+        "__VIEWSTATEGENERATOR": state["__VIEWSTATEGENERATOR"],
			
 
				+        "__EVENTVALIDATION": state["__EVENTVALIDATION"],
			
 
				+        AUCTION_DDL_NAME: str(auction_id),  # 被选中的 auction value
			
 
				+        SEARCH_TB_NAME: "",  # 搜索框留空 → 全量
			
 
				+        SEARCH_BY_DDL_NAME: "1",  # Search By 默认 Title
			
 
				+    }
			
 
				+    post_headers = {
			
 
				+        **headers,
			
 
				+        "content-type": "application/x-www-form-urlencoded",
			
 
				+        "referer": BASE_URL,
			
 
				+        "origin": "https://wheatlandauctionservices.com",
			
 
				+    }
			
 
				+    resp2 = session.post(BASE_URL, headers=post_headers, data=form_data,
			
 
				+                         impersonate=impersonate, proxies=proxies, timeout=20)
			
 
				+    resp2.raise_for_status()
			
 
				+
			
 
				+    sel2 = Selector(resp2.text)
			
 
				+    lots = parse_search_grid(sel2, str(auction_id), auction_name)
			
 
				+    log.info(f"  auction={auction_id} 列表解析到 {len(lots)} 条 lot")
			
 
				+    return lots
			
 
				+
			
 
				+
			
 
				+@retry(stop=stop_after_attempt(3), wait=wait_fixed(2), after=after_log)
			
 
				+def fetch_lot_images(log, session, impersonate, detail_url):
			
 
				+    """进入 LotDetail.aspx 详情页，抓取 #ThumbPanel 内所有多图链接。
			
 
				+
			
 
				+    页面结构: <div id="ThumbPanel"> 下每张图为
			
 
				+    <a href="/ItemImages/.../xxx_lg.jpeg"><img src="/ItemImages/.../xxx_med.jpeg"></a>。
			
 
				+
			
 
				+    Args:
			
 
				+        log: logger 对象。
			
 
				+        session (requests.Session): curl_cffi 会话对象。
			
 
				+        impersonate (str): 浏览器指纹标识。
			
 
				+        detail_url (str): LotDetail.aspx?inventoryid=xxx 的绝对 URL。
			
 
				+
			
 
				+    Returns:
			
 
				+        list[dict]: 每张图一条 {"large": 大图URL, "thumb": 缩略图URL}；无图返回空列表。
			
 
				+    """
			
 
				+    log.debug(f"获取详情图 {detail_url}")
			
 
				+    resp = session.get(detail_url, headers=headers, impersonate=impersonate,
			
 
				+                       proxies=get_proxys(log), timeout=15)
			
 
				+    resp.raise_for_status()
			
 
				+
			
 
				+    sel = Selector(resp.text)
			
 
				+    panel = sel.css('div#ThumbPanel')
			
 
				+    if not panel:
			
 
				+        return []
			
 
				+
			
 
				+    images = []
			
 
				+    for a in panel.css('a[href]'):
			
 
				+        large = (a.attrib.get("href") or "").strip()
			
 
				+        # 排除 highslide 功能性 <a>（href="#"）
			
 
				+        if not large or large.startswith("#"):
			
 
				+            continue
			
 
				+        thumb = (a.css('img::attr(src)').get() or "").strip()
			
 
				+        images.append({
			
 
				+            "large": urljoin(detail_url, large),  # 拼绝对 URL
			
 
				+            "thumb": urljoin(detail_url, thumb) if thumb else "",
			
 
				+        })
			
 
				+    return images
			
 
				+
			
 
				+
			
 
				+def crawl_one_auction(log, sql_pool, session, impersonate,
			
 
				+                      auction_id, auction_name):
			
 
				+    """抓取单个 auction 的全部 lot 列表（只抓列表，不进详情页）。
			
 
				+
			
 
				+    与 lelands 一致的两阶段设计：本函数只负责列表入库；详情多图由后续
			
 
				+    update_details_for_pending 扫库 state != 1 的记录单独补抓，二者分离。
			
 
				+
			
 
				+    Args:
			
 
				+        log: logger 对象。
			
 
				+        sql_pool: MySQL 连接池；传 None 时只返回数据，不入库。
			
 
				+        session (requests.Session): curl_cffi 会话对象。
			
 
				+        impersonate (str): 浏览器指纹标识。
			
 
				+        auction_id (str): 目标 auction id。
			
 
				+        auction_name (str): 目标 auction 显示名。
			
 
				+
			
 
				+    Returns:
			
 
				+        list[dict]: 该 auction 全部 lot 列表数据（不含详情图，详情阶段再补）。
			
 
				+    """
			
 
				+    lots = fetch_auction_lots(log, session, impersonate, auction_id, auction_name)
			
 
				+
			
 
				+    # 入库（接 mysql_pool 时此处会真正写库，state 默认 0 待补详情）
			
 
				+    if sql_pool is not None and lots:
			
 
				+        sql_pool.insert_many(table="wheatland_record", data_list=lots, ignore=True)
			
 
				+
			
 
				+    log.info(f"auction={auction_id}({auction_name}) 共抓 {len(lots)} 条 lot")
			
 
				+    return lots
			
 
				+
			
 
				+
			
 
				+def get_details(log, url, sql_pool, sql_id):
			
 
				+    """对单条已入库记录补抓详情多图，写回 wheatland_record。
			
 
				+
			
 
				+    复用 fetch_lot_images 解析 #ThumbPanel；入库 imgs 字段存大图链接逗号拼接
			
 
				+    （如需 thumb，可改存 fetch_lot_images 返回的 large+thumb JSON）。
			
 
				+
			
 
				+    Args:
			
 
				+        log: logger 对象。
			
 
				+        url (str): 详情页 URL。
			
 
				+        sql_pool: MySQL 连接池。
			
 
				+        sql_id: 数据库记录 id。
			
 
				+    """
			
 
				+    log.info(f">>> 补抓详情 {url}")
			
 
				+    impersonate = random.choice(client_identifier_list)
			
 
				+    with requests.Session() as session:
			
 
				+        images = fetch_lot_images(log, session, impersonate, url)
			
 
				+
			
 
				+    imgs_str = ",".join(img["large"] for img in images if img["large"]) if images else None
			
 
				+    # print(imgs_str)
			
 
				+
			
 
				+    sql_pool.update_one_or_dict(
			
 
				+        table="wheatland_record",
			
 
				+        data={"imgs": imgs_str, "state": 1},
			
 
				+        condition={"id": sql_id},
			
 
				+    )
			
 
				+
			
 
				+
			
 
				+def update_details_for_pending(log, sql_pool):
			
 
				+    """扫库里 state != 1 的记录，逐条补抓详情图。
			
 
				+
			
 
				+    Args:
			
 
				+        log: logger 对象。
			
 
				+        sql_pool: MySQL 连接池。
			
 
				+    """
			
 
				+    log.debug("Updating detail pages ...")
			
 
				+    rows = sql_pool.select_all(
			
 
				+        "select id, detail_url from wheatland_record where state != 1"
			
 
				+    )
			
 
				+    for row in rows:
			
 
				+        sql_id, detail_url = row[0], row[1]
			
 
				+        try:
			
 
				+            get_details(log, detail_url, sql_pool, sql_id)
			
 
				+        except Exception as e:
			
 
				+            log.error(f"Error getting details for {detail_url}: {e}")
			
 
				+            sql_pool.update_one_or_dict(
			
 
				+                table="wheatland_record",
			
 
				+                data={"state": 2},
			
 
				+                condition={"id": sql_id},
			
 
				+            )
			
 
				+
			
 
				+
			
 
				+if __name__ == '__main__':
			
 
				+    get_details(logger,'https://wheatlandauctionservices.com/LotDetail.aspx?inventoryid=15233', None, 1)
			
--- a/wheatland_spider/wheatland_spider.py
+++ b/wheatland_spider/wheatland_spider.py
@@ -0,0 +1,170 @@
 
				+# -*- coding: utf-8 -*-
			
 
				+# Author : Charley
			
 
				+# Python : 3.12.10
			
 
				+# Date   : 2026/5/29 13:52
			
 
				+"""
			
 
				+Wheatland 增量爬虫（日调度）
			
 
				+逻辑（两阶段，与 lelands 一致）：
			
 
				+  1. GET 首页解析当前全部 auction id
			
 
				+  2. 查库 select distinct auction_id from wheatland_record，得到已爬过的 auction
			
 
				+  3. 差集 = 新增 auction
			
 
				+  4. 没有新增 → 本轮无数据可抓，结束
			
 
				+  5. 阶段一 列表：对每个新增 auction postback 切换 → 解析整张列表入库
			
 
				+  6. 阶段二 详情：扫库 state != 1 的记录 → 逐条进 LotDetail 抓多图写回
			
 
				+
			
 
				+目标网站: https://wheatlandauctionservices.com/auctionresults.aspx
			
 
				+"""
			
 
				+import time
			
 
				+import random
			
 
				+import inspect
			
 
				+
			
 
				+import schedule
			
 
				+from curl_cffi import requests
			
 
				+from loguru import logger
			
 
				+from tenacity import retry, stop_after_attempt, wait_fixed
			
 
				+
			
 
				+from wheatland_core import (
			
 
				+    client_identifier_list,
			
 
				+    crawl_one_auction,
			
 
				+    get_auction_list,
			
 
				+    update_details_for_pending,
			
 
				+    after_log,
			
 
				+)
			
 
				+
			
 
				+logger.remove()
			
 
				+logger.add("./logs/{time:YYYYMMDD}.log", encoding="utf-8", rotation="00:00",
			
 
				+           format="[{time:YYYY-MM-DD HH:mm:ss.SSS}] {level} {message}",
			
 
				+           level="DEBUG", retention="7 day")
			
 
				+
			
 
				+
			
 
				+def get_existing_auction_ids(log, sql_pool):
			
 
				+    """查库返回已爬过的 auction_id 集合。
			
 
				+
			
 
				+    Args:
			
 
				+        log: logger 对象。
			
 
				+        sql_pool: MySQL 连接池；为 None 时返回空集合（首次跑）。
			
 
				+
			
 
				+    Returns:
			
 
				+        set[str]: 已存在的 auction_id 字符串集合。
			
 
				+    """
			
 
				+    if sql_pool is None:
			
 
				+        log.warning("sql_pool 为 None，视为库内无任何 auction（将全量重抓）")
			
 
				+        return set()
			
 
				+
			
 
				+    rows = sql_pool.select_all(
			
 
				+        "select distinct auction_id from wheatland_record where auction_id is not null"
			
 
				+    )
			
 
				+    ids = {str(r[0]) for r in rows} if rows else set()
			
 
				+    log.info(f"库中已存在 {len(ids)} 个 auction_id: {sorted(ids)}")
			
 
				+    return ids
			
 
				+
			
 
				+
			
 
				+def diff_new_auctions(log, all_auctions, existing_ids):
			
 
				+    """从首页解析的全部 auctions 中筛出库里没有的。
			
 
				+
			
 
				+    Args:
			
 
				+        log: logger 对象。
			
 
				+        all_auctions (list[dict]): get_auction_list 返回的全部 auction。
			
 
				+        existing_ids (set[str]): 已存在的 auction_id 集合。
			
 
				+
			
 
				+    Returns:
			
 
				+        list[dict]: 待抓取的新 auction 列表。
			
 
				+    """
			
 
				+    new_list = [a for a in all_auctions if a["id"] not in existing_ids]
			
 
				+    log.info(f"新增待抓取 auction 数: {len(new_list)} -> {[(a['id'], a['name']) for a in new_list]}")
			
 
				+    return new_list
			
 
				+
			
 
				+
			
 
				+def run_incremental(log, sql_pool):
			
 
				+    """增量抓取主流程。
			
 
				+
			
 
				+    Args:
			
 
				+        log: logger 对象。
			
 
				+        sql_pool: MySQL 连接池；为 None 时不入库，仅在内存收集 print 样本。
			
 
				+    """
			
 
				+    impersonate = random.choice(client_identifier_list)
			
 
				+    with requests.Session() as session:
			
 
				+        try:
			
 
				+            all_auctions = get_auction_list(log, session, impersonate)
			
 
				+        except Exception as e:
			
 
				+            log.error(f"获取 auction 列表失败: {e}")
			
 
				+            return
			
 
				+
			
 
				+        existing_ids = get_existing_auction_ids(log, sql_pool)
			
 
				+        new_auctions = diff_new_auctions(log, all_auctions, existing_ids)
			
 
				+
			
 
				+        if not new_auctions:
			
 
				+            log.info("本轮无新增 auction，跳过 list 抓取")
			
 
				+            return
			
 
				+
			
 
				+        collected = []
			
 
				+        for idx, auc in enumerate(new_auctions, 1):
			
 
				+            aid, name = auc["id"], auc["name"]
			
 
				+            log.info(f"========== [{idx}/{len(new_auctions)}] 开始抓 auction={aid} ({name}) ==========")
			
 
				+            try:
			
 
				+                lots = crawl_one_auction(log, sql_pool, session, impersonate,
			
 
				+                                         auction_id=aid, auction_name=name)
			
 
				+                if sql_pool is None:
			
 
				+                    collected.extend(lots)
			
 
				+            except Exception as e:
			
 
				+                log.error(f"auction={aid} 抓取异常: {e}")
			
 
				+                continue
			
 
				+
			
 
				+        if sql_pool is None:
			
 
				+            log.info(f"增量抓取结束，共 {len(collected)} 条 lot（未入库）")
			
 
				+            for row in collected[:3]:
			
 
				+                print(row)
			
 
				+
			
 
				+
			
 
				+@retry(stop=stop_after_attempt(100), wait=wait_fixed(3600), after=after_log)
			
 
				+def wld_main(log):
			
 
				+    """日调度主函数：增量 list + 补详情。
			
 
				+
			
 
				+    Args:
			
 
				+        log: logger 对象。
			
 
				+    """
			
 
				+    log.info(f"开始运行 {inspect.currentframe().f_code.co_name} 增量爬虫任务 ...")
			
 
				+
			
 
				+    # 主公自己接 MySQL；暂时传 None，crawl_one_auction 会跳过入库
			
 
				+    sql_pool = None
			
 
				+
			
 
				+    try:
			
 
				+        # 阶段一：抓新增 auction 的列表入库
			
 
				+        try:
			
 
				+            run_incremental(log, sql_pool)
			
 
				+        except Exception as e:
			
 
				+            log.error(f"增量抓取失败: {e}")
			
 
				+
			
 
				+        # 阶段二：扫库 state != 1 的记录补抓详情多图（未接库时跳过）
			
 
				+        if sql_pool is not None:
			
 
				+            try:
			
 
				+                update_details_for_pending(log, sql_pool)
			
 
				+            except Exception as e:
			
 
				+                log.error(f"详情补抓失败: {e}")
			
 
				+
			
 
				+    except Exception as e:
			
 
				+        log.error(f"{inspect.currentframe().f_code.co_name} error: {e}")
			
 
				+    finally:
			
 
				+        log.info(f"爬虫程序 {inspect.currentframe().f_code.co_name} 运行结束，等待下一轮采集 ...")
			
 
				+
			
 
				+
			
 
				+def schedule_task():
			
 
				+    """启动调度器：先即时跑一次，之后每月 1 号和 15 号 05:00 跑一次（半月一次）。"""
			
 
				+    wld_main(log=logger)
			
 
				+
			
 
				+    def run_semimonthly():
			
 
				+        from datetime import date
			
 
				+        if date.today().day in (1, 15):
			
 
				+            wld_main(log=logger)
			
 
				+
			
 
				+    schedule.every().day.at("05:00").do(run_semimonthly)
			
 
				+    while True:
			
 
				+        schedule.run_pending()
			
 
				+        time.sleep(1)
			
 
				+
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+    # 测试时直接跑一次
			
 
				+    # wld_main(log=logger)
			
 
				+    # 上生产再切回 schedule
			
 
				+    schedule_task()