Explorar o código

update baocui task 26.1.21.1

charley hai 1 semana
pai
achega
4aef6c203b
Modificáronse 3 ficheiros con 212 adicións e 70 borrados
  1. 48 0
      baocui_spider/add_task.py
  2. 1 1
      baocui_spider/bc_spider.py
  3. 163 69
      baocui_spider/mysql_pool.py

+ 48 - 0
baocui_spider/add_task.py

@@ -0,0 +1,48 @@
+# -*- coding: utf-8 -*-
+# Author : Charley
+# Python : 3.10.8
+# Date   : 2025/6/11 18:59
+from loguru import logger
+
+from mysql_pool import MySQLConnectionPool
+
+# logger.remove()
+# logger.add("./logs/add_{time:YYYYMMDD}.log", encoding='utf-8', rotation="00:00",
+#            format="[{time:YYYY-MM-DD HH:mm:ss.SSS}] {level} {message}",
+#            level="DEBUG", retention="7 day")
+
+"""
+C001000000,C开头到100万
+D000220000,D开头到22万
+E000050000,E开头到5万
+H000500000,H开头到50万
+L000400000,L开头到40万
+"""
+
+start = 1  # first sequence number to generate (inclusive)
+sql_pool = MySQLConnectionPool(log=logger)
+# sql_kwd_list = sql_pool.select_all("select keyword from baocui_record")
+# sql_kwd_list = [i[0] for i in sql_kwd_list]
+
+# for prefix, end in [('C', 600001), ('D', 200001), ('E', 40001)]:
+for prefix, end in [('L', 400001)]:  # (keyword prefix, exclusive upper bound of the sequence)
+    # if prefix == 'H':
+    #     start = 40001
+    info_list = []
+    for i in range(start, end):
+        kwd = f"{prefix}{i:09d}"  # prefix followed by the number zero-padded to 9 digits
+        # if kwd in sql_kwd_list:
+        #     logger.info(f"{kwd}已存在")
+        #     continue
+        info_list.append((kwd,))  # wrap in a 1-tuple to match the parameterized query
+    # print(info_list)
+
+    #     # flush every 10000 rows
+    #     if len(info_list) >= 10000:
+    #         sql_pool.insert_all("insert into baocui_task(keyword) values (%s)", info_list)
+    #         info_list.clear()
+    #
+    # # handle the remaining rows (fewer than 10000)
+    if info_list:  # nothing to insert when the range was empty
+        sql_pool.insert_many(query="INSERT IGNORE INTO baocui_task(keyword) values (%s)", args_list=info_list, ignore=True)
+        info_list.clear()

+ 1 - 1
baocui_spider/bc_spider.py

@@ -6,7 +6,7 @@ import inspect
 import requests
 import user_agent
 from loguru import logger
-from mysq_pool import MySQLConnectionPool
+from mysql_pool import MySQLConnectionPool
 from tenacity import retry, stop_after_attempt, wait_fixed
 
 """

+ 163 - 69
baocui_spider/mysql_pool.py

@@ -66,7 +66,7 @@ class MySQLConnectionPool:
         except Exception as e:
             if commit:
                 conn.rollback()
-            self.log.error(f"Error executing query: {e}, Query: {query}, Args: {args}")
+            self.log.exception(f"Error executing query: {e}, Query: {query}, Args: {args}")
             raise e
 
     def select_one(self, query, args=None):
@@ -101,7 +101,7 @@ class MySQLConnectionPool:
 
     def insert_all(self, query, args_list):
         """
-        执行批量插入语句如果失败则逐条插入
+        执行批量插入语句,如果失败则逐条插入
         :param query: 插入语句
         :param args_list: 插入参数列表
         """
@@ -112,24 +112,40 @@ class MySQLConnectionPool:
             cursor = conn.cursor()
             cursor.executemany(query, args_list)
             conn.commit()
-            self.log.debug(f"sql insert_all, SQL: {query}, Rows: {len(args_list)}")
+            self.log.debug(f"sql insert_all, SQL: {query[:100]}..., Rows: {cursor.rowcount}")
             self.log.info('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>data insert_all 入库中>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>')
+        except pymysql.err.IntegrityError as e:
+            if "Duplicate entry" in str(e):
+                conn.rollback()
+                self.log.warning(f"批量插入遇到重复,开始逐条插入。错误: {e}")
+                rowcount = 0
+                for args in args_list:
+                    try:
+                        self.insert_one(query, args)
+                        rowcount += 1
+                    except pymysql.err.IntegrityError as e2:
+                        if "Duplicate entry" in str(e2):
+                            self.log.debug(f"跳过重复条目: {e2}")
+                        else:
+                            self.log.error(f"插入失败: {e2}")
+                    except Exception as e2:
+                        self.log.error(f"插入失败: {e2}")
+                self.log.info(f"逐条插入完成: {rowcount}/{len(args_list)}条")
+            else:
+                conn.rollback()
+                self.log.exception(f"数据库完整性错误: {e}")
+                raise e
         except Exception as e:
             conn.rollback()
-            self.log.error(f"Batch insertion failed after 5 attempts. Trying single inserts. Error: {e}")
-            # 如果批量插入失败,则逐条插入
-            rowcount = 0
-            for args in args_list:
-                self.insert_one(query, args)
-                rowcount += 1
-            self.log.debug(f"Batch insertion failed. Inserted {rowcount} rows individually.")
+            self.log.exception(f"批量插入失败: {e}")
+            raise e
         finally:
             if cursor:
                 cursor.close()
             if conn:
                 conn.close()
 
-    def insert_one_or_dict(self, table=None, data=None, query=None, args=None, commit=True):
+    def insert_one_or_dict(self, table=None, data=None, query=None, args=None, commit=True, ignore=False):
         """
         单条插入(支持字典或原始SQL)
         :param table: 表名(字典插入时必需)
@@ -137,6 +153,7 @@ class MySQLConnectionPool:
         :param query: 直接SQL语句(与data二选一)
         :param args: SQL参数(query使用时必需)
         :param commit: 是否自动提交
+        :param ignore: 是否使用ignore
         :return: 最后插入ID
         """
         if data is not None:
@@ -145,17 +162,32 @@ class MySQLConnectionPool:
 
             keys = ', '.join([self._safe_identifier(k) for k in data.keys()])
             values = ', '.join(['%s'] * len(data))
-            query = f"INSERT INTO {self._safe_identifier(table)} ({keys}) VALUES ({values})"
+
+            # 构建 INSERT IGNORE 语句
+            ignore_clause = "IGNORE" if ignore else ""
+            query = f"INSERT {ignore_clause} INTO {self._safe_identifier(table)} ({keys}) VALUES ({values})"
             args = tuple(data.values())
         elif query is None:
             raise ValueError("Either data or query must be provided")
 
-        cursor = self._execute(query, args, commit)
-        self.log.info(f"sql insert_one_or_dict, Table: {table}, Rows: {cursor.rowcount}")
-        self.log.info('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>data insert_one_or_dict 入库中>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>')
-        return cursor.lastrowid
+        try:
+            cursor = self._execute(query, args, commit)
+            self.log.info(f"sql insert_one_or_dict, Table: {table}, Rows: {cursor.rowcount}")
+            self.log.info('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>data insert_one_or_dict 入库中>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>')
+            return cursor.lastrowid
+        except pymysql.err.IntegrityError as e:
+            if "Duplicate entry" in str(e):
+                self.log.warning(f"插入失败:重复条目,已跳过。错误详情: {e}")
+                return -1  # 返回 -1 表示重复条目被跳过
+            else:
+                self.log.exception(f"数据库完整性错误: {e}")
+                raise
+        except Exception as e:
+            self.log.exception(f"未知错误: {e}")
+            raise
 
-    def insert_many(self, table=None, data_list=None, query=None, args_list=None, batch_size=500, commit=True):
+    def insert_many(self, table=None, data_list=None, query=None, args_list=None, batch_size=1000, commit=True,
+                    ignore=False):
         """
         批量插入(支持字典列表或原始SQL)
         :param table: 表名(字典插入时必需)
@@ -164,6 +196,7 @@ class MySQLConnectionPool:
         :param args_list: SQL参数列表(query使用时必需)
         :param batch_size: 分批大小
         :param commit: 是否自动提交
+        :param ignore: 是否使用ignore
         :return: 影响行数
         """
         if data_list is not None:
@@ -172,7 +205,10 @@ class MySQLConnectionPool:
 
             keys = ', '.join([self._safe_identifier(k) for k in data_list[0].keys()])
             values = ', '.join(['%s'] * len(data_list[0]))
-            query = f"INSERT INTO {self._safe_identifier(table)} ({keys}) VALUES ({values})"
+
+            # 构建 INSERT IGNORE 语句
+            ignore_clause = "IGNORE" if ignore else ""
+            query = f"INSERT {ignore_clause} INTO {self._safe_identifier(table)} ({keys}) VALUES ({values})"
             args_list = [tuple(d.values()) for d in data_list]
         elif query is None:
             raise ValueError("Either data_list or query must be provided")
@@ -187,48 +223,71 @@ class MySQLConnectionPool:
                         if commit:
                             conn.commit()
                         total += cursor.rowcount
-            except pymysql.Error as e:
+            except pymysql.err.IntegrityError as e:
+                # 处理唯一索引冲突
                 if "Duplicate entry" in str(e):
-                    # self.log.warning(f"检测到重复条目,开始逐条插入。错误详情: {e}")
-                    raise  e
-                    # rowcount = 0
-                    # for args in batch:
-                    #     try:
-                    #         self.insert_one_or_dict(table=table, data=dict(zip(data_list[0].keys(), args)),
-                    #                                 commit=commit)
-                    #         rowcount += 1
-                    #     except pymysql.err.IntegrityError as e2:
-                    #         if "Duplicate entry" in str(e2):
-                    #             self.log.warning(f"跳过重复条目: {args}")
-                    #         else:
-                    #             self.log.error(f"插入失败: {e2}, 参数: {args}")
-                    # total += rowcount
+                    if ignore:
+                        # 如果使用了 INSERT IGNORE,理论上不会进这里,但以防万一
+                        self.log.warning(f"批量插入遇到重复条目(ignore模式): {e}")
+                    else:
+                        # 没有使用 IGNORE,降级为逐条插入
+                        self.log.warning(f"批量插入遇到重复条目,开始逐条插入。错误: {e}")
+                        if commit:
+                            conn.rollback()
+                        
+                        rowcount = 0
+                        for j, args in enumerate(batch):
+                            try:
+                                if data_list:
+                                    # 字典模式
+                                    self.insert_one_or_dict(
+                                        table=table,
+                                        data=dict(zip(data_list[0].keys(), args)),
+                                        commit=commit,
+                                        ignore=False  # 单条插入时手动捕获重复
+                                    )
+                                else:
+                                    # 原始SQL模式
+                                    self.insert_one(query, args)
+                                rowcount += 1
+                            except pymysql.err.IntegrityError as e2:
+                                if "Duplicate entry" in str(e2):
+                                    self.log.debug(f"跳过重复条目[{i+j+1}]: {e2}")
+                                else:
+                                    self.log.error(f"插入失败[{i+j+1}]: {e2}")
+                            except Exception as e2:
+                                self.log.error(f"插入失败[{i+j+1}]: {e2}")
+                        total += rowcount
+                        self.log.info(f"批次逐条插入完成: 成功{rowcount}/{len(batch)}条")
                 else:
-                    self.log.error(f"数据库错误: {e}")
+                    # 其他完整性错误
+                    self.log.exception(f"数据库完整性错误: {e}")
                     if commit:
                         conn.rollback()
                     raise e
-                # 重新抛出异常,供外部捕获
-                # 降级为单条插入
-                # for args in batch:
-                #     try:
-                #         self.insert_one_or_dict(table=None, query=query, args=args, commit=commit)
-                #         total += 1
-                #     except Exception as e2:
-                #         self.log.error(f"Single insert failed: {e2}")
-                        # continue
-        self.log.info(f"sql insert_many, Table: {table}, Total Rows: {total}")
+            except Exception as e:
+                # 其他数据库错误
+                self.log.exception(f"批量插入失败: {e}")
+                if commit:
+                    conn.rollback()
+                raise e
+        if table:
+            self.log.info(f"sql insert_many, Table: {table}, Total Rows: {total}")
+        else:
+            self.log.info(f"sql insert_many, Query: {query}, Total Rows: {total}")
         return total
 
-    def insert_many_two(self, table=None, data_list=None, query=None, args_list=None, batch_size=500, commit=True):
+    def insert_many_two(self, table=None, data_list=None, query=None, args_list=None, batch_size=1000, commit=True,
+                        ignore=False):
         """
-        批量插入(支持字典列表或原始SQL)
-        :param table: 表名(字典插入时必需)
+        批量插入(支持字典列表或原始SQL) - 备用方法
+        :param table: 表名(字典插入时必需)
         :param data_list: 字典列表 [{列名: 值}]
-        :param query: 直接SQL语句(与data_list二选一)
-        :param args_list: SQL参数列表(query使用时必需)
+        :param query: 直接SQL语句(与data_list二选一)
+        :param args_list: SQL参数列表(query使用时必需)
         :param batch_size: 分批大小
         :param commit: 是否自动提交
+        :param ignore: 是否使用INSERT IGNORE
         :return: 影响行数
         """
         if data_list is not None:
@@ -236,41 +295,51 @@ class MySQLConnectionPool:
                 raise ValueError("Data_list must be a non-empty list of dictionaries")
             keys = ', '.join([self._safe_identifier(k) for k in data_list[0].keys()])
             values = ', '.join(['%s'] * len(data_list[0]))
-            query = f"INSERT INTO {self._safe_identifier(table)} ({keys}) VALUES ({values})"
+            ignore_clause = "IGNORE" if ignore else ""
+            query = f"INSERT {ignore_clause} INTO {self._safe_identifier(table)} ({keys}) VALUES ({values})"
             args_list = [tuple(d.values()) for d in data_list]
         elif query is None:
             raise ValueError("Either data_list or query must be provided")
-
+    
         total = 0
         for i in range(0, len(args_list), batch_size):
             batch = args_list[i:i + batch_size]
             try:
                 with self.pool.connection() as conn:
                     with conn.cursor() as cursor:
-                        # 添加调试日志:输出 SQL 和参数示例
-                        # self.log.debug(f"Batch insert SQL: {query}")
-                        # self.log.debug(f"Sample args: {batch[0] if batch else 'None'}")
                         cursor.executemany(query, batch)
                         if commit:
                             conn.commit()
                         total += cursor.rowcount
-                        # self.log.debug(f"Batch insert succeeded. Rows: {cursor.rowcount}")
-            except Exception as e:  # 明确捕获数据库异常
-                self.log.exception(f"Batch insert failed: {e}")  # 使用 exception 记录堆栈
-                self.log.error(f"Failed SQL: {query}, Args count: {len(batch)}")
+            except pymysql.err.IntegrityError as e:
+                if "Duplicate entry" in str(e) and not ignore:
+                    self.log.warning(f"批量插入遇到重复,降级为逐条插入: {e}")
+                    if commit:
+                        conn.rollback()
+                    rowcount = 0
+                    for args in batch:
+                        try:
+                            self.insert_one(query, args)
+                            rowcount += 1
+                        except pymysql.err.IntegrityError as e2:
+                            if "Duplicate entry" in str(e2):
+                                self.log.debug(f"跳过重复条目: {e2}")
+                            else:
+                                self.log.error(f"插入失败: {e2}")
+                        except Exception as e2:
+                            self.log.error(f"插入失败: {e2}")
+                    total += rowcount
+                else:
+                    self.log.exception(f"数据库完整性错误: {e}")
+                    if commit:
+                        conn.rollback()
+                    raise e
+            except Exception as e:
+                self.log.exception(f"批量插入失败: {e}")
                 if commit:
                     conn.rollback()
-                # 降级为单条插入,并记录每个错误
-                rowcount = 0
-                for args in batch:
-                    try:
-                        self.insert_one(query, args)
-                        rowcount += 1
-                    except Exception as e2:
-                        self.log.error(f"Single insert failed: {e2}, Args: {args}")
-                total += rowcount
-                self.log.debug(f"Inserted {rowcount}/{len(batch)} rows individually.")
-        self.log.info(f"sql insert_many, Table: {table}, Total Rows: {total}")
+                raise e
+        self.log.info(f"sql insert_many_two, Table: {table}, Total Rows: {total}")
         return total
 
     def insert_too_many(self, query, args_list, batch_size=1000):
@@ -280,6 +349,7 @@ class MySQLConnectionPool:
         :param args_list: 插入参数列表
         :param batch_size: 每次插入的条数
         """
+        self.log.info(f"sql insert_too_many, Query: {query}, Total Rows: {len(args_list)}")
         for i in range(0, len(args_list), batch_size):
             batch = args_list[i:i + batch_size]
             try:
@@ -287,6 +357,7 @@ class MySQLConnectionPool:
                     with conn.cursor() as cursor:
                         cursor.executemany(query, batch)
                         conn.commit()
+                        self.log.debug(f"insert_too_many -> Total Rows: {len(batch)}")
             except Exception as e:
                 self.log.error(f"insert_too_many error. Trying single insert. Error: {e}")
                 # 当前批次降级为单条插入
@@ -523,9 +594,32 @@ class MySQLConnectionPool:
             self.log.error(f"Connection pool health check failed: {e}")
             return False
 
+    def close(self):
+        """
+        Close the connection pool if one exists; a failure is logged and not re-raised.
+        """
+        try:
+            if hasattr(self, 'pool') and self.pool:  # guard: the pool attribute may be missing or falsy
+                self.pool.close()
+                self.log.info("数据库连接池已关闭")
+        except Exception as e:
+            self.log.error(f"关闭连接池失败: {e}")
+
     @staticmethod
     def _safe_identifier(name):
         """Validate a SQL identifier: return name unchanged, or raise ValueError if it does not match the pattern."""
         if not re.match(r'^[a-zA-Z_][a-zA-Z0-9_]*$', name):  # NOTE(review): '$' also matches before a trailing newline; re.fullmatch would be stricter
             raise ValueError(f"Invalid SQL identifier: {name}")
         return name
+
+
+if __name__ == '__main__':  # manual run: inserts one sample row through the dict path of insert_one_or_dict
+    sql_pool = MySQLConnectionPool()
+    data_dic = {'card_type_id': 111, 'card_type_name': '补充包 继承的意志【OPC-13】', 'card_type_position': 964,
+                'card_id': 5284, 'card_name': '蒙奇·D·路飞', 'card_number': 'OP13-001', 'card_rarity': 'L',
+                'card_img': 'https://source.windoent.com/OnePiecePc/Picture/1757929283612OP13-001.png',
+                'card_life': '4', 'card_attribute': '打', 'card_power': '5000', 'card_attack': '-',
+                'card_color': '红/绿', 'subscript': 4, 'card_features': '超新星/草帽一伙',
+                'card_text_desc': '【咚!!×1】【对方的攻击时】我方处于活跃状态的咚!!不多于5张的场合,可以将我方任意张数的咚!!转为休息状态。每有1张转为休息状态的咚!!,本次战斗中,此领袖或我方最多1张拥有《草帽一伙》特征的角色力量+2000。',
+                'card_offer_type': '补充包 继承的意志【OPC-13】', 'crawler_language': '简中'}
+    sql_pool.insert_one_or_dict(table="one_piece_record", data=data_dic)  # data= given, so the INSERT is built from the dict keys