Bladeren bron

refactor(spider): 重构宝可梦爬虫代码结构

- 将重复的headers定义提取到全局变量
- 移除函数内部重复的headers创建逻辑
- 添加通用的get_set_list函数用于获取卡片系列列表
- 重构JP和US版本的主函数以动态获取系列列表
- 统一HTTP请求的headers使用方式
- 优化代码复用性和可维护性
charley 3 weken geleden
bovenliggende
commit
6bcdc5387f
2 gewijzigde bestanden met toevoegingen van 186 en 15 verwijderingen
  1. 94 8
      pokemon_tcg_spider/tcg_jp_pokemon_spider.py
  2. 92 7
      pokemon_tcg_spider/tcg_us_pokemon_spider.py

+ 94 - 8
pokemon_tcg_spider/tcg_jp_pokemon_spider.py

@@ -17,6 +17,13 @@ logger.add("./logs/{time:YYYYMMDD}.log", encoding='utf-8', rotation="00:00",
            format="[{time:YYYY-MM-DD HH:mm:ss.SSS}] {level} {message}",
            level="DEBUG", retention="7 day")
 
+headers = {  # module-level headers shared by this spider's HTTP requests
+    "accept": "application/json, text/plain, */*",
+    "content-type": "application/json",
+    "referer": "https://www.tcgplayer.com/",
+    "user-agent": user_agent.generate_user_agent()  # generated once at import, then reused
+}
+
 
 def after_log(retry_state):
     """
@@ -68,12 +75,6 @@ def get_single_page(log, setUrlName, setName, page, sql_pool):
     :return: 数据列表长度
     """
     log.debug(f"Getting single page: {setUrlName} -> {page}")
-    headers = {
-        "accept": "application/json, text/plain, */*",
-        "content-type": "application/json",
-        "referer": "https://www.tcgplayer.com/",
-        "user-agent": user_agent.generate_user_agent()
-    }
 
     url = "https://mp-search-api.tcgplayer.com/v1/search/request"
     params = {
@@ -130,7 +131,7 @@ def get_single_page(log, setUrlName, setName, page, sql_pool):
         },
         "sort": {}
     }
-    response = requests.post(url, headers=headers, params=params, json=data,timeout=22)
+    response = requests.post(url, headers=headers, params=params, json=data, timeout=22)
     # print(response.json())
     response.raise_for_status()
 
@@ -214,6 +215,91 @@ def get_list_data(log, setUrlName, setName, sql_pool):
         page += 1
 
 
+# ------------------------------------------------------------------------------------------------------
+def get_set_list(log):
+    """
+    获取单页数据
+    :param log: logger对象
+    :return: 数据列表
+    """
+    log.debug(f"Getting set list page")
+    jp_set_name_list = []
+
+    url = "https://mp-search-api.tcgplayer.com/v1/search/request"
+    params = {
+        "q": "",
+        "isList": "false",
+        # "mpfev": "4967"
+    }
+    data = {
+        "algorithm": "sales_dismax",
+        "from": 0,
+        "size": 24,
+        "filters": {
+            "term": {
+                "productLineName": [
+                    "pokemon-japan"
+                ]
+            },
+            "range": {},
+            "match": {}
+        },
+        "listingSearch": {
+            "context": {
+                "cart": {
+                    "packages": {}
+                }
+            },
+            "filters": {
+                "term": {
+                    "sellerStatus": "Live",
+                    "channelId": 0
+                },
+                "range": {
+                    "quantity": {
+                        "gte": 1
+                    }
+                },
+                "exclude": {
+                    "channelExclusion": 0
+                }
+            }
+        },
+        "context": {
+            "cart": {},
+            "shippingCountry": "HK",
+            "userProfile": {}
+        },
+        "settings": {
+            "useFuzzySearch": True,
+            "didYouMean": {}
+        },
+        "sort": {}
+    }
+    # data = json.dumps(data, separators=(',', ':'))
+    try:
+        response = requests.post(url, headers=headers, params=params, json=data)
+        resp_json = response.json()
+
+        results = resp_json.get('results', [])[0] if resp_json.get('results', []) else {}
+        set_name_list = results.get('aggregations', {}).get('setName', [])
+
+        for st in set_name_list:
+            set_name = st.get('value')
+            url_value = st.get('urlValue')
+            data_dict = {
+                "set_name": set_name,
+                "url_value": url_value
+            }
+            # print(data_dict)
+            jp_set_name_list.append(data_dict)
+    except Exception as e:
+        log.error(f"Error getting set list: {e}")
+
+    # print(jp_set_name_list)
+    return jp_set_name_list
+
+
 @retry(stop=stop_after_attempt(100), wait=wait_fixed(3600), after=after_log)
 def jp_pokemon_main(log):
     """
@@ -230,7 +316,7 @@ def jp_pokemon_main(log):
     try:
         # 获取分类列表
         log.debug(".......... 获取分类列表 ..........")
-        for d_dict in SET_JP_NAME_LIST:
+        for d_dict in get_set_list(log):
             setUrlName = d_dict.get("url_value")
             setName = d_dict.get("set_name")
             try:

+ 92 - 7
pokemon_tcg_spider/tcg_us_pokemon_spider.py

@@ -17,6 +17,13 @@ logger.add("./logs/{time:YYYYMMDD}.log", encoding='utf-8', rotation="00:00",
            format="[{time:YYYY-MM-DD HH:mm:ss.SSS}] {level} {message}",
            level="DEBUG", retention="7 day")
 
+headers = {  # module-level headers shared by this spider's HTTP requests
+    "accept": "application/json, text/plain, */*",
+    "content-type": "application/json",
+    "referer": "https://www.tcgplayer.com/",
+    "user-agent": user_agent.generate_user_agent()  # generated once at import, then reused
+}
+
 
 def after_log(retry_state):
     """
@@ -68,12 +75,6 @@ def get_single_page(log, setUrlName, setName, page, sql_pool):
     :return: 数据列表长度
     """
     log.debug(f"Getting single page: {setUrlName} -> {page}")
-    headers = {
-        "accept": "application/json, text/plain, */*",
-        "content-type": "application/json",
-        "referer": "https://www.tcgplayer.com/",
-        "user-agent": user_agent.generate_user_agent()
-    }
 
     url = "https://mp-search-api.tcgplayer.com/v1/search/request"
     params = {
@@ -216,6 +217,90 @@ def get_list_data(log, setUrlName, setName, sql_pool):
         page += 1
 
 
+# ----------------------------------------------------------------------------------------------------------------------
+def get_set_list(log):
+    """
+    获取单页数据
+    :param log: logger对象
+    :return: 数据列表
+    """
+    log.debug(f"Getting set list page")
+    jp_set_name_list = []
+
+    url = "https://mp-search-api.tcgplayer.com/v1/search/request"
+    params = {
+        "q": "",
+        "isList": "false",
+        # "mpfev": "4967"
+    }
+    data = {
+        "algorithm": "sales_dismax",
+        "from": 0,
+        "size": 24,
+        "filters": {
+            "term": {
+                "productLineName": [
+                    "pokemon"
+                ]
+            },
+            "range": {},
+            "match": {}
+        },
+        "listingSearch": {
+            "context": {
+                "cart": {
+                    "packages": {}
+                }
+            },
+            "filters": {
+                "term": {
+                    "sellerStatus": "Live",
+                    "channelId": 0
+                },
+                "range": {
+                    "quantity": {
+                        "gte": 1
+                    }
+                },
+                "exclude": {
+                    "channelExclusion": 0
+                }
+            }
+        },
+        "context": {
+            "cart": {},
+            "shippingCountry": "HK",
+            "userProfile": {}
+        },
+        "settings": {
+            "useFuzzySearch": True,
+            "didYouMean": {}
+        },
+        "sort": {}
+    }
+    # data = json.dumps(data, separators=(',', ':'))
+    try:
+        response = requests.post(url, headers=headers, params=params, json=data)
+        resp_json = response.json()
+
+        results = resp_json.get('results', [])[0] if resp_json.get('results', []) else {}
+        set_name_list = results.get('aggregations', {}).get('setName', [])
+
+        for st in set_name_list:
+            set_name = st.get('value')
+            url_value = st.get('urlValue')
+            data_dict = {
+                "set_name": set_name,
+                "url_value": url_value
+            }
+            # print(data_dict)
+            jp_set_name_list.append(data_dict)
+    except Exception as e:
+        log.error(f"Error getting set list: {e}")
+
+    # print(jp_set_name_list)
+    return jp_set_name_list
+
 @retry(stop=stop_after_attempt(100), wait=wait_fixed(3600), after=after_log)
 def us_pokemon_main(log):
     """
@@ -232,7 +317,7 @@ def us_pokemon_main(log):
     try:
         # 获取分类列表
         log.debug(".......... 获取分类列表 ..........")
-        for d_dict in SET_US_NAME_LIST:
+        for d_dict in get_set_list(log):
             setUrlName = d_dict.get("url_value")
             setName = d_dict.get("set_name")
             try: