فهرست منبع

refactor(mercari_spider): 优化爬虫数据获取与处理逻辑

- 引入DPoP生成器动态生成请求令牌,替代硬编码的TOKEN
- 修改代理获取逻辑,更新为已购买账户代理配置
- 移除商品详情页请求逻辑,改为仅爬取商品列表及其主图
- 将单条数据插入改为批量插入,提高数据库写入效率
- 调整请求重试等待时间由1秒改为2秒,增加稳定性
- 扩展分页爬取范围,将最大页数增加至200页
- 注释掉 schedule_task 启动时对主函数的立即调用,仅保留每日 04:00 的定时执行
charley 3 هفته پیش
والد
کامیت
6d2657edde
1 فایل تغییر یافته به همراه 43 افزوده شده و 82 حذف شده
  1. 43 82
      mercari_spider/mercari_jp_spider.py

+ 43 - 82
mercari_spider/mercari_jp_spider.py

@@ -7,20 +7,24 @@ import inspect
 import schedule
 import requests
 from loguru import logger
+
+from dpop_generator import DPoPGenerator
 from mysql_pool import MySQLConnectionPool
 from tenacity import retry, stop_after_attempt, wait_fixed
 
 """
 请求网址:
 https://jp.mercari.com/search?category_id=82&page_token=v1%3A1&status=sold_out%7Ctrading
+
+20260514 -> 只拿列表页  正面图
 """
 
 SEARCH_URL = "https://api.mercari.jp/v2/entities:search"
 PAGE_SIZE = 120
 LAPLACE_DEVICE_UUID = "a00429c5-ad26-4be4-83ae-60b7239e14d5"
 SEARCH_SESSION_ID = "cfba38acec8cae78136c62441bbb267a"
-LIST_DPOP = "eyJ0eXAiOiJkcG9wK2p3dCIsImFsZyI6IkVTMjU2IiwiandrIjp7ImNydiI6IlAtMjU2Iiwia3R5IjoiRUMiLCJ4IjoiajlJNmtMS2VrZFNOZEh5SHNhWmw1Z2tiYkZoRGFBUDNEd3N1dlZqQ3JXZyIsInkiOiJOTHREa2RkWVZhZkZ5a1FHYmsteDZBYUp6QWpVblZlcFl0X2pzdmV3cGdJIn19.eyJpYXQiOjE3NzgwNDYyMTksImp0aSI6IjQ0YmM4MzZlLWFiYWEtNDI1OC1hMjQ4LTNlNjkxMTUzZjY2NSIsImh0dSI6Imh0dHBzOi8vYXBpLm1lcmNhcmkuanAvdjIvZW50aXRpZXM6c2VhcmNoIiwiaHRtIjoiUE9TVCIsInV1aWQiOiJhMDA0MjljNS1hZDI2LTRiZTQtODNhZS02MGI3MjM5ZTE0ZDUifQ.KqYWvIC42NYjNTewIfttuPMFHYAwJ4JZIXn4ulQye6s9c5zQutabWoOp8sKDjy-zvmbDCYA-6K7e7dW3bVu3cw"
-DETAIL_DPOP = "eyJ0eXAiOiJkcG9wK2p3dCIsImFsZyI6IkVTMjU2IiwiandrIjp7ImNydiI6IlAtMjU2Iiwia3R5IjoiRUMiLCJ4IjoiajlJNmtMS2VrZFNOZEh5SHNhWmw1Z2tiYkZoRGFBUDNEd3N1dlZqQ3JXZyIsInkiOiJOTHREa2RkWVZhZkZ5a1FHYmsteDZBYUp6QWpVblZlcFl0X2pzdmV3cGdJIn19.eyJpYXQiOjE3NzgwNDYwMDksImp0aSI6IjFmNGYwNDlhLTdmMGYtNGM0Zi1hZjcxLTIwYmFhZDhhMTc4NCIsImh0dSI6Imh0dHBzOi8vYXBpLm1lcmNhcmkuanAvaXRlbXMvZ2V0IiwiaHRtIjoiR0VUIiwidXVpZCI6ImEwMDQyOWM1LWFkMjYtNGJlNC04M2FlLTYwYjcyMzllMTRkNSJ9._92fashFF1PmC0Ol0HFqz9rIYdzL-w_ZJwXXRTI3zX_8oNP_ziNUIwySB50Itgp88vsgy8skp4DZ2DTd3WBWnQ"
+# LIST_DPOP = "eyJ0eXAiOiJkcG9wK2p3dCIsImFsZyI6IkVTMjU2IiwiandrIjp7ImNydiI6IlAtMjU2Iiwia3R5IjoiRUMiLCJ4IjoiajlJNmtMS2VrZFNOZEh5SHNhWmw1Z2tiYkZoRGFBUDNEd3N1dlZqQ3JXZyIsInkiOiJOTHREa2RkWVZhZkZ5a1FHYmsteDZBYUp6QWpVblZlcFl0X2pzdmV3cGdJIn19.eyJpYXQiOjE3NzgwNDYyMTksImp0aSI6IjQ0YmM4MzZlLWFiYWEtNDI1OC1hMjQ4LTNlNjkxMTUzZjY2NSIsImh0dSI6Imh0dHBzOi8vYXBpLm1lcmNhcmkuanAvdjIvZW50aXRpZXM6c2VhcmNoIiwiaHRtIjoiUE9TVCIsInV1aWQiOiJhMDA0MjljNS1hZDI2LTRiZTQtODNhZS02MGI3MjM5ZTE0ZDUifQ.KqYWvIC42NYjNTewIfttuPMFHYAwJ4JZIXn4ulQye6s9c5zQutabWoOp8sKDjy-zvmbDCYA-6K7e7dW3bVu3cw"
+# DETAIL_DPOP = "eyJ0eXAiOiJkcG9wK2p3dCIsImFsZyI6IkVTMjU2IiwiandrIjp7ImNydiI6IlAtMjU2Iiwia3R5IjoiRUMiLCJ4IjoiajlJNmtMS2VrZFNOZEh5SHNhWmw1Z2tiYkZoRGFBUDNEd3N1dlZqQ3JXZyIsInkiOiJOTHREa2RkWVZhZkZ5a1FHYmsteDZBYUp6QWpVblZlcFl0X2pzdmV3cGdJIn19.eyJpYXQiOjE3NzgwNDYwMDksImp0aSI6IjFmNGYwNDlhLTdmMGYtNGM0Zi1hZjcxLTIwYmFhZDhhMTc4NCIsImh0dSI6Imh0dHBzOi8vYXBpLm1lcmNhcmkuanAvaXRlbXMvZ2V0IiwiaHRtIjoiR0VUIiwidXVpZCI6ImEwMDQyOWM1LWFkMjYtNGJlNC04M2FlLTYwYjcyMzllMTRkNSJ9._92fashFF1PmC0Ol0HFqz9rIYdzL-w_ZJwXXRTI3zX_8oNP_ziNUIwySB50Itgp88vsgy8skp4DZ2DTd3WBWnQ"
 
 logger.remove()
 logger.add("./logs/{time:YYYYMMDD}.log", encoding='utf-8', rotation="00:00",
@@ -46,21 +50,20 @@ def after_log(retry_state):
         log.info(f"Function '{retry_state.fn.__name__}', Attempt {retry_state.attempt_number} succeeded")
 
 
-@retry(stop=stop_after_attempt(5), wait=wait_fixed(1), after=after_log)
+@retry(stop=stop_after_attempt(5), wait=wait_fixed(2), after=after_log)
 def get_proxys(log):
-    """
-    获取代理
-    :return: 代理
-    """
-    tunnel = "x371.kdltps.com:15818"
-    kdl_username = "t13753103189895"
-    kdl_password = "o0yefv6z"
+    # 已购买账户  北美
+    # http_proxy = "http://u1952150085001297:sJMHl4qc4bM0@proxy.123proxy.cn:36927"
+    # https_proxy = "http://u1952150085001297:sJMHl4qc4bM0@proxy.123proxy.cn:36927"
+    http_proxy = "http://u1952150085001297:sJMHl4qc4bM0@proxy.123proxy.cn:36931"
+    https_proxy = "http://u1952150085001297:sJMHl4qc4bM0@proxy.123proxy.cn:36931"
+
     try:
-        proxies = {
-            "http": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": kdl_username, "pwd": kdl_password, "proxy": tunnel},
-            "https": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": kdl_username, "pwd": kdl_password, "proxy": tunnel}
+        proxySettings = {
+            "http": http_proxy,
+            "https": https_proxy,
         }
-        return proxies
+        return proxySettings
     except Exception as e:
         log.error(f"Error getting proxy: {e}")
         raise e
@@ -68,11 +71,15 @@ def get_proxys(log):
 
 def build_headers() -> dict:
     """构造 Mercari 搜索接口请求头。"""
+    gen = DPoPGenerator()
+    list_token = gen.generate(
+        htu="https://api.mercari.jp/v2/entities:search", htm="POST"
+    )
     return {
         "accept": "application/json, text/plain, */*",
         "accept-language": "ja",
         "content-type": "application/json",
-        "dpop": LIST_DPOP,
+        "dpop": list_token,
         # "dpop": "eyJ0eXAiOiJkcG9wK2p3dCIsImFsZyI6IkVTMjU2IiwiandrIjp7ImNydiI6IlAtMjU2Iiwia3R5IjoiRUMiLCJ4IjoiajlJNmtMS2VrZFNOZEh5SHNhWmw1Z2tiYkZoRGFBUDNEd3N1dlZqQ3JXZyIsInkiOiJOTHREa2RkWVZhZkZ5a1FHYmsteDZBYUp6QWpVblZlcFl0X2pzdmV3cGdJIn19.eyJpYXQiOjE3NzgwNDYyMTksImp0aSI6IjQ0YmM4MzZlLWFiYWEtNDI1OC1hMjQ4LTNlNjkxMTUzZjY2NSIsImh0dSI6Imh0dHBzOi8vYXBpLm1lcmNhcmkuanAvdjIvZW50aXRpZXM6c2VhcmNoIiwiaHRtIjoiUE9TVCIsInV1aWQiOiJhMDA0MjljNS1hZDI2LTRiZTQtODNhZS02MGI3MjM5ZTE0ZDUifQ.KqYWvIC42NYjNTewIfttuPMFHYAwJ4JZIXn4ulQye6s9c5zQutabWoOp8sKDjy-zvmbDCYA-6K7e7dW3bVu3cw",
         "origin": "https://jp.mercari.com",
         "priority": "u=1, i",
@@ -187,53 +194,13 @@ def fetch_page(
         SEARCH_URL,
         headers=build_headers(),
         json=build_payload(page_token=page_token, category_id=category_id),
-        timeout=timeout
+        timeout=timeout,
+        # proxies=get_proxys(log),
     )
     response.raise_for_status()
     return response
 
 
-@retry(stop=stop_after_attempt(5), wait=wait_fixed(1), after=after_log)
-def get_detail_page(log, pid):
-    """
-    获取商品详情。
-    :param log: logger对象
-    :param pid: 商品ID
-    """
-    log.info(f"获取商品详情 {pid}............")
-    headers = {
-        "accept": "application/json, text/plain, */*",
-        # "accept-language": "ja",
-        "dpop": DETAIL_DPOP,
-        # "referer": "https://jp.mercari.com/",
-        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/147.0.0.0 Safari/537.36",
-        "x-platform": "web"
-    }
-    url = "https://api.mercari.jp/items/get"
-    params = {
-        # "id": "m69042262006",
-        "id": pid,
-        "include_item_attributes": "true",
-        "include_product_page_component": "true",
-        "include_non_ui_item_attributes": "true",
-        "include_donation": "true",
-        "include_item_attributes_sections": "true",
-        "include_auction": "true",
-        "country_code": "JP"
-    }
-    response = requests.get(url, headers=headers, params=params, timeout=22)
-    response.raise_for_status()
-    resp_json = response.json()
-    data = resp_json.get("data", {})
-    tag_seller = data.get("seller", {})
-    seller_id = tag_seller.get("id")
-    seller_name = tag_seller.get("name")
-    photos = data.get("photos", [])
-    photos = ','.join(photos) if photos else None
-    # print(seller_id, seller_name, photos)
-    return seller_id, seller_name, photos
-
-
 def parse_list(log, resp_json, sql_pool, category_id, category_name):
     """
     解析商品列表数据。
@@ -243,36 +210,30 @@ def parse_list(log, resp_json, sql_pool, category_id, category_name):
     :param category_id: 类别ID
     :param category_name: 类别名称
     """
+    log.info(f"解析商品列表数据............")
     items = resp_json.get("items", [])
+
+    data_list = []
     for item in items:
         pid = item.get("id")
-        # sellerId = item.get("sellerId")
+        seller_id = item.get("sellerId")
         status = item.get("status")
         product_name = item.get("name")
         price = item.get("price")
 
         created_at = item.get("created")  # 1777512645  时间戳
-        created_at = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(created_at))) if int(created_at) else None
+        created_at = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(created_at))) if created_at else None
         updated_at = item.get("updated")  # 1777512645  时间戳
-        updated_at = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(updated_at))) if int(updated_at) else None
+        updated_at = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(updated_at))) if updated_at else None
 
-        # thumbnails = item.get("thumbnails", [])
-        # img = thumbnails[0] if thumbnails else None
-
-        # categoryId = item.get("categoryId")
-
-        # 获取详情页多图
-        try:
-            seller_id, seller_name, photos = get_detail_page(log, pid)
-        except Exception as e:
-            log.error(f"Error getting detail page: {e}")
-            seller_id, seller_name, photos = None, None, None
+        thumbnails = item.get("thumbnails", [])
+        img = thumbnails[0] if thumbnails else None
 
         data_dict = {
             "pid": pid,
             "seller_id": seller_id,
-            "seller_name": seller_name,
-            "photos": photos,
+            # "seller_name": seller_name,
+            "photos": img,
             "status": status,
             "product_name": product_name,
             "price": price,
@@ -282,8 +243,11 @@ def parse_list(log, resp_json, sql_pool, category_id, category_name):
             "category_name": category_name
         }
         # log.info(data_dict)
+        data_list.append(data_dict)
 
-        sql_pool.insert_one_or_dict(table="mercari_record", data=data_dict, ignore=True)
+    # 批量插入数据
+    if data_list:
+        sql_pool.insert_many(table="mercari_record", data_list=data_list, ignore=True)
 
 
 def iter_pages(
@@ -291,8 +255,8 @@ def iter_pages(
         sql_pool,
         category_id: int,
         category_name: str,
-        start_page: int = 1,
-        end_page: int = 15000,
+        start_page: int,
+        end_page: int,
 ):
     """
     循环请求多页,返回页码和 Response。
@@ -303,9 +267,6 @@ def iter_pages(
     :param start_page: 开始页码
     :param end_page: 结束页码
     """
-    if category_id == 1289:
-        start_page = 42
-
     if end_page < start_page:
         raise ValueError("end_page 必须大于等于 start_page")
 
@@ -360,7 +321,7 @@ def mercari_main(log):
                 category_id = category["category_id"]
                 category_name = category["category_name"]
                 log.debug(f'开始爬取类别 {category_name}............')
-                iter_pages(log, sql_pool, category_id, category_name, start_page=1, end_page=100)
+                iter_pages(log, sql_pool, category_id, category_name, start_page=1, end_page=200)
             except Exception as e:
                 log.error(f'{inspect.currentframe().f_code.co_name} error: {e}')
 
@@ -374,7 +335,7 @@ def schedule_task():
     """
     设置定时任务
     """
-    mercari_main(log=logger)
+    # mercari_main(log=logger)
 
     schedule.every().day.at("04:00").do(mercari_main, log=logger)
     while True:
@@ -383,6 +344,6 @@ def schedule_task():
 
 
 if __name__ == "__main__":
-    # mercari_main(log=logger)
     # get_detail_page(logger, "m69042262006")
+    # mercari_main(log=logger)
     schedule_task()