Browse Source

fix(spider): 优化请求超时及商铺数据处理逻辑

- 调整requests.post调用的代码换行格式,timeout参数保持(5, 30)不变
- 增加无数据时日志记录,并将对应店铺标记为注销状态
- 查询商户时增加is_deleted判断,避免访问已注销店铺
- 注释掉主程序中立即运行的任务调用,避免重复触发
- 优化代码格式与注释,提升代码可读性和维护性
charley 1 month ago
parent
commit
1bd2d89f4e
1 changed file with 13 additions and 6 deletions
  1. 13 6
      zc_spider/zc_new_daily_spider.py

+ 13 - 6
zc_spider/zc_new_daily_spider.py

@@ -22,7 +22,7 @@ BASE_URL = "https://cashier.yqszpay.com"
 PAGE_SIZE = 10
 
 headers = {
-    "User-Agent": user_agent.generate_user_agent(os="android"), # 设置为安卓模拟器
+    "User-Agent": user_agent.generate_user_agent(os="android"),  # 设置为安卓模拟器
     "Connection": "Keep-Alive",
     "Accept-Encoding": "gzip",
     "Content-Type": "application/json",
@@ -93,7 +93,8 @@ def make_encrypted_post_request(log, url: str, request_data: dict, extra_headers
     encrypted_body = CryptoHelper.encrypt_request_data(request_data)
     # print(request_headers)
     # response = requests.post(url, headers=request_headers, json=encrypted_body, timeout=22, proxies=get_proxys(log))
-    response = requests.post(url, headers=request_headers, json=encrypted_body, timeout=(5, 30), proxies=get_proxys(log))
+    response = requests.post(url, headers=request_headers, json=encrypted_body, timeout=(5, 30),
+                             proxies=get_proxys(log))
     # response = requests.post(url, headers=request_headers, json=encrypted_body, timeout=(5, 30))
     # response.raise_for_status()
 
@@ -390,6 +391,12 @@ def get_sold_list(log, shop_id, token, sql_pool, shop_name):
             break
 
         data_list = result.get('rows', [])
+        if not data_list:
+            log.info(f"第 {page_num} 页无数据,停止翻页")
+            log.info(f'该店铺{shop_name}无数据, 修改为店铺注销状态..........')
+            # 更新店铺状态为注销状态
+            sql_pool.update_one("UPDATE zc_shop_record SET is_deleted = 1 WHERE shop_id = %s", (shop_id,))
+            break
         parse_sold_data(log, token, data_list, sql_pool, shop_name)
 
         # 检查是否有数据
@@ -478,8 +485,9 @@ def zc_main(log):
 
         # 获取sold data - 遍历所有商户
         try:
-            # 从 shop 表查询所有 merNo
-            mer_no_rows = sql_pool.select_all("SELECT shop_id, shop_name FROM zc_shop_record WHERE sold_number != 0")
+            # 从 shop 表查询所有 merNo  2026/5/9增加is_deleted判断是否注销店铺
+            mer_no_rows = sql_pool.select_all(
+                "SELECT shop_id, shop_name FROM zc_shop_record WHERE is_deleted = 0 AND sold_number != 0")
             log.info(f"查询到 {len(mer_no_rows)} 个商户编号: {mer_no_rows}")
             for shop_id, shop_name in mer_no_rows:
                 log.info(f"开始爬取商户 {shop_id}, {shop_name} 的商品数据")
@@ -534,7 +542,7 @@ def schedule_task():
     爬虫模块 定时任务 的启动文件
     """
     # 立即运行一次任务
-    zc_main(log=logger)
+    # zc_main(log=logger)
 
     # 设置定时任务
     schedule.every().day.at("00:01").do(zc_main, log=logger)
@@ -544,7 +552,6 @@ def schedule_task():
         time.sleep(1)
 
 
-
 if __name__ == '__main__':
     # zc_main(logger)
     schedule_task()