| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215 |
- # -*- coding: utf-8 -*-
- # Author : Charley
- # Python : 3.10.8
- # Date : 2026/4/15 15:10
- import time
- import inspect
- import requests
- import schedule
- import user_agent
- from loguru import logger
- from mysql_pool import MySQLConnectionPool
- from tenacity import retry, stop_after_attempt, wait_fixed
# Replace loguru's default stderr sink with a file sink: one file per day
# (rotation at midnight), DEBUG level and up, kept for 7 days.
logger.remove()
logger.add("./logs/{time:YYYYMMDD}.log", encoding='utf-8', rotation="00:00",
           format="[{time:YYYY-MM-DD HH:mm:ss.SSS}] {level} {message}",
           level="DEBUG", retention="7 day")
# NOTE(review): hard-coded session cookie. It embeds time-limited tokens
# (aws-waf-token, analytics/session ids) and will eventually expire —
# confirm how/when it is refreshed.
cookie = "ENSID=MTc3NjMxMDE1MXw1LThLaEQ5OVRZTDBLbktHUzhpTG9PR01ybUQwYmxyb2dqMlJETVpUN3dCVVV3aWpwbXE0UGtDYXkwN2ZlUU9ib3I2a3ZCa21xZGdDQjc2aVBVNElHWDlXNXlPUS1YZVd816ifsPbu7_r0ouHJxFdsRYT3jsCwBFEZ6IMhzHnGcyI=; _pin_unauth=dWlkPU5qVXlOak16TlRJdE1UVmxNUzAwTUdZNExXRmpZakl0WlRJMU1EazBaVEl6WVRRMw; _tt_enable_cookie=1; _ttp=01KP7TC2C47N8816E74NZ2S87G_.tt.1; aws-waf-token=85ea0abd-c7e7-44fa-a486-721005517367:BgoAddgnTIMdAAAA:sgUJ0isHGRMm9HGMWaserzc0yH/cfmcnAJs7tApXkvxu8CkSB2W2/+vEB9V4uBUqE+8uegKHQINRE2ExEMC9XRl6QLHoC16s5mOsvrptUYDuWqAnyQJcr8a6dAlUpokqmdLFzLRoiM2digCAKXmKRM5fbEQgY56lCzRpNqolUtcS/X9zZQIfJnj2GfmAjNw=; _gcl_au=1.1.140783051.1776231515.1213660920.1776231619.1776231619; _ga_T9G4FWRKGP=GS2.1.s1776309189$o1$g0$t1776309189$j60$l0$h0; _gid=GA1.2.339334579.1776309190; __lt__cid=074b5327-9f75-4356-a201-9879abb859a5; __lt__sid=1cd194ae-b7f570f4; __rtbh.uid=%7B%22eventType%22%3A%22uid%22%2C%22id%22%3A%22undefined%22%2C%22expiryDate%22%3A%222027-04-16T03%3A13%3A11.377Z%22%7D; __rtbh.lid=%7B%22eventType%22%3A%22lid%22%2C%22id%22%3A%221CbwgL9cLbT9BLfXJtkK%22%2C%22expiryDate%22%3A%222027-04-16T03%3A13%3A11.379Z%22%7D; _ga_WLFPCJHLHL=GS2.1.s1776309194$o1$g0$t1776309194$j60$l0$h0; _ga=GA1.1.342828207.1776231516; forterToken=6f95dbdd5df5486fb9bfe93652a6b6a7_1776309188237__UDF43-m4_27ck_; ttcsid_CEM1KGBC77U8BHMFF6SG=1776309193318::MmzC2Nw5ahbgUD_ovmJa.1.1776309203360.1; _dd_s=aid=c68687bf-9ca5-4040-9266-a9c8281287b7&logs=1&id=f2443d73-de49-431a-9dba-17f30b9410ac&created=1776309188187&expire=1776310738774&rum=0; _rdt_uuid=1776231515762.27c25d2a-f2b5-4370-89ca-ba2ba6d93c35; _rdt_em=:7fa565b08bc719fc95a07f3f9cbb8cfcd715b62ce82bc26739d3074a5196870c; ttcsid_CAP79SBC77U56BB6BI50=1776309194536::zh_5-OLx-MD4DmA4jALH.4.1776310041216.1; ttcsid=1776309194523::bhCq-3lisAc3SvWnrZng.4.1776310041216.0::1.845885.846484::955506.51.1579.4652::954188.255.4300; _ga_6H1EYVVN53=GS2.1.s1776308613$o5$g1$t1776310150$j60$l0$h0; _ga_3722WCREQR=GS2.1.s1776308613$o5$g1$t1776310150$j60$l0$h0"
# Request headers shared by every API call in this module.
headers = {
    "accept": "application/json",
    # "referer": "https://snkrdunk.com/en/trading-cards/671489?slide=right",
    # Fresh randomized user-agent at each process start.
    "user-agent": user_agent.generate_user_agent(),
    "cookie": cookie
}
def after_log(retry_state):
    """
    Tenacity ``after`` callback: log the outcome of each retry attempt.

    :param retry_state: tenacity RetryCallState for the attempt that just ran
    """
    # Every decorated function in this module takes a logger as its first
    # positional argument; fall back to the module-level logger otherwise.
    # (The old `retry_state.args and len(retry_state.args) > 0` tested
    # truthiness twice — a non-empty tuple is already truthy.)
    if retry_state.args:
        log = retry_state.args[0]
    else:
        log = logger
    if retry_state.outcome.failed:
        log.warning(
            f"Function '{retry_state.fn.__name__}', Attempt {retry_state.attempt_number} Times")
    else:
        log.info(f"Function '{retry_state.fn.__name__}', Attempt {retry_state.attempt_number} succeeded")
@retry(stop=stop_after_attempt(5), wait=wait_fixed(1), after=after_log)
def get_proxys(log):
    """
    Build the requests proxies mapping for the KDL tunnel proxy.

    :param log: logger object (kept for the module's shared ``(log, ...)``
                signature convention; after_log reads it from args[0])
    :return: dict with "http" and "https" entries pointing at the tunnel
    """
    # NOTE(review): credentials are hard-coded — consider moving them to
    # environment variables or a config file.
    tunnel = "x371.kdltps.com:15818"
    kdl_username = "t13753103189895"
    kdl_password = "o0yefv6z"
    # The previous try/except here was dead code: a dict literal of string
    # formatting cannot raise. Both schemes share the same HTTP tunnel URL.
    proxy_url = f"http://{kdl_username}:{kdl_password}@{tunnel}/"
    return {"http": proxy_url, "https": proxy_url}
@retry(stop=stop_after_attempt(5), wait=wait_fixed(1), after=after_log)
def get_single_page(log, page, brand):
    """
    Fetch one page of used trading-card listings from the snkrdunk API.

    :param log: logger object
    :param page: page number to request
    :param brand: brand id (e.g. "pokemon")
    :return: list of raw card dicts from the "usedTradingCards" field
    """
    log.info(f"获取第 {page} 页数据,品牌为 {brand}....................................................")
    query = {
        "brandId": brand,
        "categoryId": "25",
        "page": page,
        "perPage": "20",
        "sortType": "latest",
        "isOnlyOnSale": "false",
    }
    resp = requests.get(
        "https://snkrdunk.com/en/v1/trading-cards/used",
        headers=headers,
        params=query,
        proxies=get_proxys(log),
        timeout=22,
    )
    # Surface HTTP errors so the retry decorator can kick in.
    resp.raise_for_status()
    return resp.json().get("usedTradingCards", [])
def parse_data(log, resp_list, brand, sql_pool):
    """
    Normalize raw listing dicts and bulk-insert them into snkrdunk_record.

    Best-effort: any parsing/DB failure is logged, never propagated.

    :param log: logger object
    :param resp_list: list of raw card dicts from the API
    :param brand: brand id the records belong to
    :param sql_pool: database connection pool exposing insert_many()
    """
    try:
        dict_list = []
        for data in resp_list:
            # Price arrives like "US $1,250"; strip the currency prefix and
            # thousands separators. Guard the type: previously a record with
            # a missing price raised AttributeError and silently dropped the
            # WHOLE page (the insert sat inside this same try).
            price = data.get("price")
            if isinstance(price, str):
                price = price.replace("US $", "").replace(",", "")
            # Thumbnail URL carries a size query string, e.g.
            # .../5907002.jpeg?size=m — keep only the bare URL. Same guard.
            front_img = data.get("thumbnailUrl")
            if isinstance(front_img, str):
                front_img = front_img.split("?")[0]
            dict_list.append({
                "brand": brand,
                "used_id": data.get("id"),
                "trading_card_id": data.get("tradingCardId"),
                "card_name": data.get("tradingCardName"),
                "listing_uid": data.get("listingUID"),  # id for the detail page
                "price": price,
                "score": data.get("condition"),
                "front_img": front_img,
                "is_sold": data.get("isSold"),
                "category": "Trading Cards (Single Card)"
            })
        # ignore=True — presumably INSERT IGNORE, so duplicate records are
        # skipped; confirm against MySQLConnectionPool.insert_many.
        sql_pool.insert_many(table="snkrdunk_record", data_list=dict_list, ignore=True)
    except Exception as e:
        log.error(f"Error parsing data: {e}")
def get_list_data(log, brand, sql_pool):
    """
    Walk every result page for one brand and persist each page of records.

    :param log: logger object
    :param brand: brand id to crawl
    :param sql_pool: database connection pool
    """
    page = 1
    while True:
        try:
            records = get_single_page(log, page, brand)
        except Exception as e:
            # A page that keeps failing (after retries) is treated as empty
            # rather than aborting the whole brand.
            log.error(f"Error getting page {page} for brand {brand}: {e}")
            records = []
        else:
            if not records:
                log.info(f"No more data for brand {brand}, page {page}")
                break
        parse_data(log, records, brand, sql_pool)
        # A short page (fewer than perPage=20 items) means this was the last.
        if len(records) < 20:
            log.info(f"No more data for brand {brand}, page {page}")
            break
        page += 1
@retry(stop=stop_after_attempt(100), wait=wait_fixed(3600), after=after_log)
def snk_main(log):
    """
    Entry point for one full crawl: verify the DB pool, then crawl each brand.

    Raising (e.g. on an unhealthy pool) triggers the tenacity decorator,
    which retries up to 100 times with a one-hour wait.

    :param log: logger object
    """
    task_name = inspect.currentframe().f_code.co_name
    log.info(
        f'开始运行 {task_name} 爬虫任务....................................................')
    sql_pool = MySQLConnectionPool(log=log)
    if not sql_pool.check_pool_health():
        log.error("数据库连接池异常")
        raise RuntimeError("数据库连接池异常")
    try:
        for brand in ("pokemon", "onepiece", "yu-gi-oh"):
            log.info(f'开始采集 {brand} 数据....................................................')
            try:
                # Per-brand isolation: one failing brand must not stop the rest.
                get_list_data(log, brand, sql_pool)
            except Exception as e:
                log.error(f'采集 {brand} 数据异常: {e}')
    except Exception as e:
        log.error(f'{task_name} error: {e}')
    finally:
        log.info(f'爬虫程序 {task_name} 运行结束,等待下一轮的采集任务............')
def schedule_task():
    """
    Start the scheduler loop: run snk_main every day at 00:01, forever.
    """
    schedule.every().day.at("00:01").do(snk_main, log=logger)
    # Block forever, polling for due jobs once per second.
    while True:
        schedule.run_pending()
        time.sleep(1)
if __name__ == '__main__':
    # Normal operation: run on the daily schedule. Uncomment the direct call
    # below to run a single crawl immediately instead.
    schedule_task()
    # snk_main(log=logger)
|