# -*- coding: utf-8 -*- # Author : Charley # Python : 3.10.8 # Date : 2025/9/15 14:53 import inspect import random import time import schedule import user_agent from curl_cffi import requests from loguru import logger from parsel import Selector from urllib.parse import urljoin from mysql_pool import MySQLConnectionPool from DrissionPage import ChromiumPage, ChromiumOptions from tenacity import retry, stop_after_attempt, wait_fixed logger.remove() logger.add("logs/pop_player_{time:YYYYMMDD}.log", encoding='utf-8', rotation="00:00", format="[{time:YYYY-MM-DD HH:mm:ss.SSS}] {level} {message}", level="DEBUG", retention="3 day") client_identifier_list = [ "edge99", "edge101", "chrome99", "chrome100", "chrome101", "chrome104", "chrome107", "chrome110", "chrome116", "chrome119", "chrome120", "chrome123", "chrome124", "chrome99_android", "safari15_3", "safari15_5", "safari17_0", "safari17_2_ios" ] BASE_URL = 'https://www.psacard.com' headers = { "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7", 'User-Agent': user_agent.generate_user_agent() } category_link_list = { 'Baseball Cards': 'https://www.psacard.com/pop/baseball-cards/20003', 'Baseball Coins': 'https://www.psacard.com/pop/baseball-coins/82797', 'Basketball Cards': 'https://www.psacard.com/pop/basketball-cards/20019', 'Basketball Coins': 'https://www.psacard.com/pop/basketball-coins/83007', 'Boxing / Wrestling Cards / MMA': 'https://www.psacard.com/pop/boxing-wrestling-cards-mma/20021', 'Football Cards': 'https://www.psacard.com/pop/football-cards/20014', 'Football Coins': 'https://www.psacard.com/pop/football-coins/83011', 'Golf Cards': 'https://www.psacard.com/pop/golf-cards/20023', 'Hockey Cards': 'https://www.psacard.com/pop/hockey-cards/20020', 'Hockey Coins': 'https://www.psacard.com/pop/hockey-coins/83012', 'Minor League Cards': 'https://www.psacard.com/pop/minor-league-cards/20031', 'Misc Cards': 'https://www.psacard.com/pop/misc-cards/20033', 'Multi-Sport Cards': 'https://www.psacard.com/pop/multi-sport-cards/20006', 'Multi-Sport Coins': 'https://www.psacard.com/pop/multi-sport-coins/102825', 'Non-Sport Cards': 'https://www.psacard.com/pop/non-sport-cards/20032', 'Non-Sport Coins': 'https://www.psacard.com/pop/non-sport-coins/82981', 'Packs': 'https://www.psacard.com/pop/packs/20017', 'Pins': 'https://www.psacard.com/pop/pins/20013', 'Soccer Cards': 'https://www.psacard.com/pop/soccer-cards/20004', 'TCG Cards': 'https://www.psacard.com/pop/tcg-cards/156940', 'Tickets': 'https://www.psacard.com/pop/tickets/20022', # 'Game-Used Bats': 'https://www.psacard.com/pop/bats' } def after_log(retry_state): """ retry 回调 :param retry_state: RetryCallState 对象 """ # 检查 args 是否存在且不为空 if retry_state.args and len(retry_state.args) > 0: log = retry_state.args[0] # 获取传入的 logger else: log = logger # 使用全局 logger if retry_state.outcome.failed: log.warning( f"Function '{retry_state.fn.__name__}', Attempt {retry_state.attempt_number} Times") else: log.info(f"Function '{retry_state.fn.__name__}', Attempt {retry_state.attempt_number} succeeded") @retry(stop=stop_after_attempt(5), wait=wait_fixed(2), after=after_log) def get_proxys(log): # 已购买账户 北美 # http_proxy = "http://u1952150085001297:sJMHl4qc4bM0@proxy.123proxy.cn:36927" # https_proxy = "http://u1952150085001297:sJMHl4qc4bM0@proxy.123proxy.cn:36927" http_proxy = "http://u1952150085001297:sJMHl4qc4bM0@proxy.123proxy.cn:36931" https_proxy = "http://u1952150085001297:sJMHl4qc4bM0@proxy.123proxy.cn:36931" try: proxySettings = { "http": http_proxy, "https": https_proxy, } return proxySettings except Exception as e: log.error(f"Error getting proxy: {e}") raise e @retry(stop=stop_after_attempt(5), wait=wait_fixed(1), after=after_log) def get_sets_data(log, category, category_id, category_link, tag_year, tag_year_link, sql_pool): # 隧道域名:端口号 # tunnel = "x371.kdltps.com:15818" tunnel = "proxy.123proxy.cn:36927" options = ChromiumOptions() # options.set_paths(local_port=9137, user_data_path=r'D:\Drissionpage_temp\topps1_port_9137') # options = ChromiumOptions() options.set_proxy("http://" + tunnel) options.auto_port(True) # options.headless(True) options.set_argument("--disable-gpu") options.set_argument("-accept-lang=en-US") options.set_argument('--start-maximized') tab = ChromiumPage(options) try: # tab_url = f"https://www.psacard.com/pop/baseball-cards/2024/260045" tab_url = tag_year_link tab.get(tab_url) log.debug(f'{inspect.currentframe().f_code.co_name} -> 页面加载成功, url: {tab_url}') # time.sleep(11111) # 循环翻页直到最后一页 page = 1 while True: log.debug(f'{inspect.currentframe().f_code.co_name} -> 当前页码: {page}') html = tab.html if not html: log.error(f'{inspect.currentframe().f_code.co_name} -> 页面加载失败...........') raise Exception('页面加载失败, 重新加载........') selector = Selector(text=html) tag_tr_list = selector.xpath('//table[@id="tableSets"]/tbody/tr[position() > 1]') info_list = [] for tag_tr in tag_tr_list: set_name = tag_tr.xpath('./td[@class="text-left"]/a[1]/text()').get() set_name_url = tag_tr.xpath('./td[@class="text-left"]/a[1]/@href').get() set_name_url = urljoin(BASE_URL, set_name_url) if set_name_url else None set_id = set_name_url.split('/')[-1] if set_name_url else None # print(set_name, set_name_url) data_dict = { 'category': category, 'category_id': category_id, 'category_link': category_link, 'year': tag_year, 'year_link': tag_year_link, 'set_name': set_name, 'set_link': set_name_url, 'set_id': set_id } info_list.append(data_dict) # try: # get_player_list(log, data_dict, sql_pool) # except Exception as e: # log.error(f'get_player_list error: {e}') # 保存数据 if info_list: sql_pool.insert_many(table='psa_pop_player_sets', data_list=info_list, ignore=True) # 检查是否还有下一页 next_button = tab.ele('#tableSets_next') if next_button and 'disabled' not in next_button.attr('class'): # 点击下一页按钮 next_button.click() log.debug(f'{inspect.currentframe().f_code.co_name} -> 点击下一页按钮') # tab.wait.load_start() # 等待页面加载 # time.sleep(2) # 等待页面加载 else: # 没有下一页了,退出循环 log.debug(f'{inspect.currentframe().f_code.co_name} -> 没有下一页了, 退出循环, 最后页码: {page}') break page += 1 except Exception as e: log.error(f'get_response error: {e}') raise 'get_response error' finally: tab.quit() @retry(stop=stop_after_attempt(5), wait=wait_fixed(1), after=after_log) def get_years_data(log, category, category_id, category_link, sql_pool): """ 获取详情数据 :param log: :param category: :param category_id: :param category_link: :param sql_pool: """ try: with requests.Session() as session: resp = session.get(category_link, impersonate=random.choice(client_identifier_list), headers=headers, proxies=get_proxys(log), timeout=22, allow_redirects=False) # log.debug(resp.text) if 'Just a moment' in resp.text: log.debug('Just a moment , retrying.....') raise Exception('Just a moment') resp_selector = Selector(text=resp.text) tag_tr_list = resp_selector.xpath('//table[@id="tableCategory"]/tbody/tr') for tag_tr in tag_tr_list: tag_year = tag_tr.xpath('./td[1]/a/text()').get() tag_year_link = tag_tr.xpath('./td[1]/a/@href').get() tag_year_link = BASE_URL + tag_year_link if tag_year_link else tag_year_link try: get_sets_data(log, category, category_id, category_link, tag_year, tag_year_link, sql_pool) except Exception as e1: log.error(f"Error getting sets data: {e1}") except Exception as e: log.error(f"Error getting detail data: {e}") raise e @retry(stop=stop_after_attempt(5), wait=wait_fixed(3), after=after_log) def get_player_single_page(log, category_id, set_id, start, length, draw): """ 获取单页球员数据 :param log: 日志对象 :param category_id: 分类ID :param set_id: 集合ID :param start: 起始位置 :param length: 数据长度 :param draw: 请求序号 :return: 响应数据字典或None """ player_headers = { "accept": "application/json, text/javascript, */*; q=0.01", "content-type": "application/x-www-form-urlencoded; charset=UTF-8", "referer": f"https://www.psacard.com/pop/baseball-cards/2024/bowman/{set_id}", "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/137.0.0.0 Safari/537.36", } url = "https://www.psacard.com/Pop/GetSetItems" data = { "draw": str(draw), "start": str(start), "length": str(length), "search": "", "headingID": str(set_id), "categoryID": str(category_id), "isPSADNA": "false" } try: response = requests.post( url, impersonate=random.choice(client_identifier_list), headers=player_headers, data=data, timeout=22, proxies=get_proxys(log) ) if response.status_code == 200: # print(response.json()) return response.json() elif response.status_code == 403: log.error(f"请求被拒绝,请检查IP地址是否被封禁, set_id: {set_id}") # return None raise Exception('请求被拒绝') else: log.error(f"请求失败,状态码: {response.status_code}, set_id: {set_id}") # return None raise Exception('请求失败') except Exception as e: log.error(f"获取单页球员数据出错, set_id: {set_id}, start: {start}, error: {e}") # return None raise Exception('获取单页球员数据出错') def get_player_list(log, category_id, set_id, sql_pool): """ 获取球员列表数据(带翻页功能) :param log: 日志对象 :param category_id: category_id :param set_id: set_id :param sql_pool: 数据库连接池 """ start = 0 # 起始位置 length = 300 # 每页数量 draw = 1 # 请求序列号 while True: log.debug(f"正在获取球员数据, category_id:{category_id}, set_id: {set_id}, page: {draw}, start: {start}") # 获取单页数据 try: response_data = get_player_single_page(log, category_id, set_id, start, length, draw) except Exception as e: log.error(f"获取单页球员数据出错, category_id:{category_id}, set_id: {set_id}, start: {start}, error: {e}") response_data = None if response_data is None: log.error(f"获取球员数据失败, category_id:{category_id}, set_id: {set_id}, page: {draw}, break !!!") # sql_pool.update_one_or_dict( # table='psa_pop_player_sets', # data={'player_state': 2}, # condition={'set_id': set_id} # ) break # 检查是否有数据返回 player_data_list = response_data.get('data') # 如果是第一页 删除第一条 if draw == 1: player_data_list.pop(0) if len(player_data_list) > 0: log.debug( f"获取到 {len(player_data_list)} 条球员数据, category_id:{category_id}, set_id: {set_id}, start: {start}") info_list = [] for pl_data in player_data_list: spec_id = pl_data.get('SpecID') card_number = pl_data.get('CardNumber') subject_name = pl_data.get('SubjectName') card_set = pl_data.get('Variety') grade_total = pl_data.get('GradeTotal') data_dict = { 'category_id': category_id, 'set_id': set_id, 'spec_id': spec_id, 'card_number': card_number, 'subject_name': subject_name, 'card_set': card_set, 'grade_total': grade_total } # print(f'data_dict:{data_dict}') info_list.append(data_dict) # 保存数据到数据库 if info_list: sql_pool.insert_many(table='psa_pop_player_record', data_list=info_list, ignore=True) # 如果返回数据少于请求长度,说明已是最后一页 if len(player_data_list) < length: log.debug(f"已到达最后一页, set_id: {set_id}") break # 更新参数准备下一页 start += length draw += 1 # 添加延迟避免请求过于频繁 time.sleep(1) else: log.debug(f"没有更多数据, set_id: {set_id}") break @retry(stop=stop_after_attempt(100), wait=wait_fixed(3600), after=after_log) def pop_main(log): """ 主函数 """ log.info( f'开始运行 {inspect.currentframe().f_code.co_name} 爬虫任务....................................................') # 配置 MySQL 连接池 sql_pool = MySQLConnectionPool(log=log) if not sql_pool.check_pool_health(): log.error("数据库连接池异常") raise RuntimeError("数据库连接池异常") try: log.debug(".......... 开始获取数据报告 ..........") for category, category_link in category_link_list.items(): log.debug(f"{category}第一次查询, 开始获取数据.......") try: category_id = category_link.split('/')[-1] get_years_data(log, category, category_id, category_link, sql_pool) except Exception as e1: log.error(f"Error getting detail data: {e1}") except Exception as e: log.error(f'{inspect.currentframe().f_code.co_name} error: {e}') finally: log.info(f'爬虫程序 {inspect.currentframe().f_code.co_name} 运行结束,等待下一轮的采集任务............') @retry(stop=stop_after_attempt(100), wait=wait_fixed(3600), after=after_log) def player_main(log): """ 主函数 """ log.info( f'开始运行 {inspect.currentframe().f_code.co_name} 爬虫任务....................................................') # 配置 MySQL 连接池 sql_pool = MySQLConnectionPool(log=log) if not sql_pool.check_pool_health(): log.error("数据库连接池异常") raise RuntimeError("数据库连接池异常") try: log.debug(".......... 开始获取数据报告 ..........") sql_sets_list = sql_pool.select_all( # "select category_id, set_id from psa_pop_player_sets where player_state = 0" "select category_id, set_id from psa_pop_player_sets") for category_set in sql_sets_list: category_id, set_id = category_set try: log.debug(f"category_id:{category_id}第一次查询, 开始获取数据.......") get_player_list(log, category_id, set_id, sql_pool) # sql_pool.update_one_or_dict( # table='psa_pop_player_sets', # data={'player_state': 1}, # condition={'set_id': set_id} # ) except Exception as e1: log.error(f"Error getting detail data: {e1}") # sql_pool.update_one_or_dict( # table='psa_pop_player_sets', # data={'player_state': 2}, # condition={'set_id': set_id} # ) except Exception as e: log.error(f'{inspect.currentframe().f_code.co_name} error: {e}') finally: log.info(f'爬虫程序 {inspect.currentframe().f_code.co_name} 运行结束,等待下一轮的采集任务............') def schedule_task(): """ 两个爬虫模块的启动文件 bidding_main weika_change_card_by_id_spider change_card_main """ # 立即运行一次任务 # pop_main(log=logger) # player_main(log=logger) # 设置定时任务 schedule.every().saturday.at("08:00").do(pop_main, log=logger) schedule.every().wednesday.at("08:00").do(player_main, log=logger) # schedule.every().day.at("00:30").do(player_main, log=logger) while True: schedule.run_pending() time.sleep(1) if __name__ == '__main__': # get_detail_data(logger, '','https://www.psacard.com/pop/bats',None) # get_sets_data(logger) # get_player_single_page(logger, '21172', '279481', 0, 300, 1) # aa_dict = { # 'category': 'baseball', # 'category_id': '20003', # 'category_link': 'https://www.psacard.com/pop/bats', # 'tag_year': '2004', # 'tag_year_link': '', # 'set_name': '', # 'set_name_url': '', # 'set_id': '279664' # } # get_player_list(logger, aa_dict, None) # pop_main(log=logger) player_main(log=logger) # schedule_task()