# -*- coding: utf-8 -*-
# Author : Charley
# Python : 3.10.8
# Date : 2025/9/15 14:53
"""Daily crawler for PSA population-report totals.

Fetches each category's pop page on psacard.com, extracts the grand
"total graded" figure, and records it (one row per category per day)
into the ``psa_pop_record`` MySQL table.
"""
import inspect
import random
import time

import schedule
import user_agent
from curl_cffi import requests
from loguru import logger
from parsel import Selector
from tenacity import retry, stop_after_attempt, wait_fixed

from mysql_pool import MySQLConnectionPool

# File logging: rotate at midnight, keep 3 days of history.
logger.remove()
logger.add("logs/pop_{time:YYYYMMDD}.log", encoding='utf-8', rotation="00:00",
           format="[{time:YYYY-MM-DD HH:mm:ss.SSS}] {level} {message}",
           level="DEBUG", retention="3 day")

# Browser fingerprints for curl_cffi TLS impersonation; one is chosen at
# random per request to vary the TLS signature and reduce blocking.
client_identifier_list = [
    "edge99", "edge101", "chrome99", "chrome100", "chrome101", "chrome104",
    "chrome107", "chrome110", "chrome116", "chrome119", "chrome120",
    "chrome123", "chrome124", "chrome99_android", "safari15_3", "safari15_5",
    "safari17_0", "safari17_2_ios"
]

BASE_URL = 'https://www.psacard.com'

headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    'User-Agent': user_agent.generate_user_agent()
}

# NOTE: despite the name, this is a dict mapping category name -> pop page URL
# (kept as-is for backward compatibility with any external importers).
category_link_list = {
    'Baseball Cards': 'https://www.psacard.com/pop/baseball-cards/20003',
    'Baseball Coins': 'https://www.psacard.com/pop/baseball-coins/82797',
    'Basketball Cards': 'https://www.psacard.com/pop/basketball-cards/20019',
    'Basketball Coins': 'https://www.psacard.com/pop/basketball-coins/83007',
    'Boxing / Wrestling Cards / MMA': 'https://www.psacard.com/pop/boxing-wrestling-cards-mma/20021',
    'Football Cards': 'https://www.psacard.com/pop/football-cards/20014',
    'Football Coins': 'https://www.psacard.com/pop/football-coins/83011',
    'Golf Cards': 'https://www.psacard.com/pop/golf-cards/20023',
    'Hockey Cards': 'https://www.psacard.com/pop/hockey-cards/20020',
    'Hockey Coins': 'https://www.psacard.com/pop/hockey-coins/83012',
    'Minor League Cards': 'https://www.psacard.com/pop/minor-league-cards/20031',
    'Misc Cards': 'https://www.psacard.com/pop/misc-cards/20033',
    'Multi-Sport Cards': 'https://www.psacard.com/pop/multi-sport-cards/20006',
    'Multi-Sport Coins': 'https://www.psacard.com/pop/multi-sport-coins/102825',
    'Non-Sport Cards': 'https://www.psacard.com/pop/non-sport-cards/20032',
    'Non-Sport Coins': 'https://www.psacard.com/pop/non-sport-coins/82981',
    'Packs': 'https://www.psacard.com/pop/packs/20017',
    'Pins': 'https://www.psacard.com/pop/pins/20013',
    'Soccer Cards': 'https://www.psacard.com/pop/soccer-cards/20004',
    'TCG Cards': 'https://www.psacard.com/pop/tcg-cards/156940',
    'Tickets': 'https://www.psacard.com/pop/tickets/20022',
    'Game-Used Bats': 'https://www.psacard.com/pop/bats',
}


def after_log(retry_state):
    """Tenacity ``after=`` hook: log the outcome of every attempt.

    :param retry_state: tenacity RetryCallState object
    """
    # Every decorated function in this module takes its logger as the first
    # positional argument; fall back to the module logger otherwise.
    # (Fix: the original `args and len(args) > 0` double-checked the same thing.)
    log = retry_state.args[0] if retry_state.args else logger
    if retry_state.outcome.failed:
        log.warning(
            f"Function '{retry_state.fn.__name__}', Attempt {retry_state.attempt_number} Times")
    else:
        log.info(f"Function '{retry_state.fn.__name__}', Attempt {retry_state.attempt_number} succeeded")


@retry(stop=stop_after_attempt(5), wait=wait_fixed(2), after=after_log)
def get_proxys(log):
    """Return the requests-style proxy mapping for the paid proxy account.

    :param log: logger (kept for signature compatibility with after_log)
    :return: dict with ``http`` and ``https`` proxy URLs
    """
    # SECURITY NOTE(review): credentials are hard-coded in source — move them
    # to an environment variable or a secrets store.
    # Paid account, North America endpoint (port 36927 kept below for reference).
    # http://u1952150085001297:sJMHl4qc4bM0@proxy.123proxy.cn:36927
    proxy_url = "http://u1952150085001297:sJMHl4qc4bM0@proxy.123proxy.cn:36931"
    # Fix: dropped the original try/except — a dict literal cannot raise.
    return {
        "http": proxy_url,
        "https": proxy_url,
    }


@retry(stop=stop_after_attempt(10), wait=wait_fixed(3), after=after_log)
def get_detail_data(log, category, link, sql_pool):
    """Fetch one category's pop page and store its total-graded count.

    :param log: logger
    :param category: category display name (stored verbatim)
    :param link: absolute URL of the category pop page
    :param sql_pool: MySQLConnectionPool used for the insert
    :raises Exception: re-raised so tenacity retries (including when a
        Cloudflare 'Just a moment' interstitial is served)
    """
    try:
        with requests.Session() as session:
            resp = session.get(link, impersonate=random.choice(client_identifier_list),
                               headers=headers, proxies=get_proxys(log), timeout=22,
                               allow_redirects=False)
        # log.debug(resp.text)
        if 'Just a moment' in resp.text:
            log.debug('Just a moment , retrying.....')
            raise Exception('Just a moment')
        resp_selector = Selector(text=resp.text)
        # The grand total is the last right-aligned header cell; "tableBats"
        # is the variant table id used on the Game-Used Bats page.
        tag_td_list = resp_selector.xpath(
            '//*[@id="tableCategory"]/thead/tr/td[@class="text-right"]/text() | //*[@id="tableBats"]/thead/tr/td[@class="text-right"]/text()').getall()
        if tag_td_list:
            # Cells before the last are the set/item counts; only the final
            # "total graded" figure is persisted.
            total_graded = tag_td_list[-1]
            if total_graded:
                total_graded = total_graded.replace(',', '')  # strip thousands separators
                log.debug(f"Total Graded: {total_graded}")
                data_dict = {
                    "category": category,
                    "category_link": link,
                    "total_graded": total_graded,
                    "crawl_date": time.strftime("%Y-%m-%d", time.localtime())
                }
                try:
                    # ignore=True -> INSERT IGNORE: at most one row per
                    # category per day, so re-runs are idempotent.
                    sql_pool.insert_one_or_dict(table="psa_pop_record", data=data_dict, ignore=True)
                except Exception as e1:
                    # Best-effort insert: log and continue rather than retrying
                    # the whole fetch over a DB hiccup.
                    log.error(f"Error inserting data: {e1}")
    except Exception as e:
        log.error(f"Error getting detail data: {e}")
        raise e


@retry(stop=stop_after_attempt(10), wait=wait_fixed(3), after=after_log)
def get_pop_data(log, sql_pool):
    """Scrape the pop landing page and crawl every category linked from it.

    :param log: logger
    :param sql_pool: MySQLConnectionPool passed through to get_detail_data
    :raises Exception: re-raised so tenacity retries the landing-page fetch
    """
    url = "https://www.psacard.com/pop"
    try:
        with requests.Session() as session:
            resp = session.get(url, impersonate=random.choice(client_identifier_list),
                               headers=headers, proxies=get_proxys(log), timeout=22,
                               allow_redirects=False)
        # log.debug(resp.text)
        if 'Just a moment' in resp.text:
            log.debug('Just a moment , retrying.....')
            raise Exception('Just a moment')
        resp_selector = Selector(text=resp.text)
        for tag_a in resp_selector.xpath('//*[@id="mainContent"]/div[2]/div/a'):
            category = tag_a.xpath('./div/div/text()').get()
            category_link = tag_a.xpath('./@href').get()
            if not category_link:
                # Fix: an anchor without href previously raised TypeError on
                # the substring test below, burning a full retry cycle.
                continue
            if 'https://' not in category_link:
                category_link = BASE_URL + category_link  # relative -> absolute
            log.debug(f"Category: {category}, Link: {category_link}")
            try:
                get_detail_data(log, category, category_link, sql_pool)
            except Exception as e1:
                # One failed category must not abort the whole listing pass.
                log.error(f"Error getting detail data: {e1}")
    except Exception as e:
        log.error(f"Error getting pop data: {e}")
        raise e


@retry(stop=stop_after_attempt(100), wait=wait_fixed(3600), after=after_log)
def pop_main(log):
    """Entry point for one crawl round over every known category.

    :param log: logger
    :raises RuntimeError: when the MySQL connection pool is unhealthy
        (re-raised so tenacity retries an hour later)
    """
    log.info(
        f'开始运行 {inspect.currentframe().f_code.co_name} 爬虫任务....................................................')
    # Configure the MySQL connection pool.
    sql_pool = MySQLConnectionPool(log=log)
    if not sql_pool.check_pool_health():
        log.error("数据库连接池异常")
        raise RuntimeError("数据库连接池异常")
    try:
        log.debug(".......... 开始获取数据报告 ..........")
        # get_pop_data(log, sql_pool)
        # Two passes per round; INSERT IGNORE in get_detail_data makes the
        # second pass an automatic retry for categories missed in the first.
        for round_no in range(1, 3):
            for category, category_link in category_link_list.items():
                # Fix: the original message hard-coded "第一次" (first pass)
                # on both iterations; report the actual pass number.
                log.debug(f"{category}第{round_no}次查询, 开始获取数据.......")
                try:
                    get_detail_data(log, category, category_link, sql_pool)
                except Exception as e1:
                    log.error(f"Error getting detail data: {e1}")
                time.sleep(5)  # throttle between categories
    except Exception as e:
        log.error(f'{inspect.currentframe().f_code.co_name} error: {e}')
    finally:
        log.info(f'爬虫程序 {inspect.currentframe().f_code.co_name} 运行结束,等待下一轮的采集任务............')


def schedule_task():
    """Run pop_main once immediately, then every day at 08:00."""
    # Run once right away so a restart does not wait for the next slot.
    pop_main(log=logger)
    # Then schedule the daily run.
    schedule.every().day.at("08:00").do(pop_main, log=logger)
    while True:
        schedule.run_pending()
        time.sleep(1)


if __name__ == '__main__':
    # get_pop_data(logger)
    schedule_task()
    # get_detail_data(logger, '','https://www.psacard.com/pop/bats',None)