# -*- coding: utf-8 -*-
# Author : Charley
# Python : 3.10.8
# Date   : 2025/8/25 14:38
import inspect
from datetime import datetime

import requests
import user_agent
from loguru import logger
from parsel import Selector
from tenacity import retry, stop_after_attempt, wait_fixed

from mysql_pool import MySQLConnectionPool

crawler_language = "繁中"  # Traditional Chinese; stored with every row and used in the WHERE clauses below

headers = {
    # "referer": "https://asia.pokemon-card.com/tw/card-search/list/",
    "user-agent": user_agent.generate_user_agent()
}

logger.remove()
logger.add("./logs/fan_{time:YYYYMMDD}.log", encoding="utf-8", rotation="00:00",
           format="[{time:YYYY-MM-DD HH:mm:ss.SSS}] {level} {message}",
           level="DEBUG", retention="7 days")


def after_log(retry_state):
    """
    Retry callback invoked by tenacity after every attempt.

    :param retry_state: RetryCallState object
    """
    # Prefer the logger passed as the decorated function's first argument;
    # fall back to the module-level logger.
    if retry_state.args and len(retry_state.args) > 0:
        log = retry_state.args[0]
    else:
        log = logger
    if retry_state.outcome.failed:
        log.warning(f"Function '{retry_state.fn.__name__}', attempt {retry_state.attempt_number} failed")
    else:
        log.info(f"Function '{retry_state.fn.__name__}', attempt {retry_state.attempt_number} succeeded")


@retry(stop=stop_after_attempt(5), wait=wait_fixed(1), after=after_log)
def get_proxys(log):
    """
    Build the tunnel-proxy configuration.

    :return: proxies dict usable as requests' `proxies=` argument
    """
    tunnel = "x371.kdltps.com:15818"
    kdl_username = "t13753103189895"
    kdl_password = "o0yefv6z"
    try:
        proxies = {
            "http": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": kdl_username, "pwd": kdl_password, "proxy": tunnel},
            "https": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": kdl_username, "pwd": kdl_password, "proxy": tunnel}
        }
        return proxies
    except Exception as e:
        log.error(f"Error getting proxy: {e}")
        raise e
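# NOTE: `mysql_pool.MySQLConnectionPool` is a local module that is not part of this
# file. The sketch below records the interface this script relies on: the method
# names come from the call sites in this script, while the signatures and return
# shapes shown are assumptions for illustration, not the actual implementation.
#
#     pool = MySQLConnectionPool(log=logger)
#     pool.check_pool_health()                                       # -> bool
#     pool.insert_many(table="...", data_list=[{...}], ignore=True)  # bulk insert of dict rows, skipping duplicates
#     pool.update_one_or_dict(table="...", data={...}, condition={"id": ...})
#     pool.select_all("SELECT ...")                                  # -> list of row tuples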
@retry(stop=stop_after_attempt(5), wait=wait_fixed(1), after=after_log)
def get_category_list(log, sql_pool):
    """Crawl the expansion (category) list pages and store them in pokemon_fanz_category."""
    log.debug('Request get_category_list.........................')
    url = 'https://asia.pokemon-card.com/tw/card-search/'
    page = 1
    max_page = 100
    while page <= max_page:
        params = {
            "pageNo": page
        }
        response = requests.get(url, headers=headers, params=params, timeout=10)
        response.raise_for_status()
        selector = Selector(response.text)
        tag_li_list = selector.xpath('//ul[@class="expansionList"]/li')
        info_list = []
        for tag_li in tag_li_list:
            expansionLink = tag_li.xpath('./a/@href').get()
            expansion_link = f'https://asia.pokemon-card.com{expansionLink}' if expansionLink else None
            expansion_img = tag_li.xpath('./a//img/@src').get()
            expansion_series = tag_li.xpath('./a//div[@class="seriesBlock"]/span/text()').get()
            expansion_title = tag_li.xpath('./a//div[@class="titleBlock"]/h3/text()').get()
            expansion_title = expansion_title.strip() if expansion_title else None
            expansion_release_time = tag_li.xpath('./a//div[@class="titleBlock"]/time/@datetime').get()
            data_dict = {
                "expansion_link": expansion_link,
                "expansion_img": expansion_img,
                "expansion_series": expansion_series,
                "expansion_title": expansion_title,
                "expansion_release_time": expansion_release_time,
                "crawler_language": crawler_language
            }
            info_list.append(data_dict)
        if info_list:
            sql_pool.insert_many(table="pokemon_fanz_category", data_list=info_list, ignore=True)
        if not tag_li_list:
            log.debug(f'get_category_list: no items on page {page}, stopping')
            break
        if len(tag_li_list) < 20:
            # A short page (fewer than the 20 items of a full page) is the last one.
            log.debug(f'--------------- page {page} has {len(tag_li_list)} items, [len(tag_li_list) < 20] ->->-> break ---------------')
            break
        page += 1


# -----------------------------------------------------------------------------------------------------------------------
@retry(stop=stop_after_attempt(5), wait=wait_fixed(1), after=after_log)
def get_list_single_page(log, sql_pool, page, cate_tuple):
    """Crawl one page of an expansion's card list and store it in pokemon_card_record."""
    # cate_tuple: (expansion_link, expansion_series, expansion_title, expansion_release_time)
    url = cate_tuple[0]
    expansion_series = cate_tuple[1]
    expansion_title = cate_tuple[2]
    expansion_release_time = cate_tuple[3]
    # expansion_release_time arrives as '03-28-2025'; normalize it to 'YYYY-MM-DD'.
    date_obj = datetime.strptime(expansion_release_time, '%m-%d-%Y')
    expansion_release_time = date_obj.strftime('%Y-%m-%d')
    log.debug(f'Request get_list_single_page for page: {page}')
    # url = "https://asia.pokemon-card.com/tw/card-search/list/"
    params = {
        "pageNo": page
    }
    # response = requests.get(url, headers=headers, params=params, timeout=10, proxies=get_proxys(log))
    response = requests.get(url, headers=headers, params=params, timeout=10)
    response.raise_for_status()
    selector = Selector(response.text)
    tag_li_list = selector.xpath('//*[@id="searchForm"]//ul/li')
    info_list = []
    for tag_li in tag_li_list:
        detail_url_str = tag_li.xpath('./a/@href').get()
        if not detail_url_str:  # skip list items without a detail link
            continue
        card_id = detail_url_str.split('/')[-2]
        detail_url = f"https://asia.pokemon-card.com{detail_url_str}"
        img = tag_li.xpath('.//img/@src').get()
        if not img:
            # Lazy-loaded images keep the real URL in data-original.
            img = tag_li.xpath('.//img/@data-original').get()
        data_dict = {
            "card_id": card_id,
            "major_category_name": expansion_series,
            "pg_label": expansion_title,
            "sales_date": expansion_release_time,
            "detail_url": detail_url,
            "img": img,
            "crawler_language": crawler_language
        }
        info_list.append(data_dict)
    if info_list:
        sql_pool.insert_many(table="pokemon_card_record", data_list=info_list, ignore=True)
    return len(tag_li_list)


def get_data_list(log, sql_pool, cate_tuple):
    """Page through one expansion's card list until a short page or the page cap."""
    page = 1
    max_page = 600
    while page <= max_page:
        try:
            log.debug(f'--------------- {inspect.currentframe().f_code.co_name}, page {page}, start ---------------')
            len_items = get_list_single_page(log, sql_pool, page, cate_tuple)
        except Exception as e:
            log.error(f"{inspect.currentframe().f_code.co_name} Request get_list_single_page for page: {page}, {e}")
            len_items = 0
        if len_items < 20:
            log.debug(f'--------------- page {page} has {len_items} items, break ---------------')
            break
        if page > 50:
            # Hard safety cap; this fires long before max_page is reached.
            log.debug(f'--------------- page {page} has {len_items} items, [page > 50] ->->-> break ---------------')
            break
        page += 1
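# Example wiring for a single expansion (hypothetical values): `cate_tuple` mirrors
# the column order of the SELECT in fz_pokemon_main(). The expansionCodes=M1L URL
# comes from the commented example in __main__; series/title are placeholders.
#
#     cate_tuple = (
#         "https://asia.pokemon-card.com/tw/card-search/list/?expansionCodes=M1L",  # expansion_link
#         "<expansion_series>",
#         "<expansion_title>",
#         "03-28-2025",  # expansion_release_time, normalized inside to '2025-03-28'
#     )
#     get_data_list(logger, MySQLConnectionPool(log=logger), cate_tuple)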
# ----------------------------------------------------------------------------------------------------------------------
@retry(stop=stop_after_attempt(5), wait=wait_fixed(1), after=after_log)
def get_details(log, sql_id_detail_url: tuple, sql_pool):
    """Crawl one card detail page and backfill card_name / card_no."""
    log.debug(f'Request get_details for sql_id_detail_url: {sql_id_detail_url}')
    # url = "https://asia.pokemon-card.com/tw/card-search/detail/13958/"
    url = sql_id_detail_url[1]
    # response = requests.get(url, headers=headers, timeout=10, proxies=get_proxys(log))
    response = requests.get(url, headers=headers, timeout=10)
    response.raise_for_status()
    selector = Selector(response.text)
    card_name = selector.xpath('//div[@class="wrapper"]/header/h1/text()').getall()
    card_name = ''.join(card_name) if card_name else None
    card_name = card_name.strip() if card_name else None
    card_no = selector.xpath('//div[@class="wrapper"]//span[@class="collectorNumber"]/text()').get()
    card_no = card_no.strip() if card_no else None
    data_dict = {
        "card_name": card_name,
        "card_no": card_no
    }
    sql_pool.update_one_or_dict(
        table="pokemon_card_record",
        data=data_dict,
        condition={"id": sql_id_detail_url[0]}
    )


@retry(stop=stop_after_attempt(100), wait=wait_fixed(3600), after=after_log)
def fz_pokemon_main(log):
    """
    Main entry point: categories -> card lists -> card details.

    Note: tenacity only re-invokes this function when it raises, so the hourly
    retry (wait_fixed(3600)) kicks in on pool failures, not after a clean run.
    """
    log.info(f'Starting crawler task {inspect.currentframe().f_code.co_name}.............................................')
    # Set up the MySQL connection pool.
    sql_pool = MySQLConnectionPool(log=log)
    if not sql_pool.check_pool_health():
        log.error("Database connection pool is unhealthy")
        raise RuntimeError("Database connection pool is unhealthy")
    try:
        # Fetch the category (expansion) list.
        log.debug(".......... fetching category list ..........")
        try:
            get_category_list(log, sql_pool)
        except Exception as e:
            log.error(f"{inspect.currentframe().f_code.co_name} Request get_category_list error: {e}")
        # Fetch the card list of every category,
        # e.g. https://asia.pokemon-card.com/tw/card-search/list/?expansionCodes=M1L
        sql_cate_list = sql_pool.select_all(
            f"SELECT expansion_link,expansion_series,expansion_title,expansion_release_time FROM pokemon_fanz_category WHERE crawler_language='{crawler_language}'")
        for cate_tuple in sql_cate_list:
            try:
                log.debug(f'Request get_data_list for cate: {cate_tuple}')
                get_data_list(log, sql_pool, cate_tuple)
            except Exception as e:
                log.error(f"{inspect.currentframe().f_code.co_name} Request get_data_list error: {e}")
        # Fetch card details for rows whose card_name is still missing.
        log.debug("........... fetching card details ..........")
        sql_item_id_list = sql_pool.select_all(
            f"SELECT id, detail_url FROM pokemon_card_record WHERE card_name IS NULL AND crawler_language='{crawler_language}'")
        for id_detail_url in sql_item_id_list:
            try:
                get_details(log, id_detail_url, sql_pool)
            except Exception as e:
                log.error(f"Request get_details error: {e}")
    except Exception as e:
        log.error(f'{inspect.currentframe().f_code.co_name} error: {e}')
    finally:
        log.info(f'Crawler {inspect.currentframe().f_code.co_name} finished; waiting for the next collection round............')


if __name__ == '__main__':
    # get_list_single_page(logger, None, 1, 'https://asia.pokemon-card.com/tw/card-search/list/?expansionCodes=M1L')
    # get_details(logger, (None, None), None)
    # get_category_list(logger)
    fz_pokemon_main(logger)
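# Schema notes (the column names are taken verbatim from the insert payloads above;
# everything else, including types and keys, is an assumption for reference only):
#
#   pokemon_fanz_category : expansion_link, expansion_img, expansion_series,
#       expansion_title, expansion_release_time, crawler_language
#       (a unique key on expansion_link is assumed so that ignore=True deduplicates)
#   pokemon_card_record   : card_id, major_category_name, pg_label, sales_date,
#       detail_url, img, crawler_language, plus card_name / card_no backfilled by
#       get_details(); an auto-increment primary key `id` is assumed.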