# -*- coding: utf-8 -*-
# Author : Charley
# Python : 3.12.10
# Date   : 2026/5/21
"""
Shared Lelands module: HTTP configuration, ASP.NET postback auction switching,
gallery page parsing and detail page parsing.

Reused by lelands_history.py / lelands_spider.py.
"""
import os
import random
import re

import user_agent
from loguru import logger
from parsel import Selector
from curl_cffi import requests
from curl_cffi.requests import BrowserType
from tenacity import retry, stop_after_attempt, wait_fixed

GALLERY_URL = "https://auction.lelands.com/lots/gallery/"

# Use every browser fingerprint that ships with curl_cffi.
client_identifier_list = [b.value for b in BrowserType]

headers = {
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "user-agent": user_agent.generate_user_agent()
}

# Environment variable that, when set to a non-empty value, overrides the proxy URL.
_PROXY_ENV_VAR = "LELANDS_PROXY_URL"

# NOTE(review): proxy credentials are committed in source. They are kept only as a
# backward-compatible fallback; rotate them and supply the URL via the environment.
_DEFAULT_PROXY_URL = "http://u1952150085001297:sJMHl4qc4bM0@proxy.123proxy.cn:36931"

# First double-quoted, non-greedy run of characters in a string.
_QUOTED_RE = re.compile(r'"(.+?)"')


def after_log(retry_state):
    """
    tenacity ``after`` callback: log the result of an attempt.

    The first positional argument of the wrapped call is used as the logger
    (every decorated function in this module takes ``log`` first); when the call
    has no positional arguments the module-level loguru ``logger`` is used.

    :param retry_state: tenacity retry state for the current attempt
    """
    log = retry_state.args[0] if retry_state.args else logger

    if retry_state.outcome.failed:
        log.warning(f"Function '{retry_state.fn.__name__}', Attempt {retry_state.attempt_number} Times")
    else:
        log.info(f"Function '{retry_state.fn.__name__}', Attempt {retry_state.attempt_number} succeeded")


@retry(stop=stop_after_attempt(5), wait=wait_fixed(2), after=after_log)
def get_proxys(log):
    """
    Build the proxy mapping passed to the HTTP client.

    The same proxy URL is used for both schemes. It is read from the
    ``LELANDS_PROXY_URL`` environment variable when that is set and non-empty,
    otherwise the built-in default is used.

    :param log: logger object (unused here; kept so the signature matches callers
                and the retry callback)
    :return: dict with ``"http"`` and ``"https"`` keys
    """
    proxy_url = os.environ.get(_PROXY_ENV_VAR) or _DEFAULT_PROXY_URL
    return {"http": proxy_url, "https": proxy_url}


def extract_auction(description, log=logger):
    """
    Extract the Auction field (the text inside double quotes) from a description list.

    Items that are empty, whitespace-only or not strings are skipped. The first
    item containing a double-quoted run decides the result: its stripped content
    is returned, or None if that content is blank.

    :param description: list of strings as returned by ``selector.getall()``
    :param log: logger object
    :return: the extracted auction string, or None when nothing usable is found
    """
    try:
        if not description or not isinstance(description, list):
            return None

        for item in description:
            if not item or not isinstance(item, str):
                continue
            text = item.strip()
            if not text:
                continue
            match = _QUOTED_RE.search(text)
            if match:
                auction = match.group(1).strip()
                return auction if auction else None
        return None
    except Exception as e:
        # Best-effort helper: never let a parsing problem break the caller.
        log.error(f"extract_auction error: {e}")
        return None


def _pick_hidden(selector, field_id):
    """
    Read an ASP.NET hidden form field (``__VIEWSTATE`` etc.) from a page.

    :param selector: parsel.Selector object
    :param field_id: id of the hidden input, e.g. ``__VIEWSTATE``
    :return: the input's ``value`` attribute, or an empty string when absent
    """
    return selector.xpath(f'//input[@id="{field_id}"]/@value').get() or ''


def parse_auction_list(selector):
    """
    Parse every option of the Auction drop-down on the gallery page.

    Options without a ``value`` attribute are skipped; a missing label becomes
    an empty string.

    :param selector: parsel.Selector object
    :return: [{"id": "-1", "name": "All Auctions"}, {"id": "1005", "name": "2026 Spring Classic"}, ...]
    """
    options = selector.xpath('//select[@id="Auction"]/option')
    result = []
    for opt in options:
        aid = opt.xpath('./@value').get()
        name = opt.xpath('./text()').get()
        if aid is None:
            continue
        result.append({"id": aid.strip(), "name": (name or '').strip()})
    return result


@retry(stop=stop_after_attempt(3), wait=wait_fixed(2), after=after_log)
def get_auction_list(log, session, impersonate):
    """
    GET the gallery landing page and return all concrete auctions
    (the ``-1`` "All Auctions" entry is excluded).

    :param log: logger object
    :param session: requests.Session object
    :param impersonate: browser fingerprint identifier (same one used for setup)
    :return: [{"id": "1005", "name": "2026 Spring Classic"}, ...]
    """
    log.info("获取全部拍卖会列表")
    resp = session.get(
        GALLERY_URL,
        headers=headers,
        impersonate=impersonate,
        proxies=get_proxys(log),
        timeout=15,
    )
    resp.raise_for_status()
    sel = Selector(resp.text)
    all_opts = parse_auction_list(sel)

    # Drop "All Auctions" (-1) and keep only concrete auctions.
    real = [o for o in all_opts if o["id"] != "-1"]
    log.info(f"共解析到 {len(real)} 个拍卖会:{[(o['id'], o['name']) for o in real]}")
    return real


@retry(stop=stop_after_attempt(5), wait=wait_fixed(2), after=after_log)
def setup_auction_session(log, session, impersonate, auction_id):
    """
    Switch the Auction filter to ``auction_id`` by replaying the ASP.NET
    ``__doPostBack`` form submission.

    Per the original author's note, the server-side session then remembers the
    selection, so later ``GET /lots/gallery?page=N`` requests on the same
    session return that auction's lots.

    :param log: logger object
    :param session: requests.Session object
    :param impersonate: browser fingerprint identifier (same one used for setup)
    :param auction_id: '-1' (All Auctions) or a concrete id such as '1005'
    :raises RuntimeError: when the response does not show ``auction_id`` as selected
    """
    log.info(f"切换 Auction -> {auction_id}")
    proxies = get_proxys(log)

    # 1) Initial GET to obtain the ViewState fields.
    resp = session.get(
        GALLERY_URL,
        headers=headers,
        impersonate=impersonate,
        proxies=proxies,
        timeout=15,
    )
    resp.raise_for_status()
    sel = Selector(resp.text)

    # 2) POST the form back with the drop-down as the event target.
    form_data = {
        '__EVENTTARGET': 'ctl00$Auction',
        '__EVENTARGUMENT': '',
        '__LASTFOCUS': '',
        '__VIEWSTATE': _pick_hidden(sel, '__VIEWSTATE'),
        '__VIEWSTATEGENERATOR': _pick_hidden(sel, '__VIEWSTATEGENERATOR'),
        '__EVENTVALIDATION': _pick_hidden(sel, '__EVENTVALIDATION'),
        'ctl00$SearchIn': 'title',
        'ctl00$SearchText': '',
        'ctl00$BrowseBy': 'gallery',
        'ctl00$Auction': str(auction_id),
    }
    post_headers = {
        **headers,
        'Content-Type': 'application/x-www-form-urlencoded',
        'Referer': GALLERY_URL,
        'Origin': 'https://auction.lelands.com',
    }
    resp = session.post(
        GALLERY_URL,
        headers=post_headers,
        data=form_data,
        impersonate=impersonate,
        proxies=proxies,
        timeout=20,
    )
    resp.raise_for_status()

    # 3) Verify that the switch took effect.
    sel2 = Selector(resp.text)
    selected_val = sel2.xpath('//select[@id="Auction"]/option[@selected]/@value').get()
    log.info(f"切换后 Auction 选中值: {selected_val}")
    if selected_val != str(auction_id):
        raise RuntimeError(f"切换 Auction 失败,预期 {auction_id} 实际 {selected_val}")


@retry(stop=stop_after_attempt(5), wait=wait_fixed(2), after=after_log)
def get_single_page(log, page, sql_pool, session, impersonate, auction_id=None, auction_name=None):
    """
    Fetch one gallery page, parse its lots and store them.

    :param log: logger object
    :param page: page number
    :param sql_pool: MySQL connection pool; when None nothing is written
    :param session: requests.Session object
    :param impersonate: browser fingerprint identifier (same one used for setup)
    :param auction_id: auction id the session is switched to; written to
                       lelands_record.auction_id (converted with ``int``)
    :param auction_name: written to lelands_record.auction_name
    :return: number of lots parsed from the page (0 when the page has none)
    """
    log.info(f">>>>>>>>>>>>>> 正在爬取 auction={auction_id}({auction_name}) 第 {page} 页 <<<<<<<<<<<<<<")
    response = session.get(
        GALLERY_URL,
        impersonate=impersonate,
        headers=headers,
        params={"page": f"{page}"},
        proxies=get_proxys(log),
        timeout=10,
        allow_redirects=False,
    )
    response.raise_for_status()
    selector = Selector(response.text)

    tag_div_list = selector.xpath(
        '//div[@class="items"]/div/div[@class="row"]//div[@class="col-lg-3 col-md-4 col-sm-6"]')
    if not tag_div_list:
        log.warning(f"--------------- 第 {page} 页无数据 ---------------")
        return 0

    # Loop-invariant; converted here (after the empty-page check) so an invalid
    # id is only rejected when there is actually a row to write.
    record_auction_id = int(auction_id) if auction_id is not None else None

    info_list = []
    for tag_div in tag_div_list:
        title = tag_div.xpath('.//p/a/text()').get()
        detail_url = tag_div.xpath('.//p/a/@href').get()

        # The <strong> values are read positionally: bids, opening bid, status.
        tag_div_p = tag_div.xpath('.//div/p[2]/strong/text()').getall()
        bids = tag_div_p[0] if tag_div_p else None
        opening_bid = tag_div_p[1] if len(tag_div_p) > 1 else None
        opening_bid = opening_bid.replace('$', '').replace(',', '').strip() if opening_bid else None
        status = tag_div_p[2] if len(tag_div_p) > 2 else None

        price = tag_div.xpath('.//div[@class="item-price"]/a/text()').get()
        price = price.replace('SOLD FOR $', '').replace(',', '').strip() if price else None

        info_list.append({
            "title": title,
            "detail_url": detail_url,
            "bids": bids,
            "opening_bid": opening_bid,
            "status": status,
            "price": price,
            "auction_id": record_auction_id,
            "auction_name": auction_name,
        })

    if info_list and sql_pool is not None:
        sql_pool.insert_many(table="lelands_record", data_list=info_list, ignore=True)
    return len(info_list)


def crawl_one_auction(log, sql_pool, session, impersonate, auction_id, auction_name, max_page=460):
    """
    Crawl every page of one auction: switch the session to the auction, then
    page forward until a page has no data, a page fails, or ``max_page`` is passed.

    :param log: logger object
    :param sql_pool: MySQL connection pool
    :param session: requests.Session object
    :param impersonate: browser fingerprint identifier (same one used for setup)
    :param auction_id: auction id to switch to; written to lelands_record.auction_id
    :param auction_name: written to lelands_record.auction_name
    :param max_page: highest page number to request
    :return: total number of lots collected for this auction
    """
    setup_auction_session(log, session, impersonate, auction_id)

    page = 1
    total = 0
    while page <= max_page:
        try:
            n = get_single_page(log, page, sql_pool, session, impersonate,
                                auction_id=auction_id, auction_name=auction_name)
        except Exception as e:
            # Retries are already exhausted inside get_single_page; stop this auction.
            log.error(f"auction={auction_id} page={page} 抓取失败: {e}")
            break
        if n == 0:
            log.info(f"auction={auction_id} 翻到第 {page} 页无数据,结束")
            break
        total += n
        page += 1

    log.info(f"auction={auction_id}({auction_name}) 共抓取 {total} 条")
    return total


@retry(stop=stop_after_attempt(5), wait=wait_fixed(2), after=after_log)
def get_details(log, url, sql_pool, sql_id):
    """
    Fetch a lot detail page and write its category and image list back to the
    database, marking the record with ``state = 1``.

    :param log: logger object
    :param url: detail page URL
    :param sql_pool: MySQL connection pool
    :param sql_id: database record id
    """
    log.info(f">>>>>>>>>>>>>> 正在爬取详情数据URL: {url} <<<<<<<<<<<<<<")
    response = requests.get(
        url,
        headers=headers,
        impersonate=random.choice(client_identifier_list),
        timeout=10,
        proxies=get_proxys(log),
    )
    response.raise_for_status()
    selector = Selector(response.text)

    category = selector.xpath('//a[@id="MainContent_hCategory"]/text()').get()

    # Auction-name extraction from the detail page is currently disabled:
    # description = selector.xpath('//*[@id="MainContent_lblOldAuction"]/text()').getall()
    # auction = extract_auction(description, log)

    imgs = selector.xpath('//div[@class="col-md-5 col-sm-5"]//a[not(@id="Zoomer")]/@href').getall()
    imgs = ','.join(imgs) if imgs else None

    sql_pool.update_one_or_dict(
        table="lelands_record",
        data={"category": category, "imgs": imgs, "state": 1},
        condition={"id": sql_id}
    )


def update_details_for_pending(log, sql_pool):
    """
    Scan records whose ``state != 1`` and fetch the detail page for each one.

    A record whose detail fetch fails (after get_details' own retries) is
    marked with ``state = 2`` and processing continues with the next record.

    :param log: logger object
    :param sql_pool: MySQL connection pool
    """
    log.debug('Updating detail pages ...........................')
    sql_result = sql_pool.select_all('select id, detail_url from lelands_record where state != 1')

    # Tolerate a None/empty result instead of failing on iteration.
    for row in sql_result or ():
        sql_id, detail_url = row[0], row[1]
        try:
            get_details(log, detail_url, sql_pool, sql_id)
        except Exception as e:
            log.error(f'Error getting details for {detail_url}: {e}')
            sql_pool.update_one_or_dict(
                table="lelands_record",
                data={"state": 2},
                condition={"id": sql_id}
            )