# -*- coding: utf-8 -*-
# Author : Charley
# Python : 3.12.10
# Date : 2026/5/29
"""
Shared Wheatland helpers: HTTP configuration, ASP.NET postback auction
switching, list parsing and detail-page image parsing.

Reused by wheatland_history.py / wheatland_spider.py.

Target site: https://wheatlandauctionservices.com/auctionresults.aspx

Key points:
    1. The site is ASP.NET WebForms (.aspx); every POST must carry
       __VIEWSTATE / __VIEWSTATEGENERATOR / __EVENTVALIDATION.
    2. The "Select Auction" drop-down at the top fires
       __doPostBack(AuctionDDL, '') on change and POSTs back to the same URL.
    3. After switching auction, the #SearchGrid table renders every lot of
       that auction directly (observed: no pagination).
    4. On the detail page LotDetail.aspx?inventoryid=xxx, the #ThumbPanel
       element holds the links to all images of the lot.
"""
import os
import random
from urllib.parse import urljoin, urlparse, parse_qs

from curl_cffi import requests
from curl_cffi.requests import BrowserType
from loguru import logger
from parsel import Selector
from tenacity import retry, stop_after_attempt, wait_fixed

BASE_URL = "https://wheatlandauctionservices.com/auctionresults.aspx"

# Use every browser fingerprint that the library ships with.
client_identifier_list = [b.value for b in BrowserType]

# NOTE(review): this fixed Chrome user-agent is sent together with whatever
# `impersonate` profile the caller picked — confirm the mismatch is acceptable.
headers = {
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,"
              "image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "accept-language": "en-US,en;q=0.9",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
}

# ASP.NET control names (the prefix is fixed).
AUCTION_DDL_NAME = "ctl00$ContentPlaceHolder$AuctionSelector$AuctionDDL"
SEARCH_TB_NAME = "ctl00$ContentPlaceHolder$SearchTB"
SEARCH_BY_DDL_NAME = "ctl00$ContentPlaceHolder$SearchByDDL"

# SECURITY(review): credentials are embedded in source. They are kept only as a
# fallback so existing deployments keep working; prefer setting the
# WHEATLAND_HTTP_PROXY / WHEATLAND_HTTPS_PROXY environment variables and then
# remove this default.
_DEFAULT_PROXY_URL = "http://u1952150085001297:sJMHl4qc4bM0@proxy.123proxy.cn:36931"


def after_log(retry_state):
    """Tenacity ``after`` callback that logs the outcome of each attempt.

    Args:
        retry_state: tenacity.RetryCallState, passed in by the retry framework.
    """
    # Decorated functions take the logger as their first positional argument;
    # fall back to the module-level loguru logger when no positional args exist.
    log = retry_state.args[0] if retry_state.args else logger
    fn_name = retry_state.fn.__name__
    attempt = retry_state.attempt_number
    if retry_state.outcome.failed:
        log.warning(f"Function '{fn_name}', Attempt {attempt} Times")
    else:
        log.info(f"Function '{fn_name}', Attempt {attempt} succeeded")


@retry(stop=stop_after_attempt(5), wait=wait_fixed(2), after=after_log)
def get_proxys(log):
    """Build the proxy mapping used for every outgoing request.

    The proxy URLs are read from the WHEATLAND_HTTP_PROXY and
    WHEATLAND_HTTPS_PROXY environment variables; when a variable is unset or
    empty the built-in default URL is used.

    Args:
        log: logger object. Unused here, but kept so the signature stays
            unchanged and ``after_log`` can pick it up as the first argument.

    Returns:
        dict: requests-style mapping ``{"http": ..., "https": ...}``.
    """
    http_proxy = os.environ.get("WHEATLAND_HTTP_PROXY") or _DEFAULT_PROXY_URL
    https_proxy = os.environ.get("WHEATLAND_HTTPS_PROXY") or _DEFAULT_PROXY_URL
    return {"http": http_proxy, "https": https_proxy}


def _pick_hidden(selector, field_name):
    """Return the value of an ASP.NET hidden input (__VIEWSTATE etc.).

    Args:
        selector (Selector): parsel.Selector of the page.
        field_name (str): ``name`` attribute of the hidden input.

    Returns:
        str: the field value, or an empty string when it is absent.
    """
    return selector.xpath(f'//input[@name="{field_name}"]/@value').get() or ""


def extract_state(selector):
    """Extract the three ASP.NET WebForms hidden fields from a page.

    Args:
        selector (Selector): parsel.Selector of the current page.

    Returns:
        dict: mapping with the keys __VIEWSTATE, __VIEWSTATEGENERATOR and
        __EVENTVALIDATION.

    Raises:
        ValueError: if any of the hidden fields is missing or empty.
    """
    state = {}
    for name in ("__VIEWSTATE", "__VIEWSTATEGENERATOR", "__EVENTVALIDATION"):
        value = _pick_hidden(selector, name)
        if not value:
            raise ValueError(f"页面缺失隐藏字段: {name}")
        state[name] = value
    return state


def parse_auction_options(selector):
    """Parse the "Select Auction" drop-down into a list of auctions.

    Options with an empty value, the value "-1" ("All"), or a name containing
    "test" (case-insensitive) are skipped.

    Args:
        selector (Selector): parsel.Selector of the landing page.

    Returns:
        list[dict]: items shaped like ``{"id": "49", "name": "April 2026 ..."}``.

    Raises:
        ValueError: if the page has no ``select#AuctionDDL`` element.
    """
    select = selector.css('select#AuctionDDL')
    if not select:
        raise ValueError("找不到 #AuctionDDL 下拉框")

    result = []
    for opt in select.css('option'):
        aid = (opt.attrib.get("value") or "").strip()
        name = opt.xpath('normalize-space(.)').get() or ""
        if not aid or aid == "-1":
            continue
        if "test" in name.lower():
            continue
        result.append({"id": aid, "name": name})
    return result


@retry(stop=stop_after_attempt(3), wait=wait_fixed(2), after=after_log)
def get_auction_list(log, session, impersonate):
    """GET the landing page and return the filtered list of auctions.

    Args:
        log: logger object.
        session (requests.Session): curl_cffi session.
        impersonate (str): browser fingerprint identifier.

    Returns:
        list[dict]: ``[{"id": "49", "name": "April 2026 ..."}, ...]``.
    """
    log.info("获取全部 auction 列表")
    resp = session.get(
        BASE_URL,
        headers=headers,
        impersonate=impersonate,
        proxies=get_proxys(log),
        timeout=15,
    )
    resp.raise_for_status()
    sel = Selector(resp.text)
    auctions = parse_auction_options(sel)
    log.info(f"共解析到 {len(auctions)} 个 auction:{[(a['id'], a['name']) for a in auctions[:3]]}...")
    return auctions


def parse_inventory_id(href):
    """Extract ``inventoryid`` from a LotDetail.aspx?inventoryid=xxx link.

    Args:
        href (str): href attribute of an ``<a>`` tag, e.g.
            "LotDetail.aspx?inventoryid=35980".

    Returns:
        str: the inventory id, or an empty string when it is not present.
    """
    if not href:
        return ""
    qs = parse_qs(urlparse(href).query)
    return qs.get("inventoryid", [""])[0]


def _clean_price(raw):
    """Drop "$" and "," from a price cell and trim surrounding whitespace."""
    return (raw or "").replace('$', '').replace(',', '').strip()


def parse_search_grid(selector, auction_id, auction_name):
    """Parse every lot row of the #SearchGrid table into a dict.

    Columns are read by position: index 1 = Lot Number, 2 = Title,
    3 = Min Bid, 4 = Final Price, 5 = Status.

    Args:
        selector (Selector): parsel.Selector of the POST response.
        auction_id (str): id of the current auction, copied into each row.
        auction_name (str): display name of the auction, copied into each row.

    Returns:
        list[dict]: one dict per lot row; empty list when the table is absent.
    """
    grid = selector.css('table#SearchGrid')
    if not grid:
        return []

    rows = []
    # Direct <tr> children (with or without <tbody>); the first one is the header.
    trs = grid.xpath('./tr | ./tbody/tr')[1:]
    for tr in trs:
        tds = tr.xpath('./td')
        if len(tds) < 6:
            continue  # malformed row, skip

        # The Lot Number cell carries the <a href="LotDetail.aspx?..."> link.
        lot_href = tds[1].css('a::attr(href)').get() or ""
        inventory_id = parse_inventory_id(lot_href)
        detail_url = urljoin(BASE_URL, lot_href) if lot_href else ""
        # Min Bid is taken from the cell's first direct text node only.
        min_bid = tds[3].xpath('./text()').get()
        final_price = tds[4].xpath('normalize-space(.)').get()

        rows.append({
            "auction_id": auction_id,
            "auction_name": auction_name,
            "lot_number": tds[1].xpath('normalize-space(.)').get() or "",
            "inventory_id": inventory_id,  # taken from the LotDetail link
            "title": tds[2].xpath('normalize-space(.)').get() or "",
            "min_bid": _clean_price(min_bid),
            "final_price": _clean_price(final_price),
            "status": tds[5].xpath('normalize-space(.)').get() or "",
            "detail_url": detail_url,  # absolute detail-page URL
        })
    return rows


@retry(stop=stop_after_attempt(5), wait=wait_fixed(2), after=after_log)
def fetch_auction_lots(log, session, impersonate, auction_id, auction_name):
    """Switch to an auction via an ASP.NET postback and parse its lot list.

    A GET fetches the hidden state fields, then a single POST selects the
    auction; the response is parsed with ``parse_search_grid``.

    Args:
        log: logger object.
        session (requests.Session): curl_cffi session.
        impersonate (str): browser fingerprint identifier.
        auction_id (str): option value of the target auction.
        auction_name (str): display name of the target auction.

    Returns:
        list[dict]: lots parsed by ``parse_search_grid``.
    """
    log.info(f"切换并抓取 auction={auction_id} ({auction_name})")
    proxies = get_proxys(log)

    # 1) Initial GET to obtain the hidden state fields.
    resp = session.get(
        BASE_URL,
        headers=headers,
        impersonate=impersonate,
        proxies=proxies,
        timeout=15,
    )
    resp.raise_for_status()
    state = extract_state(Selector(resp.text))

    # 2) POST that mirrors the drop-down's onchange __doPostBack.
    form_data = {
        "__EVENTTARGET": AUCTION_DDL_NAME,  # control that triggers the postback
        "__EVENTARGUMENT": "",
        "__LASTFOCUS": "",
        "__VIEWSTATE": state["__VIEWSTATE"],
        "__VIEWSTATEGENERATOR": state["__VIEWSTATEGENERATOR"],
        "__EVENTVALIDATION": state["__EVENTVALIDATION"],
        AUCTION_DDL_NAME: str(auction_id),  # selected auction value
        SEARCH_TB_NAME: "",  # empty search box -> full list
        SEARCH_BY_DDL_NAME: "1",  # "Search By" default (Title)
    }
    post_headers = {
        **headers,
        "content-type": "application/x-www-form-urlencoded",
        "referer": BASE_URL,
        "origin": "https://wheatlandauctionservices.com",
    }
    resp2 = session.post(
        BASE_URL,
        headers=post_headers,
        data=form_data,
        impersonate=impersonate,
        proxies=proxies,
        timeout=20,
    )
    resp2.raise_for_status()

    lots = parse_search_grid(Selector(resp2.text), str(auction_id), auction_name)
    log.info(f" auction={auction_id} 列表解析到 {len(lots)} 条 lot")
    return lots


@retry(stop=stop_after_attempt(3), wait=wait_fixed(2), after=after_log)
def fetch_lot_images(log, session, impersonate, detail_url):
    """Fetch a LotDetail.aspx page and collect the image links in #ThumbPanel.

    Expected page structure: inside ``<div id="ThumbPanel">`` each image is an
    ``<a href="large image"><img src="thumbnail"></a>`` pair.

    Args:
        log: logger object.
        session (requests.Session): curl_cffi session.
        impersonate (str): browser fingerprint identifier.
        detail_url (str): absolute URL of LotDetail.aspx?inventoryid=xxx.

    Returns:
        list[dict]: one ``{"large": url, "thumb": url}`` per image (``thumb``
        is "" when the anchor has no ``<img src>``); empty list when the
        panel is missing.
    """
    log.debug(f"获取详情图 {detail_url}")
    resp = session.get(
        detail_url,
        headers=headers,
        impersonate=impersonate,
        proxies=get_proxys(log),
        timeout=15,
    )
    resp.raise_for_status()
    sel = Selector(resp.text)

    panel = sel.css('div#ThumbPanel')
    if not panel:
        return []

    images = []
    for a in panel.css('a[href]'):
        large = (a.attrib.get("href") or "").strip()
        # Skip functional anchors (href="#") that are not image links.
        if not large or large.startswith("#"):
            continue
        thumb = (a.css('img::attr(src)').get() or "").strip()
        images.append({
            "large": urljoin(detail_url, large),  # make absolute
            "thumb": urljoin(detail_url, thumb) if thumb else "",
        })
    return images


def crawl_one_auction(log, sql_pool, session, impersonate, auction_id, auction_name):
    """Fetch the full lot list of one auction and optionally store it.

    Only the list is fetched here; detail images are filled in separately by
    ``update_details_for_pending``.

    Args:
        log: logger object.
        sql_pool: MySQL connection pool; pass None to skip the DB insert.
        session (requests.Session): curl_cffi session.
        impersonate (str): browser fingerprint identifier.
        auction_id (str): target auction id.
        auction_name (str): target auction display name.

    Returns:
        list[dict]: all lots of the auction (without detail images).
    """
    lots = fetch_auction_lots(log, session, impersonate, auction_id, auction_name)

    if sql_pool is not None and lots:
        sql_pool.insert_many(table="wheatland_record", data_list=lots, ignore=True)

    log.info(f"auction={auction_id}({auction_name}) 共抓 {len(lots)} 条 lot")
    return lots


def get_details(log, url, sql_pool, sql_id):
    """Fetch the detail images of one stored record and write them back.

    The ``imgs`` column receives the large-image URLs joined by commas (None
    when no image was found) and ``state`` is set to 1.

    Args:
        log: logger object.
        url (str): detail page URL.
        sql_pool: MySQL connection pool; when None the DB write is skipped,
            consistent with ``crawl_one_auction``.
        sql_id: database id of the record to update.
    """
    log.info(f">>> 补抓详情 {url}")
    impersonate = random.choice(client_identifier_list)
    with requests.Session() as session:
        images = fetch_lot_images(log, session, impersonate, url)

    imgs_str = ",".join(img["large"] for img in images if img["large"]) if images else None

    if sql_pool is None:
        # Nothing to persist (e.g. ad-hoc debugging run); just report the result.
        log.debug(f"sql_pool is None, skip DB write; imgs={imgs_str}")
        return

    sql_pool.update_one_or_dict(
        table="wheatland_record",
        data={"imgs": imgs_str, "state": 1},
        condition={"id": sql_id},
    )


def update_details_for_pending(log, sql_pool):
    """Fetch detail images for every record whose state is not 1.

    A record whose detail fetch raises is marked with state 2 and the loop
    continues with the next record.

    Args:
        log: logger object.
        sql_pool: MySQL connection pool.
    """
    log.debug("Updating detail pages ...")
    rows = sql_pool.select_all(
        "select id, detail_url from wheatland_record where state != 1"
    )
    # `or ()` tolerates a pool that returns None instead of an empty result.
    for row in rows or ():
        sql_id, detail_url = row[0], row[1]
        try:
            get_details(log, detail_url, sql_pool, sql_id)
        except Exception as e:
            log.error(f"Error getting details for {detail_url}: {e}")
            sql_pool.update_one_or_dict(
                table="wheatland_record",
                data={"state": 2},
                condition={"id": sql_id},
            )


# if __name__ == '__main__':
#     get_details(logger, 'https://wheatlandauctionservices.com/LotDetail.aspx?inventoryid=15233', None, 1)