| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404 |
- # -*- coding: utf-8 -*-
- # Author : Charley
- # Python : 3.12.10
- # Date : 2026/5/29
- """
- Wheatland 公用模块:HTTP 配置、ASP.NET postback 切换 auction、列表解析、详情图解析。
- 被 wheatland_history.py / wheatland_spider.py 复用。
- 目标网站: https://wheatlandauctionservices.com/auctionresults.aspx
- 逻辑要点:
- 1. ASP.NET WebForms (.aspx),每次 POST 都要带 __VIEWSTATE / __VIEWSTATEGENERATOR / __EVENTVALIDATION。
- 2. 顶部 "Select Auction" 下拉框 onchange 触发 __doPostBack(AuctionDDL,''),POST 回原 URL。
- 3. 切换 auction 后 #SearchGrid 表格直接渲染该场次全部 lot(实测无翻页)。
- 4. 详情页 LotDetail.aspx?inventoryid=xxx 的 #ThumbPanel 内即为多图链接。
- """
- import random
- from loguru import logger
- from parsel import Selector
- from curl_cffi import requests
- from curl_cffi.requests import BrowserType
- from urllib.parse import urljoin, urlparse, parse_qs
- from tenacity import retry, stop_after_attempt, wait_fixed
# Auction results page; every GET and every postback POST targets this URL.
BASE_URL = "https://wheatlandauctionservices.com/auctionresults.aspx"

# All browser fingerprint identifiers that ship with curl_cffi.
client_identifier_list = [browser.value for browser in BrowserType]

# Base request headers shared by every request in this module.
# NOTE(review): this fixed Chrome 120 user-agent is sent together with
# whichever impersonate fingerprint the caller picks -- confirm the
# combination is intended.
headers = {
    "accept": (
        "text/html,application/xhtml+xml,application/xml;q=0.9,"
        "image/avif,image/webp,image/apng,*/*;q=0.8,"
        "application/signed-exchange;v=b3;q=0.7"
    ),
    "accept-language": "en-US,en;q=0.9",
    "user-agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    ),
}

# ASP.NET control names (form field names used in the postback payload).
AUCTION_DDL_NAME = "ctl00$ContentPlaceHolder$AuctionSelector$AuctionDDL"
SEARCH_TB_NAME = "ctl00$ContentPlaceHolder$SearchTB"
SEARCH_BY_DDL_NAME = "ctl00$ContentPlaceHolder$SearchByDDL"
def after_log(retry_state):
    """Tenacity ``after`` callback that logs the result of every attempt.

    The logger is taken from the decorated call itself: the first positional
    argument, or the ``log`` keyword argument when the call passed no
    positional arguments. When neither yields a logger, the module-level
    ``logger`` is used.

    Args:
        retry_state: tenacity.RetryCallState supplied by the retry framework.
    """
    if retry_state.args:
        log = retry_state.args[0]
    else:
        log = retry_state.kwargs.get("log")
    if log is None:
        log = logger

    fn_name = retry_state.fn.__name__
    attempt = retry_state.attempt_number
    if retry_state.outcome.failed:
        # Include the exception so the reason for the retry is visible;
        # otherwise only the final RetryError ever reaches the logs.
        error = retry_state.outcome.exception()
        log.warning(
            f"Function '{fn_name}', Attempt {attempt} Times, error: {error!r}"
        )
    else:
        log.info(f"Function '{fn_name}', Attempt {attempt} succeeded")
@retry(stop=stop_after_attempt(5), wait=wait_fixed(2), after=after_log)
def get_proxys(log):
    """Return the proxy mapping used by every request in this module.

    The proxy URLs can be overridden without touching the code through the
    ``WHEATLAND_HTTP_PROXY`` and ``WHEATLAND_HTTPS_PROXY`` environment
    variables. When ``WHEATLAND_HTTPS_PROXY`` is unset, the HTTP proxy URL is
    reused for HTTPS traffic. When neither is set, the built-in default is
    used so existing deployments keep working.

    Args:
        log: logger object. Unused here; kept for the call signature and so
            the retry callback can pick it up.

    Returns:
        dict: requests-style ``{"http": ..., "https": ...}`` proxy mapping.
    """
    import os  # local import keeps this change self-contained

    # SECURITY(review): these credentials are committed to source control.
    # Rotate them and supply the proxy through the environment variables
    # above, then delete this fallback.
    default_proxy = "http://u1952150085001297:sJMHl4qc4bM0@proxy.123proxy.cn:36931"

    http_proxy = os.environ.get("WHEATLAND_HTTP_PROXY") or default_proxy
    https_proxy = os.environ.get("WHEATLAND_HTTPS_PROXY") or http_proxy
    return {"http": http_proxy, "https": https_proxy}
def _pick_hidden(selector, field_name):
    """Read the value of one ASP.NET hidden input (e.g. __VIEWSTATE).

    Args:
        selector (Selector): parsel.Selector for the page.
        field_name (str): ``name`` attribute of the hidden input.

    Returns:
        str: the input's value, or "" when it is missing or empty.
    """
    query = f'//input[@name="{field_name}"]/@value'
    found = selector.xpath(query).get()
    return found if found else ""


def extract_state(selector):
    """Collect the three ASP.NET WebForms hidden fields needed for a postback.

    Args:
        selector (Selector): parsel.Selector for the current page.

    Returns:
        dict: values keyed by __VIEWSTATE, __VIEWSTATEGENERATOR and
        __EVENTVALIDATION.

    Raises:
        ValueError: if any of the three fields is missing or empty.
    """
    collected = {}
    for field in ("__VIEWSTATE", "__VIEWSTATEGENERATOR", "__EVENTVALIDATION"):
        if not (found := _pick_hidden(selector, field)):
            raise ValueError(f"页面缺失隐藏字段: {field}")
        collected[field] = found
    return collected
def parse_auction_options(selector):
    """List the selectable auctions from the "Select Auction" dropdown.

    Options are dropped when their value is empty or "-1" (the "All" entry),
    or when their label contains "test" in any letter case.

    Args:
        selector (Selector): parsel.Selector for the results page.

    Returns:
        list[dict]: one ``{"id": ..., "name": ...}`` per kept option, in
        page order.

    Raises:
        ValueError: if the page has no ``select#AuctionDDL`` element.
    """
    dropdown = selector.css('select#AuctionDDL')
    if not dropdown:
        raise ValueError("找不到 #AuctionDDL 下拉框")

    auctions = []
    for option in dropdown.css('option'):
        value = (option.attrib.get("value") or "").strip()
        label = option.xpath('normalize-space(.)').get() or ""
        is_placeholder = value in ("", "-1")
        if is_placeholder or "test" in label.lower():
            continue
        auctions.append({"id": value, "name": label})
    return auctions
@retry(stop=stop_after_attempt(3), wait=wait_fixed(2), after=after_log)
def get_auction_list(log, session, impersonate):
    """GET the results page and return the filtered list of auctions.

    Args:
        log: logger object.
        session (requests.Session): curl_cffi session.
        impersonate (str): browser fingerprint identifier for the request.

    Returns:
        list[dict]: ``[{"id": ..., "name": ...}, ...]`` as produced by
        parse_auction_options.
    """
    log.info("获取全部 auction 列表")
    response = session.get(
        BASE_URL,
        headers=headers,
        impersonate=impersonate,
        proxies=get_proxys(log),
        timeout=15,
    )
    response.raise_for_status()

    auctions = parse_auction_options(Selector(response.text))
    # Only the first three entries are shown to keep the log line short.
    preview = [(item['id'], item['name']) for item in auctions[:3]]
    log.info(f"共解析到 {len(auctions)} 个 auction:{preview}...")
    return auctions
def parse_inventory_id(href):
    """Extract the inventory id from a ``LotDetail.aspx?inventoryid=xxx`` link.

    The query key is matched case-insensitively, so ``InventoryID=`` and
    ``inventoryid=`` are treated the same. Surrounding whitespace in the
    link or the value is ignored.

    Args:
        href (str): href of the lot link, e.g.
            "LotDetail.aspx?inventoryid=35980".

    Returns:
        str: the inventory id, or "" when ``href`` is empty or carries no
        inventory id.
    """
    if not href:
        return ""
    query = urlparse(href.strip()).query
    for key, values in parse_qs(query).items():
        if key.lower() == "inventoryid":
            # parse_qs keeps values in order of appearance; take the first.
            return values[0].strip()
    return ""
def parse_search_grid(selector, auction_id, auction_name):
    """Parse every lot row of the ``table#SearchGrid`` results table.

    Cells are read by position: index 1 is the lot number (with the detail
    link), 2 the title, 3 the minimum bid, 4 the final price and 5 the
    status. Both price columns have "$" and "," removed and surrounding
    whitespace trimmed.

    Args:
        selector (Selector): parsel.Selector for the postback response.
        auction_id (str): id of the current auction, copied into each row.
        auction_name (str): display name of the auction, copied into each row.

    Returns:
        list[dict]: one dict per lot row; empty when the table is absent.
    """
    grid = selector.css('table#SearchGrid')
    if not grid:
        return []

    def cell_text(cell):
        # Whitespace-normalised text of the whole cell.
        return cell.xpath('normalize-space(.)').get() or ""

    def as_amount(text):
        # Drop currency formatting; strip so markup indentation or newlines
        # around the number never reach the stored value.
        return text.replace('$', '').replace(',', '').strip()

    rows = []
    # The first <tr> is the header row and is skipped.
    for tr in grid.xpath('./tr | ./tbody/tr')[1:]:
        tds = tr.xpath('./td')
        if len(tds) < 6:
            continue  # malformed row

        lot_href = tds[1].css('a::attr(href)').get() or ""
        if lot_href:
            inventory_id = parse_inventory_id(lot_href)
            detail_url = urljoin(BASE_URL, lot_href)
        else:
            # No link: both derived fields are empty.
            inventory_id = detail_url = ""

        # Min Bid is read from the cell's first direct text node.
        min_bid = tds[3].xpath('./text()').get() or ""

        rows.append({
            "auction_id": auction_id,
            "auction_name": auction_name,
            "lot_number": cell_text(tds[1]),
            "inventory_id": inventory_id,
            "title": cell_text(tds[2]),
            "min_bid": as_amount(min_bid),
            "final_price": as_amount(cell_text(tds[4])),
            "status": cell_text(tds[5]),
            "detail_url": detail_url,
        })
    return rows
@retry(stop=stop_after_attempt(5), wait=wait_fixed(2), after=after_log)
def fetch_auction_lots(log, session, impersonate, auction_id, auction_name):
    """Switch the results page to one auction via postback and parse its lots.

    A GET fetches the current hidden-field state, then a POST replays the
    dropdown's __doPostBack with the chosen auction value. The response is
    parsed in one pass; no pagination is followed.

    Args:
        log: logger object.
        session (requests.Session): curl_cffi session.
        impersonate (str): browser fingerprint identifier.
        auction_id (str): option value of the target auction.
        auction_name (str): display name of the target auction.

    Returns:
        list[dict]: lot rows as produced by parse_search_grid.
    """
    log.info(f"切换并抓取 auction={auction_id} ({auction_name})")
    proxy_map = get_proxys(log)

    # Step 1: GET the page to obtain the hidden WebForms fields.
    landing = session.get(
        BASE_URL,
        headers=headers,
        impersonate=impersonate,
        proxies=proxy_map,
        timeout=15,
    )
    landing.raise_for_status()
    hidden = extract_state(Selector(landing.text))

    # Step 2: build the postback payload, preserving field order.
    payload = {
        "__EVENTTARGET": AUCTION_DDL_NAME,  # control that triggers the postback
        "__EVENTARGUMENT": "",
        "__LASTFOCUS": "",
    }
    for key in ("__VIEWSTATE", "__VIEWSTATEGENERATOR", "__EVENTVALIDATION"):
        payload[key] = hidden[key]
    payload[AUCTION_DDL_NAME] = str(auction_id)  # selected auction value
    payload[SEARCH_TB_NAME] = ""                 # empty search box
    payload[SEARCH_BY_DDL_NAME] = "1"            # "Search By" selection

    form_headers = dict(headers)
    form_headers.update({
        "content-type": "application/x-www-form-urlencoded",
        "referer": BASE_URL,
        "origin": "https://wheatlandauctionservices.com",
    })

    switched = session.post(
        BASE_URL,
        headers=form_headers,
        data=payload,
        impersonate=impersonate,
        proxies=proxy_map,
        timeout=20,
    )
    switched.raise_for_status()

    lots = parse_search_grid(Selector(switched.text), str(auction_id), auction_name)
    log.info(f" auction={auction_id} 列表解析到 {len(lots)} 条 lot")
    return lots
@retry(stop=stop_after_attempt(3), wait=wait_fixed(2), after=after_log)
def fetch_lot_images(log, session, impersonate, detail_url):
    """Fetch a lot detail page and collect the image links in ``#ThumbPanel``.

    Each ``<a href>`` inside the panel contributes one entry: its href is the
    large image and the ``src`` of its nested ``<img>`` is the thumbnail.
    Anchors whose href is empty or starts with "#" are ignored.

    Args:
        log: logger object.
        session (requests.Session): curl_cffi session.
        impersonate (str): browser fingerprint identifier.
        detail_url (str): absolute URL of the detail page.

    Returns:
        list[dict]: ``{"large": url, "thumb": url_or_empty}`` per image;
        empty when the panel is missing or holds no usable links.
    """
    log.debug(f"获取详情图 {detail_url}")
    response = session.get(
        detail_url,
        headers=headers,
        impersonate=impersonate,
        proxies=get_proxys(log),
        timeout=15,
    )
    response.raise_for_status()

    thumb_panel = Selector(response.text).css('div#ThumbPanel')
    if not thumb_panel:
        return []

    gallery = []
    for anchor in thumb_panel.css('a[href]'):
        href = (anchor.attrib.get("href") or "").strip()
        if href and not href.startswith("#"):
            preview = (anchor.css('img::attr(src)').get() or "").strip()
            gallery.append({
                # Resolve against the detail page to get absolute URLs.
                "large": urljoin(detail_url, href),
                "thumb": urljoin(detail_url, preview) if preview else "",
            })
    return gallery
def crawl_one_auction(log, sql_pool, session, impersonate,
                      auction_id, auction_name):
    """Fetch the lot list of one auction and optionally store it.

    Only the list is handled here; detail pages are not visited.

    Args:
        log: logger object.
        sql_pool: MySQL pool; when None the rows are returned without being
            stored.
        session (requests.Session): curl_cffi session.
        impersonate (str): browser fingerprint identifier.
        auction_id (str): id of the target auction.
        auction_name (str): display name of the target auction.

    Returns:
        list[dict]: the lot rows fetched for this auction.
    """
    fetched = fetch_auction_lots(
        log, session, impersonate, auction_id, auction_name
    )

    # Write only when a pool was supplied and there is something to insert.
    if sql_pool is not None:
        if fetched:
            sql_pool.insert_many(
                table="wheatland_record", data_list=fetched, ignore=True
            )

    log.info(f"auction={auction_id}({auction_name}) 共抓 {len(fetched)} 条 lot")
    return fetched
def get_details(log, url, sql_pool, sql_id):
    """Fetch the images of one stored lot and write them back to its record.

    The large-image URLs are joined with commas into the ``imgs`` column
    (None when no images were found) and the record's ``state`` is set to 1.

    Args:
        log: logger object.
        url (str): detail page URL.
        sql_pool: MySQL pool.
        sql_id: id of the record in ``wheatland_record``.
    """
    log.info(f">>> 补抓详情 {url}")
    # A random browser fingerprint and a fresh session for each record.
    fingerprint = random.choice(client_identifier_list)
    with requests.Session() as http:
        gallery = fetch_lot_images(log, http, fingerprint, url)

    if gallery:
        imgs_value = ",".join(item["large"] for item in gallery if item["large"])
    else:
        imgs_value = None

    sql_pool.update_one_or_dict(
        table="wheatland_record",
        data={"imgs": imgs_value, "state": 1},
        condition={"id": sql_id},
    )
def update_details_for_pending(log, sql_pool):
    """Fetch detail images for every stored record whose state is not 1.

    Each record is processed independently: a failure is logged and the
    record is marked ``state = 2``, then the loop continues. Records with an
    empty ``detail_url`` are marked failed straight away, because a request
    without a URL cannot succeed and would only spend the retry budget.

    Args:
        log: logger object.
        sql_pool: MySQL pool.
    """
    log.debug("Updating detail pages ...")
    rows = sql_pool.select_all(
        "select id, detail_url from wheatland_record where state != 1"
    )

    def mark_failed(record_id):
        # state = 2 flags a record whose detail fetch did not succeed.
        sql_pool.update_one_or_dict(
            table="wheatland_record",
            data={"state": 2},
            condition={"id": record_id},
        )

    # Tolerate a pool that returns None instead of an empty sequence.
    for row in rows or ():
        sql_id, detail_url = row[0], row[1]
        if not detail_url:
            log.warning(f"Skipping record {sql_id}: empty detail_url")
            mark_failed(sql_id)
            continue
        try:
            get_details(log, detail_url, sql_pool, sql_id)
        except Exception as e:
            log.error(f"Error getting details for {detail_url}: {e}")
            mark_failed(sql_id)
- # if __name__ == '__main__':
- # get_details(logger,'https://wheatlandauctionservices.com/LotDetail.aspx?inventoryid=15233', None, 1)
|