下每张图为

。
Args:
log: logger 对象。
session (requests.Session): curl_cffi 会话对象。
impersonate (str): 浏览器指纹标识。
detail_url (str): LotDetail.aspx?inventoryid=xxx 的绝对 URL。
Returns:
list[dict]: 每张图一条 {"large": 大图URL, "thumb": 缩略图URL};无图返回空列表。
"""
log.debug(f"获取详情图 {detail_url}")
resp = session.get(detail_url, headers=headers, impersonate=impersonate,
proxies=get_proxys(log), timeout=15)
resp.raise_for_status()
sel = Selector(resp.text)
panel = sel.css('div#ThumbPanel')
if not panel:
return []
images = []
for a in panel.css('a[href]'):
large = (a.attrib.get("href") or "").strip()
        # Skip highslide's functional anchors (href="#"); they carry no image.
if not large or large.startswith("#"):
continue
thumb = (a.css('img::attr(src)').get() or "").strip()
images.append({
"large": urljoin(detail_url, large), # 拼绝对 URL
"thumb": urljoin(detail_url, thumb) if thumb else "",
})
return images
def crawl_one_auction(log, sql_pool, session, impersonate,
                      auction_id, auction_name):
    """Crawl the lot list of a single auction (list only, no detail pages).

    This is the first stage of a two-stage design (the original notes say it
    mirrors the lelands crawler): this function only stores the list rows,
    while the detail images are filled in later by
    update_details_for_pending, which scans records with state != 1.

    Args:
        log: Logger object.
        sql_pool: MySQL connection pool; pass None to only return the data
            without writing to the database.
        session (requests.Session): curl_cffi session object.
        impersonate (str): Browser fingerprint identifier.
        auction_id (str): Target auction id.
        auction_name (str): Target auction display name.

    Returns:
        list[dict]: Every lot row of the auction (without detail images,
        which are added in the detail stage).
    """
    lot_rows = fetch_auction_lots(
        log, session, impersonate, auction_id, auction_name
    )

    # Persist only when there is a pool and something to write; per the
    # original note, new rows are expected to default to state 0
    # (detail pending).
    if lot_rows and sql_pool is not None:
        sql_pool.insert_many(
            table="wheatland_record",
            data_list=lot_rows,
            ignore=True,
        )

    lot_count = len(lot_rows)
    log.info(f"auction={auction_id}({auction_name}) 共抓 {lot_count} 条 lot")
    return lot_rows
def get_details(log, url, sql_pool, sql_id):
    """Fetch the detail-page images of one stored record and write them back.

    Calls fetch_lot_images on the detail page, joins the large-image URLs
    with commas and stores the result in the ``imgs`` column of
    ``wheatland_record``, setting ``state`` to 1 for the row. (If thumbnails
    are needed as well, the large+thumb pairs returned by fetch_lot_images
    could be stored as JSON instead.)

    Args:
        log: Logger object.
        url (str): Detail page URL.
        sql_pool: MySQL connection pool; when None, nothing is written and
            the result is only logged and returned (dry run), matching how
            crawl_one_auction treats a missing pool.
        sql_id: Database record id.

    Returns:
        str | None: Comma-joined large-image URLs, or None when the page has
        no images.
    """
    log.info(f">>> 补抓详情 {url}")
    impersonate = random.choice(client_identifier_list)
    with requests.Session() as session:
        images = fetch_lot_images(log, session, impersonate, url)

    # Store NULL rather than an empty string when there is nothing to join.
    large_urls = [img["large"] for img in (images or []) if img["large"]]
    imgs_str = ",".join(large_urls) if large_urls else None

    if sql_pool is None:
        # Dry run (e.g. the __main__ smoke call): no database to update.
        log.info(f"sql_pool is None, skip db update; imgs={imgs_str}")
        return imgs_str

    sql_pool.update_one_or_dict(
        table="wheatland_record",
        data={"imgs": imgs_str, "state": 1},
        condition={"id": sql_id},
    )
    return imgs_str
def update_details_for_pending(log, sql_pool):
    """Back-fill detail images for every record whose state is not 1.

    Selects ``id`` and ``detail_url`` from ``wheatland_record`` where
    ``state != 1`` and calls get_details for each row. A row whose fetch
    raises is logged and marked ``state = 2``; processing then continues
    with the next row. A failure while writing that marker is also logged
    instead of aborting the remaining rows.

    Args:
        log: Logger object.
        sql_pool: MySQL connection pool.
    """
    log.debug("Updating detail pages ...")
    rows = sql_pool.select_all(
        "select id, detail_url from wheatland_record where state != 1"
    )

    # ``rows or ()`` tolerates a helper that reports "no rows" as None.
    for row in rows or ():
        sql_id, detail_url = row[0], row[1]
        try:
            get_details(log, detail_url, sql_pool, sql_id)
        except Exception as e:
            log.error(f"Error getting details for {detail_url}: {e}")
            # Marking the failure must not itself stop the batch: one bad
            # write would otherwise skip every remaining pending record.
            try:
                sql_pool.update_one_or_dict(
                    table="wheatland_record",
                    data={"state": 2},
                    condition={"id": sql_id},
                )
            except Exception as mark_err:
                log.error(
                    f"Failed to mark record {sql_id} as state=2: {mark_err}"
                )
if __name__ == '__main__':
    # Manual smoke run against one fixed lot detail page.
    # NOTE(review): sql_pool is passed as None here with record id 1;
    # confirm get_details tolerates a missing pool before relying on this.
    sample_detail_url = 'https://wheatlandauctionservices.com/LotDetail.aspx?inventoryid=15233'
    get_details(logger, sample_detail_url, None, 1)