# -*- coding: utf-8 -*-
# File   : auctions90s_core.py
# Author : Charley
# Python : 3.12.10
# Date : 2026/5/28
"""
Shared module for the 90sAuctions crawlers: HTTP configuration, switching the
active auction through an ASP.NET postback, list-page parsing and detail-page
parsing.

Reused by auctions90s_history.py / auctions90s_spider.py / auctions90s_retry.py.
"""
  9. import random
  10. import re
  11. from curl_cffi import requests
  12. import user_agent
  13. from loguru import logger
  14. from parsel import Selector
  15. from curl_cffi.requests import BrowserType
  16. from tenacity import retry, stop_after_attempt, wait_fixed
# Site constants
SITE_ORIGIN = "https://90sauctions.com"
GALLERY_URL = f"{SITE_ORIGIN}/Lots/Gallery"
# Database table name (per the original author's note, its structure is copied from lelands_record)
TABLE_NAME = "auctions90s_record"
# Every browser fingerprint value exposed by curl_cffi's BrowserType enum
client_identifier_list = [b.value for b in BrowserType]
# Default request headers shared by all requests in this module.
# The user-agent is produced once, when the module is imported.
headers = {
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "user-agent": user_agent.generate_user_agent()
}
# List-page price prefix: "CURRENT BID $..." while the lot is open, "SOLD FOR $..." once it has ended
PRICE_PREFIX_RE = re.compile(r'^\s*(?:SOLD\s+FOR|CURRENT\s+BID)\s*\$', re.IGNORECASE)
  30. def after_log(retry_state):
  31. """tenacity retry 回调"""
  32. if retry_state.args and len(retry_state.args) > 0:
  33. log = retry_state.args[0]
  34. else:
  35. log = logger
  36. if retry_state.outcome.failed:
  37. log.warning(f"Function '{retry_state.fn.__name__}', Attempt {retry_state.attempt_number} Times")
  38. else:
  39. log.info(f"Function '{retry_state.fn.__name__}', Attempt {retry_state.attempt_number} succeeded")
  40. @retry(stop=stop_after_attempt(5), wait=wait_fixed(2), after=after_log)
  41. def get_proxys(log):
  42. """
  43. 获取代理。
  44. :param log: (loguru.Logger) 日志对象,用于记录代理获取异常
  45. :return: (dict) 形如 {"http": "...", "https": "..."} 的代理字典
  46. :raises Exception: 当代理服务不可达时由 tenacity 触发重试,最终抛出
  47. """
  48. http_proxy = "http://u1952150085001297:sJMHl4qc4bM0@proxy.123proxy.cn:36931"
  49. https_proxy = "http://u1952150085001297:sJMHl4qc4bM0@proxy.123proxy.cn:36931"
  50. try:
  51. return {"http": http_proxy, "https": https_proxy}
  52. except Exception as e:
  53. log.error(f"Error getting proxy: {e}")
  54. raise e
  55. def extract_auction(description, log=logger):
  56. """
  57. 从详情页 lblOldAuction 的文本列表中提取拍卖会名称(双引号内的字符串)。
  58. :param description: (list[str]) selector.getall() 返回的字符串列表,
  59. 典型形如 ['Item was in Auction "Inaugural Auction",...']
  60. :param log: (loguru.Logger) 日志对象
  61. :return: (str | None) 提取到的 auction 名,失败或空时返回 None
  62. """
  63. try:
  64. if not description or not isinstance(description, list):
  65. return None
  66. for item in description:
  67. if not item or not isinstance(item, str):
  68. continue
  69. text = item.strip()
  70. if not text:
  71. continue
  72. match = re.search(r'"(.+?)"', text)
  73. if match:
  74. auction = match.group(1).strip()
  75. return auction if auction else None
  76. return None
  77. except Exception as e:
  78. log.error(f"extract_auction error: {e}")
  79. return None
  80. def _pick_hidden(selector, field_id):
  81. """
  82. 从 ASP.NET 页面中提取隐藏字段值(__VIEWSTATE / __EVENTVALIDATION 等)。
  83. :param selector: (parsel.Selector) 已解析的页面 Selector
  84. :param field_id: (str) 隐藏字段的 id,如 "__VIEWSTATE"
  85. :return: (str) 字段值,未找到时返回空字符串
  86. """
  87. return selector.xpath(f'//input[@id="{field_id}"]/@value').get() or ''
  88. def parse_auction_list(selector):
  89. """
  90. 解析 Gallery 页面侧边栏 Auction 下拉框中的全部选项。
  91. :param selector: (parsel.Selector) Gallery 首页 Selector
  92. :return: (list[dict]) [{"id": "-1", "name": "All Auctions"}, {"id": "12", "name": "June 2026"}, ...]
  93. """
  94. options = selector.xpath('//select[@id="Auction"]/option')
  95. result = []
  96. for opt in options:
  97. aid = opt.xpath('./@value').get()
  98. name = opt.xpath('./text()').get()
  99. if aid is None:
  100. continue
  101. result.append({"id": aid.strip(), "name": (name or '').strip()})
  102. return result
  103. @retry(stop=stop_after_attempt(3), wait=wait_fixed(2), after=after_log)
  104. def get_auction_list(log, session, impersonate):
  105. """
  106. GET Gallery 首页并解析出全部具体拍卖会(排除 -1 All Auctions)。
  107. :param log: (loguru.Logger) 日志对象
  108. :param session: (curl_cffi.requests.Session) 复用的会话对象
  109. :param impersonate: (str) curl_cffi 浏览器指纹标识
  110. :return: (list[dict]) [{"id": "12", "name": "June 2026"}, ...]
  111. :raises requests.HTTPError: 首页请求非 2xx 时抛出
  112. """
  113. log.info("获取全部拍卖会列表")
  114. resp = session.get(GALLERY_URL, headers=headers, impersonate=impersonate,
  115. proxies=get_proxys(log), timeout=15)
  116. resp.raise_for_status()
  117. sel = Selector(resp.text)
  118. all_opts = parse_auction_list(sel)
  119. # 过滤掉 All Auctions(-1),只保留具体拍卖会
  120. real = [o for o in all_opts if o["id"] != "-1"]
  121. log.info(f"共解析到 {len(real)} 个拍卖会:{[(o['id'], o['name']) for o in real]}")
  122. return real
  123. @retry(stop=stop_after_attempt(5), wait=wait_fixed(2), after=after_log)
  124. def setup_auction_session(log, session, impersonate, auction_id):
  125. """
  126. 通过 ASP.NET __doPostBack 把 Gallery 的 Auction 筛选切换到指定 auction_id。
  127. 切换后服务端 session 记住该选择,后续 GET /Lots/Gallery?page=N 都返回该 auction 数据。
  128. :param log: (loguru.Logger) 日志对象
  129. :param session: (curl_cffi.requests.Session) 复用的会话对象
  130. :param impersonate: (str) curl_cffi 浏览器指纹标识
  131. :param auction_id: (str) "-1"(All Auctions) 或具体 id 如 "12"
  132. :return: None
  133. :raises RuntimeError: 切换后页面中 selected option 与 auction_id 不一致时抛出
  134. """
  135. log.info(f"切换 Auction -> {auction_id}")
  136. proxies = get_proxys(log)
  137. # 1) 首次 GET 拿 ViewState
  138. resp = session.get(GALLERY_URL, headers=headers, impersonate=impersonate,
  139. proxies=proxies, timeout=15)
  140. resp.raise_for_status()
  141. sel = Selector(resp.text)
  142. # 2) 构造 postback 表单。控件名前缀均为 ctl00$
  143. form_data = {
  144. '__EVENTTARGET': 'ctl00$Auction',
  145. '__EVENTARGUMENT': '',
  146. '__LASTFOCUS': '',
  147. '__VIEWSTATE': _pick_hidden(sel, '__VIEWSTATE'),
  148. '__VIEWSTATEGENERATOR': _pick_hidden(sel, '__VIEWSTATEGENERATOR'),
  149. '__EVENTVALIDATION': _pick_hidden(sel, '__EVENTVALIDATION'),
  150. 'ctl00$SearchIn': 'title',
  151. 'ctl00$SearchText': '',
  152. 'ctl00$BrowseBy': 'gallery',
  153. 'ctl00$Auction': str(auction_id),
  154. }
  155. post_headers = {
  156. **headers,
  157. 'Content-Type': 'application/x-www-form-urlencoded',
  158. 'Referer': GALLERY_URL,
  159. 'Origin': SITE_ORIGIN,
  160. }
  161. resp = session.post(GALLERY_URL, headers=post_headers, data=form_data,
  162. impersonate=impersonate, proxies=proxies, timeout=20)
  163. resp.raise_for_status()
  164. # 验证切换是否成功
  165. sel2 = Selector(resp.text)
  166. selected_val = sel2.xpath('//select[@id="Auction"]/option[@selected]/@value').get()
  167. log.info(f"切换后 Auction 选中值: {selected_val}")
  168. if selected_val != str(auction_id):
  169. raise RuntimeError(f"切换 Auction 失败,预期 {auction_id} 实际 {selected_val}")
  170. def _clean_price(raw):
  171. """
  172. 清洗列表页的成交/当前价文本,去掉前缀和千分位逗号。
  173. :param raw: (str | None) 形如 "SOLD FOR $1,850" / "CURRENT BID $325"
  174. :return: (str | None) 纯数字字符串(如 "1850" / "325"),输入为空时返回 None
  175. """
  176. if not raw:
  177. return None
  178. price = PRICE_PREFIX_RE.sub('', raw)
  179. return price.replace(',', '').strip() or None
  180. @retry(stop=stop_after_attempt(5), wait=wait_fixed(2), after=after_log)
  181. def get_single_page(log, page, sql_pool, session, impersonate,
  182. auction_id=None, auction_name=None):
  183. """
  184. 抓取并落库 Gallery 的单页数据。
  185. :param log: (loguru.Logger) 日志对象
  186. :param page: (int) 页码(从 1 开始)
  187. :param sql_pool: (MySQLConnectionPool) 数据库连接池,None 时不落库(调试用)
  188. :param session: (curl_cffi.requests.Session) 复用的会话对象(需已 setup_auction_session)
  189. :param impersonate: (str) curl_cffi 浏览器指纹标识
  190. :param auction_id: (str | None) 当前 session 切换到的 auction id,写入 auctions90s_record.auction_id
  191. :param auction_name: (str | None) 同上,写入 auctions90s_record.auction_name
  192. :return: (int) 本页解析到并落库的条数;无数据时返回 0
  193. """
  194. log.info(f">>>>>>>>>>>>>> 正在爬取 auction={auction_id}({auction_name}) 第 {page} 页 <<<<<<<<<<<<<<")
  195. response = session.get(GALLERY_URL, impersonate=impersonate, headers=headers,
  196. params={"page": f"{page}"},
  197. proxies=get_proxys(log), timeout=10, allow_redirects=False)
  198. response.raise_for_status()
  199. selector = Selector(response.text)
  200. tag_div_list = selector.xpath(
  201. '//div[@class="items"]/div/div[@class="row"]//div[@class="col-lg-3 col-md-4 col-sm-6"]')
  202. if not tag_div_list or len(tag_div_list) == 0:
  203. log.warning(f"--------------- 第 {page} 页无数据 ---------------")
  204. return 0
  205. info_list = []
  206. for tag_div in tag_div_list:
  207. # 商品标题与详情页绝对地址(列表里 a 标签已是完整 url,无需拼接)
  208. title = tag_div.xpath('.//p/a/text()').get()
  209. detail_url = tag_div.xpath('.//p/a/@href').get()
  210. # Bids / Opening Bid / Status 三个 strong 顺序固定
  211. tag_div_p = tag_div.xpath('.//div/p[2]/strong/text()').getall()
  212. bids = tag_div_p[0] if tag_div_p else None
  213. opening_bid = tag_div_p[1] if len(tag_div_p) > 1 else None
  214. opening_bid = opening_bid.replace('$', '').replace(',', '').strip() if opening_bid else None
  215. status = tag_div_p[2] if len(tag_div_p) > 2 else None
  216. # 价格:列表卡片底部 a 文本,进行中为 "CURRENT BID $...",结束后为 "SOLD FOR $..."
  217. price = tag_div.xpath('.//div[@class="item-price"]/a/text()').get()
  218. price = _clean_price(price)
  219. data_dict = {
  220. "title": title,
  221. "detail_url": detail_url,
  222. "bids": bids,
  223. "opening_bid": opening_bid,
  224. "status": status,
  225. "price": price,
  226. "auction_id": int(auction_id) if auction_id is not None else None,
  227. "auction_name": auction_name,
  228. }
  229. info_list.append(data_dict)
  230. if info_list and sql_pool is not None:
  231. sql_pool.insert_many(table=TABLE_NAME, data_list=info_list, ignore=True)
  232. return len(info_list)
  233. def crawl_one_auction(log, sql_pool, session, impersonate,
  234. auction_id, auction_name, max_page=460):
  235. """
  236. 抓取单个拍卖会的全部页(switch 到该 auction → 翻页直到无数据)。
  237. :param log: (loguru.Logger) 日志对象
  238. :param sql_pool: (MySQLConnectionPool) 数据库连接池
  239. :param session: (curl_cffi.requests.Session) 复用的会话对象
  240. :param impersonate: (str) curl_cffi 浏览器指纹标识
  241. :param auction_id: (str) 当前 session 切换到的 auction id
  242. :param auction_name: (str) 拍卖会名,写入数据库
  243. :param max_page: (int) 最大页码上限,作为兜底保护
  244. :return: (int) 该 auction 抓到的总条数
  245. """
  246. setup_auction_session(log, session, impersonate, auction_id)
  247. page = 1
  248. total = 0
  249. while page <= max_page:
  250. try:
  251. n = get_single_page(log, page, sql_pool, session, impersonate,
  252. auction_id=auction_id, auction_name=auction_name)
  253. except Exception as e:
  254. log.error(f"auction={auction_id} page={page} 抓取失败: {e}")
  255. break
  256. if n == 0:
  257. log.info(f"auction={auction_id} 翻到第 {page} 页无数据,结束")
  258. break
  259. total += n
  260. page += 1
  261. log.info(f"auction={auction_id}({auction_name}) 共抓取 {total} 条")
  262. return total
  263. @retry(stop=stop_after_attempt(5), wait=wait_fixed(2), after=after_log)
  264. def get_details(log, url, sql_pool, sql_id):
  265. """
  266. 获取详情页:分类、图片列表,写回数据库并把 state 置 1。
  267. :param log: (loguru.Logger) 日志对象
  268. :param url: (str) 详情页 URL(来自列表页 detail_url 字段)
  269. :param sql_pool: (MySQLConnectionPool) 数据库连接池
  270. :param sql_id: (int) 数据库记录主键 id
  271. :return: None
  272. :raises requests.HTTPError: 详情页非 2xx 时由 tenacity 重试,超限后抛出
  273. """
  274. log.info(f">>>>>>>>>>>>>> 正在爬取详情数据URL: {url} <<<<<<<<<<<<<<")
  275. response = requests.get(url, headers=headers,
  276. impersonate=random.choice(client_identifier_list),
  277. timeout=10, proxies=get_proxys(log))
  278. response.raise_for_status()
  279. selector = Selector(response.text)
  280. category = selector.xpath('//a[@id="MainContent_hCategory"]/text()').get()
  281. # 右侧主图 + 缩略图区。缩略图列表里包含主图链接,因此需要去重保序
  282. imgs = selector.xpath('//div[@class="col-md-5 col-sm-5"]//a[not(@id="Zoomer")]/@href').getall()
  283. imgs = list(dict.fromkeys(imgs)) if imgs else []
  284. imgs_str = ','.join(imgs) if imgs else None
  285. sql_pool.update_one_or_dict(
  286. table=TABLE_NAME,
  287. data={"category": category, "imgs": imgs_str, "state": 1},
  288. condition={"id": sql_id}
  289. )
  290. def update_details_for_pending(log, sql_pool):
  291. """
  292. 扫描库里 state != 1 的记录,逐条抓详情;详情失败置 state=2。
  293. :param log: (loguru.Logger) 日志对象
  294. :param sql_pool: (MySQLConnectionPool) 数据库连接池
  295. :return: None
  296. """
  297. log.debug('Updating detail pages ...........................')
  298. sql_result = sql_pool.select_all(f'select id, detail_url from {TABLE_NAME} where state != 1')
  299. for row in sql_result:
  300. sql_id, detail_url = row[0], row[1]
  301. try:
  302. get_details(log, detail_url, sql_pool, sql_id)
  303. except Exception as e:
  304. log.error(f'Error getting details for {detail_url}: {e}')
  305. sql_pool.update_one_or_dict(
  306. table=TABLE_NAME,
  307. data={"state": 2},
  308. condition={"id": sql_id}
  309. )