|
|
@@ -0,0 +1,298 @@
|
|
|
+# -*- coding: utf-8 -*-
|
|
|
+# Author : Charley
|
|
|
+# Python : 3.12.10
|
|
|
+# Date : 2026/5/21
|
|
|
+"""
|
|
|
+Memory Lane 公用模块:HTTP 配置、ASP.NET postback 切换 auction、单页解析、详情解析。
|
|
|
+被 meml_history.py / meml_spdier.py 复用。
|
|
|
+
|
|
|
+目标网站: https://bid.memorylaneinc.com/lots/gallery
|
|
|
+"""
|
|
|
+import random
|
|
|
+from curl_cffi import requests
|
|
|
+import user_agent
|
|
|
+from loguru import logger
|
|
|
+from parsel import Selector
|
|
|
+from curl_cffi.requests import BrowserType
|
|
|
+from tenacity import retry, stop_after_attempt, wait_fixed
|
|
|
+
|
|
|
# Gallery landing page; every listing request in this module goes to this URL.
GALLERY_URL = "https://bid.memorylaneinc.com/lots/gallery"

# All browser fingerprint identifiers exposed by curl_cffi's BrowserType enum.
client_identifier_list = [browser.value for browser in BrowserType]

# Default request headers shared by every request in this module.
# The User-Agent is generated once, when the module is imported.
headers = {
    "accept": (
        "text/html,application/xhtml+xml,application/xml;q=0.9,"
        "image/avif,image/webp,image/apng,*/*;q=0.8,"
        "application/signed-exchange;v=b3;q=0.7"
    ),
    "user-agent": user_agent.generate_user_agent(),
}
|
|
|
+
|
|
|
+
|
|
|
def after_log(retry_state):
    """Log the outcome of a single attempt (tenacity ``after`` callback).

    The logger is resolved in this order:

    1. the first positional argument of the wrapped call,
    2. a ``log`` keyword argument of the wrapped call,
    3. the module-level ``logger``.

    :param retry_state: tenacity call state; ``args``, ``kwargs``, ``outcome``,
        ``fn`` and ``attempt_number`` are read from it
    """
    if retry_state.args:
        log = retry_state.args[0]
    else:
        # The wrapped function may have been called as ``fn(log=...)``, in
        # which case the logger lives in kwargs rather than args.
        log = (getattr(retry_state, "kwargs", None) or {}).get("log")
        if log is None:
            log = logger

    if retry_state.outcome.failed:
        log.warning(f"Function '{retry_state.fn.__name__}', Attempt {retry_state.attempt_number} Times")
    else:
        log.info(f"Function '{retry_state.fn.__name__}', Attempt {retry_state.attempt_number} succeeded")
|
|
|
+
|
|
|
+
|
|
|
@retry(stop=stop_after_attempt(5), wait=wait_fixed(2), after=after_log)
def get_proxys(log):
    """Return the proxy mapping passed to outbound requests.

    The proxy URLs can be overridden with the ``MEML_HTTP_PROXY`` and
    ``MEML_HTTPS_PROXY`` environment variables. When they are unset the
    built-in default is used, so existing deployments keep working.

    :param log: logger object (unused here; kept so callers and the retry
        callback keep the same call shape)
    :return: dict with ``"http"`` and ``"https"`` proxy URLs
    """
    import os  # local import so this block does not depend on the file's import section

    # SECURITY: this default embeds proxy credentials in source code. Prefer
    # supplying them through the environment variables above and rotate this
    # account once every deployment has been migrated.
    default_proxy = "http://u1952150085001297:sJMHl4qc4bM0@proxy.123proxy.cn:36931"

    http_proxy = os.environ.get("MEML_HTTP_PROXY") or default_proxy
    # Without an explicit HTTPS override, reuse the HTTP proxy (the original
    # code used the same URL for both schemes).
    https_proxy = os.environ.get("MEML_HTTPS_PROXY") or http_proxy
    return {"http": http_proxy, "https": https_proxy}
|
|
|
+
|
|
|
+
|
|
|
+def _pick_hidden(selector, field_id):
|
|
|
+ """
|
|
|
+ 从页面提取 ASP.NET 隐藏字段(__VIEWSTATE 等)
|
|
|
+ :param selector: parsel.Selector 对象
|
|
|
+ :param field_id: 隐藏字段的 id,如 __VIEWSTATE
|
|
|
+ :return: 隐藏字段的值,失败返回空字符串
|
|
|
+ """
|
|
|
+ return selector.xpath(f'//input[@id="{field_id}"]/@value').get() or ''
|
|
|
+
|
|
|
+
|
|
|
def parse_auction_list(selector):
    """Collect every ``<option>`` of the ``Auction`` drop-down on a gallery page.

    Options without a ``value`` attribute are skipped. Ids and names are
    whitespace-stripped; a missing option text becomes an empty name.

    :param selector: parsel Selector for the gallery page
    :return: list of dicts such as
        ``[{"id": "-1", "name": "All Auctions"}, {"id": "162", "name": "..."}]``
    """
    auctions = []
    for option in selector.xpath('//select[@id="Auction"]/option'):
        value = option.xpath('./@value').get()
        label = option.xpath('./text()').get()
        if value is not None:
            auctions.append({
                "id": value.strip(),
                "name": label.strip() if label else '',
            })
    return auctions
|
|
|
+
|
|
|
+
|
|
|
@retry(stop=stop_after_attempt(3), wait=wait_fixed(2), after=after_log)
def get_auction_list(log, session, impersonate):
    """Fetch the gallery page and return the concrete auctions it lists.

    The "All Auctions" pseudo-entry (id ``"-1"``) is dropped from the result.

    :param log: logger object
    :param session: HTTP session used for the GET request
    :param impersonate: browser fingerprint identifier passed to the request
    :return: list of dicts, e.g. ``[{"id": "162", "name": "..."}]``
    :raises: whatever ``raise_for_status`` raises on an HTTP error status
    """
    log.info("获取全部拍卖会列表")
    proxy_map = get_proxys(log)
    page = session.get(
        GALLERY_URL,
        headers=headers,
        impersonate=impersonate,
        proxies=proxy_map,
        timeout=15,
    )
    page.raise_for_status()

    auctions = []
    for option in parse_auction_list(Selector(page.text)):
        # "-1" is the "All Auctions" entry; only concrete auctions are kept.
        if option["id"] == "-1":
            continue
        auctions.append(option)

    summary = [(item['id'], item['name']) for item in auctions]
    log.info(f"共解析到 {len(auctions)} 个拍卖会:{summary}")
    return auctions
|
|
|
+
|
|
|
+
|
|
|
@retry(stop=stop_after_attempt(5), wait=wait_fixed(2), after=after_log)
def setup_auction_session(log, session, impersonate, auction_id):
    """Switch the gallery's Auction filter to ``auction_id`` via a form postback.

    The function GETs the gallery page, copies its ASP.NET hidden fields into
    a form that names ``ctl00$Auction`` as the event target, POSTs that form,
    and then checks which option the response marks as selected.

    Per the original author's note, the server-side session is expected to
    remember the selection for later ``GET /lots/gallery?page=N`` requests.

    :param log: logger object
    :param session: HTTP session used for both requests
    :param impersonate: browser fingerprint identifier passed to the requests
    :param auction_id: ``'-1'`` (All Auctions) or a concrete id such as ``'162'``
    :raises RuntimeError: if the selected option in the response differs from
        ``auction_id``
    """
    log.info(f"切换 Auction -> {auction_id}")
    proxies = get_proxys(log)
    target = str(auction_id)

    # Step 1: load the page to obtain the current hidden-field values.
    landing = session.get(GALLERY_URL, headers=headers, impersonate=impersonate,
                          proxies=proxies, timeout=15)
    landing.raise_for_status()
    landing_sel = Selector(landing.text)

    hidden_names = ('__VIEWSTATE', '__VIEWSTATEGENERATOR', '__EVENTVALIDATION')
    form_data = {
        '__EVENTTARGET': 'ctl00$Auction',
        '__EVENTARGUMENT': '',
        '__LASTFOCUS': '',
        **{name: _pick_hidden(landing_sel, name) for name in hidden_names},
        'ctl00$SearchIn': 'title',
        'ctl00$SearchText': '',
        'ctl00$BrowseBy': 'gallery',
        'ctl00$Auction': target,
    }

    post_headers = dict(headers)
    post_headers.update({
        'Content-Type': 'application/x-www-form-urlencoded',
        'Referer': GALLERY_URL,
        'Origin': 'https://bid.memorylaneinc.com',
    })

    # Step 2: submit the postback that changes the drop-down selection.
    posted = session.post(GALLERY_URL, headers=post_headers, data=form_data,
                          impersonate=impersonate, proxies=proxies, timeout=20)
    posted.raise_for_status()

    # Step 3: confirm the response shows the requested auction as selected.
    selected_val = Selector(posted.text).xpath(
        '//select[@id="Auction"]/option[@selected]/@value').get()
    log.info(f"切换后 Auction 选中值: {selected_val}")
    if selected_val == target:
        return
    raise RuntimeError(f"切换 Auction 失败,预期 {auction_id} 实际 {selected_val}")
|
|
|
+
|
|
|
+
|
|
|
@retry(stop=stop_after_attempt(5), wait=wait_fixed(2), after=after_log)
def get_single_page(log, page, sql_pool, session, impersonate,
                    auction_id=None, auction_name=None):
    """Fetch one gallery page, parse its lot cards and store them.

    :param log: logger object
    :param page: page number, sent as the ``page`` query parameter
    :param sql_pool: database pool; when not ``None`` the parsed rows are
        inserted into ``memory_lane_record``
    :param session: HTTP session used for the GET request
    :param impersonate: browser fingerprint identifier passed to the request
    :param auction_id: auction id stored with each row (converted with ``int``)
    :param auction_name: auction name stored with each row
    :return: number of lot cards parsed from the page (0 when none are found)
    """
    log.info(f">>>>>>>>>>>>>> 正在爬取 auction={auction_id}({auction_name}) 第 {page} 页 <<<<<<<<<<<<<<")

    response = session.get(
        GALLERY_URL,
        impersonate=impersonate,
        headers=headers,
        params={"page": f"{page}"},
        proxies=get_proxys(log),
        timeout=10,
        allow_redirects=False,
    )
    response.raise_for_status()

    cards = Selector(response.text).xpath(
        '//div[@class="items"]/div/div[@class="row"]//div[@class="col-lg-3 col-md-4 col-sm-6"]')
    if not cards:
        log.warning(f"--------------- 第 {page} 页无数据 ---------------")
        return 0

    def _amount(text, prefix):
        # Remove the given prefix and thousands separators; empty/missing -> None.
        if not text:
            return None
        return text.replace(prefix, '').replace(',', '').strip()

    # The same value is stored with every row, so convert it once.
    numeric_auction_id = None if auction_id is None else int(auction_id)

    info_list = []
    for card in cards:
        # The <strong> texts are read positionally: bids, opening bid, status.
        strong_texts = card.xpath('.//div/p[2]/strong/text()').getall()
        bids, opening_raw, status = (strong_texts + [None, None, None])[:3]
        current_raw = card.xpath('.//div[@class="item-price"]/a/text()').get()

        info_list.append({
            "title": card.xpath('.//p/a/text()').get(),
            "detail_url": card.xpath('.//p/a/@href').get(),
            "bids": bids,
            "opening_bid": _amount(opening_raw, '$'),
            "status": status,
            "current_bid": _amount(current_raw, 'CURRENT BID $'),
            "auction_id": numeric_auction_id,
            "auction_name": auction_name,
        })

    if info_list and sql_pool is not None:
        sql_pool.insert_many(table="memory_lane_record", data_list=info_list, ignore=True)
    return len(info_list)
|
|
|
+
|
|
|
+
|
|
|
def crawl_one_auction(log, sql_pool, session, impersonate,
                      auction_id, auction_name, max_page=460):
    """Crawl every page of one auction and return how many rows were parsed.

    The session is first switched to ``auction_id``. Pages are then fetched
    from 1 upwards until a page yields no rows, a page fails (after its own
    retries), or ``max_page`` has been fetched.

    :param log: logger object
    :param sql_pool: database pool forwarded to ``get_single_page``
    :param session: HTTP session shared by all requests
    :param impersonate: browser fingerprint identifier passed to the requests
    :param auction_id: auction id to switch to and to store with each row
    :param auction_name: auction name to store with each row
    :param max_page: highest page number that will be requested
    :return: total number of rows parsed for this auction
    """
    setup_auction_session(log, session, impersonate, auction_id)

    total = 0
    for page in range(1, max_page + 1):
        try:
            fetched = get_single_page(log, page, sql_pool, session, impersonate,
                                      auction_id=auction_id, auction_name=auction_name)
        except Exception as e:
            # A failing page ends this auction; rows counted so far are still reported.
            log.error(f"auction={auction_id} page={page} 抓取失败: {e}")
            break
        if fetched == 0:
            log.info(f"auction={auction_id} 翻到第 {page} 页无数据,结束")
            break
        total += fetched

    log.info(f"auction={auction_id}({auction_name}) 共抓取 {total} 条")
    return total
|
|
|
+
|
|
|
+
|
|
|
@retry(stop=stop_after_attempt(5), wait=wait_fixed(2), after=after_log)
def get_details(log, url, sql_pool, sql_id):
    """Fetch a lot detail page and write its category and image links back.

    Each call uses a randomly chosen browser fingerprint. On success the row
    identified by ``sql_id`` gets ``category``, ``imgs`` (comma-joined hrefs,
    or ``None`` when there are none) and ``state = 1``. The description is
    not extracted.

    :param log: logger object
    :param url: detail page URL
    :param sql_pool: database pool used for the update
    :param sql_id: primary key of the ``memory_lane_record`` row to update
    """
    log.info(f">>>>>>>>>>>>>> 正在爬取详情数据URL: {url} <<<<<<<<<<<<<<")

    fingerprint = random.choice(client_identifier_list)
    response = requests.get(url, headers=headers, impersonate=fingerprint,
                            timeout=10, proxies=get_proxys(log))
    response.raise_for_status()

    page = Selector(response.text)
    # Collect the anchors inside the image column, excluding the one with id "Zoomer".
    image_links = page.xpath(
        '//div[@class="col-md-5 col-sm-5"]//a[not(@id="Zoomer")]/@href').getall()

    update_fields = {
        "category": page.xpath('//a[@id="MainContent_hCategory"]/text()').get(),
        "imgs": ','.join(image_links) if image_links else None,
        "state": 1,
    }
    sql_pool.update_one_or_dict(
        table="memory_lane_record",
        data=update_fields,
        condition={"id": sql_id},
    )
|
|
|
+
|
|
|
+
|
|
|
def update_details_for_pending(log, sql_pool):
    """Fetch detail data for every record whose ``state`` is not 1.

    Rows are processed in ``id`` order. When ``get_details`` raises (after
    its own retries) the error is logged and the row is marked ``state = 2``.
    Because the query selects ``state != 1``, such rows are picked up again
    on the next run.

    :param log: logger object
    :param sql_pool: database pool used for the query and the updates
    """
    log.debug('Updating detail pages ...........................')
    pending = sql_pool.select_all(
        'select id, detail_url from memory_lane_record where state != 1 order by id')

    # Guard against a pool that returns None rather than an empty sequence
    # when nothing matches; iterating None would raise TypeError.
    for row in pending or ():
        sql_id, detail_url = row[0], row[1]
        try:
            get_details(log, detail_url, sql_pool, sql_id)
        except Exception as e:
            log.error(f'Error getting details for {detail_url}: {e}')
            # Record the failure so the row can be told apart from untouched ones.
            sql_pool.update_one_or_dict(
                table="memory_lane_record",
                data={"state": 2},
                condition={"id": sql_id},
            )
|