lelands_core.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322
  1. # -*- coding: utf-8 -*-
  2. # Author : Charley
  3. # Python : 3.12.10
  4. # Date : 2026/5/21
  5. """
  6. Lelands 公用模块:HTTP 配置、ASP.NET postback 切换 auction、单页解析、详情解析。
  7. 被 lelands_history.py / lelands_spider.py 复用。
  8. """
  9. import random
  10. import re
  11. import user_agent
  12. from loguru import logger
  13. from parsel import Selector
  14. from curl_cffi import requests
  15. from curl_cffi.requests import BrowserType
  16. from tenacity import retry, stop_after_attempt, wait_fixed
# Gallery entry page; the auction list, the postback and the paged listing
# requests in this module are all sent to this URL.
GALLERY_URL = "https://auction.lelands.com/lots/gallery/"
# Use every browser fingerprint that the library ships with.
client_identifier_list = [b.value for b in BrowserType]
# Default headers passed to every request made by this module. The user-agent
# is generated once, when the module is imported, and reused afterwards.
# NOTE(review): the user-agent is random while the `impersonate` fingerprint is
# chosen separately, so the two may describe different browsers - confirm this
# is intended.
headers = {
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "user-agent": user_agent.generate_user_agent()
}
  24. def after_log(retry_state):
  25. """tenacity retry 回调"""
  26. if retry_state.args and len(retry_state.args) > 0:
  27. log = retry_state.args[0]
  28. else:
  29. log = logger
  30. if retry_state.outcome.failed:
  31. log.warning(f"Function '{retry_state.fn.__name__}', Attempt {retry_state.attempt_number} Times")
  32. else:
  33. log.info(f"Function '{retry_state.fn.__name__}', Attempt {retry_state.attempt_number} succeeded")
  34. @retry(stop=stop_after_attempt(5), wait=wait_fixed(2), after=after_log)
  35. def get_proxys(log):
  36. """
  37. 获取代理
  38. :param log: logger 对象
  39. :return: 代理字典
  40. """
  41. http_proxy = "http://u1952150085001297:sJMHl4qc4bM0@proxy.123proxy.cn:36931"
  42. https_proxy = "http://u1952150085001297:sJMHl4qc4bM0@proxy.123proxy.cn:36931"
  43. try:
  44. return {"http": http_proxy, "https": https_proxy}
  45. except Exception as e:
  46. log.error(f"Error getting proxy: {e}")
  47. raise e
  48. def extract_auction(description, log=logger):
  49. """
  50. 从 description 列表中提取 Auction 字段(双引号内的内容)
  51. :param description: selector.getall() 返回的字符串列表
  52. :param log: logger 对象
  53. :return: 提取到的 auction 字符串,失败返回 None
  54. """
  55. try:
  56. if not description or not isinstance(description, list):
  57. return None
  58. for item in description:
  59. if not item or not isinstance(item, str):
  60. continue
  61. text = item.strip()
  62. if not text:
  63. continue
  64. match = re.search(r'"(.+?)"', text)
  65. if match:
  66. auction = match.group(1).strip()
  67. return auction if auction else None
  68. return None
  69. except Exception as e:
  70. log.error(f"extract_auction error: {e}")
  71. return None
  72. def _pick_hidden(selector, field_id):
  73. """
  74. 从页面提取 ASP.NET 隐藏字段(__VIEWSTATE 等)
  75. :param selector: parsel.Selector 对象
  76. :param field_id: 隐藏字段的 id,如 __VIEWSTATE
  77. :return: 隐藏字段的值,失败返回空字符串
  78. """
  79. return selector.xpath(f'//input[@id="{field_id}"]/@value').get() or ''
  80. def parse_auction_list(selector):
  81. """
  82. 从 gallery 页面解析所有拍卖会下拉项
  83. :param selector: parsel.Selector 对象
  84. :return: [{"id": "-1", "name": "All Auctions"}, {"id": "1005", "name": "2026 Spring Classic"}, ...]
  85. """
  86. options = selector.xpath('//select[@id="Auction"]/option')
  87. result = []
  88. for opt in options:
  89. aid = opt.xpath('./@value').get()
  90. name = opt.xpath('./text()').get()
  91. if aid is None:
  92. continue
  93. result.append({"id": aid.strip(), "name": (name or '').strip()})
  94. return result
  95. @retry(stop=stop_after_attempt(3), wait=wait_fixed(2), after=after_log)
  96. def get_auction_list(log, session, impersonate):
  97. """
  98. GET gallery 首页,解析出全部拍卖会列表(排除 -1 All Auctions)
  99. :param log: logger 对象
  100. :param session: requests.Session 对象
  101. :param impersonate: 浏览器指纹标识(与 setup 时一致)
  102. :return: [{"id": "1005", "name": "2026 Spring Classic"}, ...]
  103. """
  104. log.info("获取全部拍卖会列表")
  105. resp = session.get(GALLERY_URL, headers=headers, impersonate=impersonate,
  106. proxies=get_proxys(log), timeout=15)
  107. resp.raise_for_status()
  108. sel = Selector(resp.text)
  109. all_opts = parse_auction_list(sel)
  110. # 过滤掉 All Auctions(-1),只保留具体拍卖会
  111. real = [o for o in all_opts if o["id"] != "-1"]
  112. log.info(f"共解析到 {len(real)} 个拍卖会:{[(o['id'], o['name']) for o in real]}")
  113. return real
  114. @retry(stop=stop_after_attempt(5), wait=wait_fixed(2), after=after_log)
  115. def setup_auction_session(log, session, impersonate, auction_id):
  116. """
  117. 通过 ASP.NET __doPostBack 将 Auction 筛选切换到指定 auction_id。
  118. 切换后服务端 session 记住该选择,后续 GET /lots/gallery?page=N 都返回该 auction 数据。
  119. :param log: logger 对象
  120. :param session: requests.Session 对象
  121. :param impersonate: 浏览器指纹标识(与 setup 时一致)
  122. :param auction_id: '-1'(All Auctions) 或具体 id 如 '1005'
  123. """
  124. log.info(f"切换 Auction -> {auction_id}")
  125. proxies = get_proxys(log)
  126. # 1) 首次 GET 拿 ViewState
  127. resp = session.get(GALLERY_URL, headers=headers, impersonate=impersonate,
  128. proxies=proxies, timeout=15)
  129. resp.raise_for_status()
  130. sel = Selector(resp.text)
  131. form_data = {
  132. '__EVENTTARGET': 'ctl00$Auction',
  133. '__EVENTARGUMENT': '',
  134. '__LASTFOCUS': '',
  135. '__VIEWSTATE': _pick_hidden(sel, '__VIEWSTATE'),
  136. '__VIEWSTATEGENERATOR': _pick_hidden(sel, '__VIEWSTATEGENERATOR'),
  137. '__EVENTVALIDATION': _pick_hidden(sel, '__EVENTVALIDATION'),
  138. 'ctl00$SearchIn': 'title',
  139. 'ctl00$SearchText': '',
  140. 'ctl00$BrowseBy': 'gallery',
  141. 'ctl00$Auction': str(auction_id),
  142. }
  143. post_headers = {
  144. **headers,
  145. 'Content-Type': 'application/x-www-form-urlencoded',
  146. 'Referer': GALLERY_URL,
  147. 'Origin': 'https://auction.lelands.com',
  148. }
  149. resp = session.post(GALLERY_URL, headers=post_headers, data=form_data,
  150. impersonate=impersonate, proxies=proxies, timeout=20)
  151. resp.raise_for_status()
  152. # 验证切换是否成功
  153. sel2 = Selector(resp.text)
  154. selected_val = sel2.xpath('//select[@id="Auction"]/option[@selected]/@value').get()
  155. log.info(f"切换后 Auction 选中值: {selected_val}")
  156. if selected_val != str(auction_id):
  157. raise RuntimeError(f"切换 Auction 失败,预期 {auction_id} 实际 {selected_val}")
  158. @retry(stop=stop_after_attempt(5), wait=wait_fixed(2), after=after_log)
  159. def get_single_page(log, page, sql_pool, session, impersonate,
  160. auction_id=None, auction_name=None):
  161. """
  162. 获取单页数据
  163. :param log: logger 对象
  164. :param page: 页码
  165. :param sql_pool: mysql连接池
  166. :param session: requests.Session 对象
  167. :param impersonate: 浏览器指纹标识(与 setup 时一致)
  168. :param auction_id: 当前 session 切换到的 auction id,会写入 lelands_record.auction_id
  169. :param auction_name: 同上,写入 lelands_record.auction_name
  170. :return: 该页解析到的条数
  171. """
  172. log.info(f">>>>>>>>>>>>>> 正在爬取 auction={auction_id}({auction_name}) 第 {page} 页 <<<<<<<<<<<<<<")
  173. response = session.get(GALLERY_URL, impersonate=impersonate, headers=headers,
  174. params={"page": f"{page}"},
  175. proxies=get_proxys(log), timeout=10, allow_redirects=False)
  176. response.raise_for_status()
  177. selector = Selector(response.text)
  178. tag_div_list = selector.xpath(
  179. '//div[@class="items"]/div/div[@class="row"]//div[@class="col-lg-3 col-md-4 col-sm-6"]')
  180. if not tag_div_list or len(tag_div_list) == 0:
  181. log.warning(f"--------------- 第 {page} 页无数据 ---------------")
  182. return 0
  183. info_list = []
  184. for tag_div in tag_div_list:
  185. title = tag_div.xpath('.//p/a/text()').get()
  186. detail_url = tag_div.xpath('.//p/a/@href').get()
  187. tag_div_p = tag_div.xpath('.//div/p[2]/strong/text()').getall()
  188. bids = tag_div_p[0] if tag_div_p else None
  189. opening_bid = tag_div_p[1] if len(tag_div_p) > 1 else None
  190. opening_bid = opening_bid.replace('$', '').replace(',', '').strip() if opening_bid else None
  191. status = tag_div_p[2] if len(tag_div_p) > 2 else None
  192. price = tag_div.xpath('.//div[@class="item-price"]/a/text()').get()
  193. price = price.replace('SOLD FOR $', '').replace(',', '').strip() if price else None
  194. data_dict = {
  195. "title": title,
  196. "detail_url": detail_url,
  197. "bids": bids,
  198. "opening_bid": opening_bid,
  199. "status": status,
  200. "price": price,
  201. "auction_id": int(auction_id) if auction_id is not None else None,
  202. "auction_name": auction_name,
  203. }
  204. info_list.append(data_dict)
  205. if info_list and sql_pool is not None:
  206. sql_pool.insert_many(table="lelands_record", data_list=info_list, ignore=True)
  207. return len(info_list)
  208. def crawl_one_auction(log, sql_pool, session, impersonate,
  209. auction_id, auction_name, max_page=460):
  210. """
  211. 抓取单个拍卖会的全部页(switch 到该 auction → 翻页直到无数据)
  212. :param log: logger 对象
  213. :param sql_pool: mysql连接池
  214. :param session: requests.Session 对象
  215. :param impersonate: 浏览器指纹标识(与 setup 时一致)
  216. :param auction_id: 当前 session 切换到的 auction id,会写入 lelands_record.auction_id
  217. :param auction_name: 同上,写入 lelands_record.auction_name
  218. :param max_page: 最大页码
  219. :return: 该 auction 抓到的总条数
  220. """
  221. setup_auction_session(log, session, impersonate, auction_id)
  222. page = 1
  223. total = 0
  224. while page <= max_page:
  225. try:
  226. n = get_single_page(log, page, sql_pool, session, impersonate,
  227. auction_id=auction_id, auction_name=auction_name)
  228. except Exception as e:
  229. log.error(f"auction={auction_id} page={page} 抓取失败: {e}")
  230. break
  231. if n == 0:
  232. log.info(f"auction={auction_id} 翻到第 {page} 页无数据,结束")
  233. break
  234. total += n
  235. page += 1
  236. log.info(f"auction={auction_id}({auction_name}) 共抓取 {total} 条")
  237. return total
  238. @retry(stop=stop_after_attempt(5), wait=wait_fixed(2), after=after_log)
  239. def get_details(log, url, sql_pool, sql_id):
  240. """
  241. 获取详情页:分类、auction 名称、图片列表,写回数据库
  242. :param log: logger 对象
  243. :param url: 详情页 URL
  244. :param sql_pool: mysql连接池
  245. :param sql_id: 数据库记录 ID
  246. """
  247. log.info(f">>>>>>>>>>>>>> 正在爬取详情数据URL: {url} <<<<<<<<<<<<<<")
  248. response = requests.get(url, headers=headers,
  249. impersonate=random.choice(client_identifier_list),
  250. timeout=10, proxies=get_proxys(log))
  251. response.raise_for_status()
  252. selector = Selector(response.text)
  253. category = selector.xpath('//a[@id="MainContent_hCategory"]/text()').get()
  254. # description = selector.xpath('//*[@id="MainContent_lblOldAuction"]/text()').getall()
  255. # auction = extract_auction(description, log)
  256. imgs = selector.xpath('//div[@class="col-md-5 col-sm-5"]//a[not(@id="Zoomer")]/@href').getall()
  257. imgs = ','.join(imgs) if imgs else None
  258. sql_pool.update_one_or_dict(
  259. table="lelands_record",
  260. data={"category": category, "imgs": imgs, "state": 1},
  261. condition={"id": sql_id}
  262. )
  263. def update_details_for_pending(log, sql_pool):
  264. """
  265. 扫描库里 state != 1 的记录,逐条抓详情
  266. :param log: logger 对象
  267. :param sql_pool: mysql连接池
  268. """
  269. log.debug('Updating detail pages ...........................')
  270. sql_result = sql_pool.select_all('select id, detail_url from lelands_record where state != 1')
  271. for row in sql_result:
  272. sql_id, detail_url = row[0], row[1]
  273. try:
  274. get_details(log, detail_url, sql_pool, sql_id)
  275. except Exception as e:
  276. log.error(f'Error getting details for {detail_url}: {e}')
  277. sql_pool.update_one_or_dict(
  278. table="lelands_record",
  279. data={"state": 2},
  280. condition={"id": sql_id}
  281. )