|
@@ -2,200 +2,89 @@
|
|
|
# Author : Charley
|
|
# Author : Charley
|
|
|
# Python : 3.12.10
|
|
# Python : 3.12.10
|
|
|
# Date : 2026/5/13 15:54
|
|
# Date : 2026/5/13 15:54
|
|
|
-import random
|
|
|
|
|
|
|
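"""
Lelands incremental crawler.

The scheduler fires daily, but the crawl itself only runs on the 1st and 15th
of each month (plus once at start-up).

Flow:
    1. GET the home page and parse every auction id currently on the site.
    2. Query ``select distinct auction_id from lelands_record`` to get the
       auctions that have already been crawled.
    3. New auctions = ids on the site minus ids in the database.
    4. No new auctions -> skip the list crawl for this round.
    5. For each new auction: switch to it via postback -> paginate -> write
       to the database.
    6. Re-fetch detail pages for records whose state != 1.
"""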
|
|
|
import time
|
|
import time
|
|
|
|
|
+import random
|
|
|
import inspect
|
|
import inspect
|
|
|
import schedule
|
|
import schedule
|
|
|
from curl_cffi import requests
|
|
from curl_cffi import requests
|
|
|
-import user_agent
|
|
|
|
|
from loguru import logger
|
|
from loguru import logger
|
|
|
-from parsel import Selector
|
|
|
|
|
-from curl_cffi.requests import BrowserType
|
|
|
|
|
-from mysql_pool import MySQLConnectionPool
|
|
|
|
|
from tenacity import retry, stop_after_attempt, wait_fixed
|
|
from tenacity import retry, stop_after_attempt, wait_fixed
|
|
|
|
|
|
|
|
-"""
|
|
|
|
|
-目标网站:https://auction.lelands.com/lots/gallery/?page=3
|
|
|
|
|
-"""
|
|
|
|
|
|
|
+from mysql_pool import MySQLConnectionPool
|
|
|
|
|
+from lelands_core import (
|
|
|
|
|
+ client_identifier_list,
|
|
|
|
|
+ crawl_one_auction,
|
|
|
|
|
+ get_auction_list,
|
|
|
|
|
+ update_details_for_pending,
|
|
|
|
|
+ after_log,
|
|
|
|
|
+)
|
|
|
|
|
|
|
|
logger.remove()
|
|
logger.remove()
|
|
|
logger.add("./logs/{time:YYYYMMDD}.log", encoding='utf-8', rotation="00:00",
|
|
logger.add("./logs/{time:YYYYMMDD}.log", encoding='utf-8', rotation="00:00",
|
|
|
format="[{time:YYYY-MM-DD HH:mm:ss.SSS}] {level} {message}",
|
|
format="[{time:YYYY-MM-DD HH:mm:ss.SSS}] {level} {message}",
|
|
|
level="DEBUG", retention="7 day")
|
|
level="DEBUG", retention="7 day")
|
|
|
|
|
|
|
|
-# 直接用库内置的所有浏览器类型,不用手动维护列表
|
|
|
|
|
-client_identifier_list = [b.value for b in BrowserType]
|
|
|
|
|
-
|
|
|
|
|
-headers = {
|
|
|
|
|
- "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
|
|
|
|
|
- # "accept-language": "en,zh-CN;q=0.9,zh;q=0.8",
|
|
|
|
|
- "user-agent": user_agent.generate_user_agent()
|
|
|
|
|
-}
|
|
|
|
|
-
|
|
|
|
|
-
|
|
|
|
|
-def after_log(retry_state):
|
|
|
|
|
- """
|
|
|
|
|
- retry 回调
|
|
|
|
|
- :param retry_state: RetryCallState 对象
|
|
|
|
|
- """
|
|
|
|
|
- # 检查 args 是否存在且不为空
|
|
|
|
|
- if retry_state.args and len(retry_state.args) > 0:
|
|
|
|
|
- log = retry_state.args[0] # 获取传入的 logger
|
|
|
|
|
- else:
|
|
|
|
|
- log = logger # 使用全局 logger
|
|
|
|
|
-
|
|
|
|
|
- if retry_state.outcome.failed:
|
|
|
|
|
- log.warning(
|
|
|
|
|
- f"Function '{retry_state.fn.__name__}', Attempt {retry_state.attempt_number} Times")
|
|
|
|
|
- else:
|
|
|
|
|
- log.info(f"Function '{retry_state.fn.__name__}', Attempt {retry_state.attempt_number} succeeded")
|
|
|
|
|
-
|
|
|
|
|
-
|
|
|
|
|
-@retry(stop=stop_after_attempt(5), wait=wait_fixed(2), after=after_log)
|
|
|
|
|
-def get_proxys(log):
|
|
|
|
|
- http_proxy = "http://u1952150085001297:sJMHl4qc4bM0@proxy.123proxy.cn:36931"
|
|
|
|
|
- https_proxy = "http://u1952150085001297:sJMHl4qc4bM0@proxy.123proxy.cn:36931"
|
|
|
|
|
-
|
|
|
|
|
- try:
|
|
|
|
|
- proxySettings = {
|
|
|
|
|
- "http": http_proxy,
|
|
|
|
|
- "https": https_proxy,
|
|
|
|
|
- }
|
|
|
|
|
- return proxySettings
|
|
|
|
|
- except Exception as e:
|
|
|
|
|
- log.error(f"Error getting proxy: {e}")
|
|
|
|
|
- raise e
|
|
|
|
|
-
|
|
|
|
|
|
|
|
|
|
-@retry(stop=stop_after_attempt(5), wait=wait_fixed(2), after=after_log)
|
|
|
|
|
-def get_details(log, url, sql_pool, sql_id):
|
|
|
|
|
- """
|
|
|
|
|
- 获取详情数据
|
|
|
|
|
- :param log: logger对象
|
|
|
|
|
- :param url: 详情页URL
|
|
|
|
|
- :param sql_pool: MySQL连接池
|
|
|
|
|
- :param sql_id: 数据ID
|
|
|
|
|
- :return: 标题和描述
|
|
|
|
|
- """
|
|
|
|
|
- log.info(f">>>>>>>>>>>>>> 正在爬取详情数据URL: {url} <<<<<<<<<<<<<<")
|
|
|
|
|
- # url = 'https://auction.lelands.com/bids/bidplace?itemid=133680'
|
|
|
|
|
- response = requests.get(url, headers=headers, impersonate=random.choice(client_identifier_list), timeout=10,
|
|
|
|
|
- proxies=get_proxys(log))
|
|
|
|
|
- response.raise_for_status()
|
|
|
|
|
- selector = Selector(response.text)
|
|
|
|
|
- category = selector.xpath('//a[@id="MainContent_hCategory"]/text()').get()
|
|
|
|
|
- description = selector.xpath('//*[@id="MainContent_lblOldAuction"]/text()').getall()
|
|
|
|
|
- description = ' '.join(description).strip() if description else None
|
|
|
|
|
- # imgs = selector.xpath('//div[@class="col-md-5 col-sm-5"]//a[@class="MagicThumb-swap"]/@href').getall()
|
|
|
|
|
- imgs = selector.xpath('//div[@class="col-md-5 col-sm-5"]//a[not(@id="Zoomer")]/@href').getall()
|
|
|
|
|
- imgs = ','.join(imgs) if imgs else None
|
|
|
|
|
- # print(category, description, imgs)
|
|
|
|
|
-
|
|
|
|
|
- # 更新数据和状态
|
|
|
|
|
- sql_pool.update_one_or_dict(
|
|
|
|
|
- table="lelands_record",
|
|
|
|
|
- data={"category": category, "description": description, "imgs": imgs, "state": 1},
|
|
|
|
|
- condition={"id": sql_id}
|
|
|
|
|
|
|
def get_existing_auction_ids(log, sql_pool):
    """Return the set of auction ids already stored in ``lelands_record``.

    :param log: logger-like object; only ``info`` is called.
    :param sql_pool: object exposing ``select_all(sql)``. Each returned row is
        indexed positionally and column 0 is read as the auction id.
    :return: set of auction ids as strings; empty when the query yields no
        rows (or a falsy result).
    """
    rows = sql_pool.select_all(
        "select distinct auction_id from lelands_record where auction_id is not null"
    )
    # Ids are normalised to str so later membership tests do not depend on
    # the type the database driver returns for the column.
    ids = {str(row[0]) for row in rows} if rows else set()
    log.info(f"库中已存在 {len(ids)} 个 auction_id: {sorted(ids)}")
    return ids
|
|
|
|
|
|
|
|
|
|
|
|
|
-@retry(stop=stop_after_attempt(5), wait=wait_fixed(2), after=after_log)
|
|
|
|
|
-def get_single_page(log, page, sql_pool):
|
|
|
|
|
- """
|
|
|
|
|
- 获取单页数据
|
|
|
|
|
- :param log: logger对象
|
|
|
|
|
- :param page: 页码
|
|
|
|
|
- :param sql_pool: MySQL连接池
|
|
|
|
|
- :return: 该页数据条数
|
|
|
|
|
- """
|
|
|
|
|
- log.info(f">>>>>>>>>>>>>> 正在爬取第 {page} 页数据 <<<<<<<<<<<<<<")
|
|
|
|
|
-
|
|
|
|
|
- url = "https://auction.lelands.com/lots/gallery/"
|
|
|
|
|
- params = {
|
|
|
|
|
- # "page": "1"
|
|
|
|
|
- "page": f"{page}"
|
|
|
|
|
- }
|
|
|
|
|
- # response = requests.get(url, headers=headers, params=params, timeout=22, proxies=get_proxys(log))
|
|
|
|
|
- with requests.Session() as session:
|
|
|
|
|
- response = session.get(url, impersonate=random.choice(client_identifier_list), headers=headers, params=params,
|
|
|
|
|
- proxies=get_proxys(log), timeout=10, allow_redirects=False)
|
|
|
|
|
- # print(response.text)
|
|
|
|
|
- # print(response)
|
|
|
|
|
- response.raise_for_status()
|
|
|
|
|
-
|
|
|
|
|
- selector = Selector(response.text)
|
|
|
|
|
- # 实际加载内容有变化,需要调整 XPath 表达式
|
|
|
|
|
- # tag_div_list = selector.xpath('//div[@class="col-md-9 col-sm-9"]/div[@class="col-lg-3 col-md-4 col-sm-6"]')
|
|
|
|
|
- tag_div_list = selector.xpath(
|
|
|
|
|
- '//div[@class="items"]/div/div[@class="row"]//div[@class="col-lg-3 col-md-4 col-sm-6"]')
|
|
|
|
|
- # print('tag_div_list:', tag_div_list)
|
|
|
|
|
- info_list = []
|
|
|
|
|
- for tag_div in tag_div_list:
|
|
|
|
|
- title = tag_div.xpath('.//p/a/text()').get()
|
|
|
|
|
- detail_url = tag_div.xpath('.//p/a/@href').get()
|
|
|
|
|
- # img = tag_div.xpath('.//div[@class="item-image"]/a/img/@src').get()
|
|
|
|
|
-
|
|
|
|
|
- tag_div_p = tag_div.xpath('.//div/p[2]/strong/text()').getall()
|
|
|
|
|
- bids = tag_div_p[0] if tag_div_p else None
|
|
|
|
|
- opening_bid = tag_div_p[1] if len(tag_div_p) > 1 else None
|
|
|
|
|
- opening_bid = opening_bid.replace('$', '').replace(',', '').strip() if opening_bid else None
|
|
|
|
|
-
|
|
|
|
|
- status = tag_div_p[2] if len(tag_div_p) > 2 else None
|
|
|
|
|
- price = tag_div.xpath('.//div[@class="item-price"]/a/text()').get()
|
|
|
|
|
- price = price.replace('SOLD FOR $', '').replace(',', '').strip() if price else None
|
|
|
|
|
-
|
|
|
|
|
- data_dict = {
|
|
|
|
|
- "title": title,
|
|
|
|
|
- "detail_url": detail_url,
|
|
|
|
|
- # "img": img,
|
|
|
|
|
- "bids": bids,
|
|
|
|
|
- "opening_bid": opening_bid,
|
|
|
|
|
- "status": status,
|
|
|
|
|
- "price": price
|
|
|
|
|
- }
|
|
|
|
|
- # print('data_dict:', data_dict)
|
|
|
|
|
- info_list.append(data_dict)
|
|
|
|
|
-
|
|
|
|
|
- # 保存数据到数据库
|
|
|
|
|
- if info_list:
|
|
|
|
|
- sql_pool.insert_many(table="lelands_record", data_list=info_list, ignore=True)
|
|
|
|
|
- return len(info_list)
|
|
|
|
|
-
|
|
|
|
|
|
|
def diff_new_auctions(log, all_auctions, existing_ids):
    """Select the auctions parsed from the home page that are not stored yet.

    Ids are compared as strings on both sides, so an auction whose id was
    parsed as an ``int`` still matches a stored id held as ``str``.

    :param log: logger-like object; only ``info`` is called.
    :param all_auctions: iterable of dicts, each with ``"id"`` and ``"name"``
        keys; ``None`` or empty is treated as "no auctions".
    :param existing_ids: collection of auction ids already crawled.
    :return: list of the original auction dicts (input order preserved) whose
        id is not in ``existing_ids``.
    """
    known = {str(stored) for stored in existing_ids} if existing_ids else set()
    new_list = [a for a in (all_auctions or []) if str(a["id"]) not in known]
    log.info(f"新增待抓取 auction 数: {len(new_list)} -> {[(a['id'], a['name']) for a in new_list]}")
    return new_list
|
|
|
|
|
|
|
|
-def get_sold_list(log, sql_pool):
|
|
|
|
|
- """
|
|
|
|
|
- 获取已售列表
|
|
|
|
|
- :param log: logger对象
|
|
|
|
|
- :param sql_pool: MySQL连接池
|
|
|
|
|
- :return: 无
|
|
|
|
|
- """
|
|
|
|
|
- page = 1
|
|
|
|
|
- max_page = 10
|
|
|
|
|
|
|
|
|
|
- while page <= max_page:
|
|
|
|
|
|
|
def run_incremental(log, sql_pool):
    """Crawl every auction that is on the site but not yet in the database.

    One browser fingerprint is chosen per run and one HTTP session is shared
    by the auction-list request and all per-auction crawls.

    :param log: logger-like object (``info`` / ``error`` are called).
    :param sql_pool: database pool passed through to the query and crawl helpers.
    :return: None. Returns early when the auction list cannot be fetched or
        when there is nothing new to crawl.
    """
    impersonate = random.choice(client_identifier_list)
    with requests.Session() as session:
        try:
            all_auctions = get_auction_list(log, session, impersonate)
        except Exception as e:
            log.error(f"获取拍卖会列表失败: {e}")
            return

        existing_ids = get_existing_auction_ids(log, sql_pool)
        new_auctions = diff_new_auctions(log, all_auctions, existing_ids)

        if not new_auctions:
            log.info("本轮无新增 auction,跳过 list 抓取")
            return

        total = len(new_auctions)
        for idx, auc in enumerate(new_auctions, 1):
            aid, name = auc["id"], auc["name"]
            log.info(f"========== [{idx}/{total}] 开始抓 auction={aid} ({name}) ==========")
            try:
                crawl_one_auction(log, sql_pool, session, impersonate,
                                  auction_id=aid, auction_name=name)
            except Exception as e:
                # One failing auction must not abort the remaining ones.
                # NOTE(review): if crawl_one_auction writes rows before it
                # fails, the next run will see this auction_id as already
                # crawled and never finish it — confirm its write behaviour.
                log.error(f"auction={aid} 抓取异常: {e}")
|
|
|
|
|
|
|
|
|
|
|
|
|
@retry(stop=stop_after_attempt(100), wait=wait_fixed(3600), after=after_log)
|
|
@retry(stop=stop_after_attempt(100), wait=wait_fixed(3600), after=after_log)
|
|
|
def lds_main(log):
|
|
def lds_main(log):
|
|
|
- """
|
|
|
|
|
- 主函数
|
|
|
|
|
- :param log: logger对象
|
|
|
|
|
- """
|
|
|
|
|
- log.info(
|
|
|
|
|
- f'开始运行 {inspect.currentframe().f_code.co_name} 爬虫任务....................................................')
|
|
|
|
|
|
|
+ """日调度主函数:增量 list + 补详情"""
|
|
|
|
|
+ log.info(f'开始运行 {inspect.currentframe().f_code.co_name} 增量爬虫任务 ...')
|
|
|
|
|
|
|
|
- # 配置 MySQL 连接池
|
|
|
|
|
sql_pool = MySQLConnectionPool(log=log)
|
|
sql_pool = MySQLConnectionPool(log=log)
|
|
|
if not sql_pool:
|
|
if not sql_pool:
|
|
|
log.error("MySQL数据库连接失败")
|
|
log.error("MySQL数据库连接失败")
|
|
@@ -203,42 +92,32 @@ def lds_main(log):
|
|
|
|
|
|
|
|
try:
|
|
try:
|
|
|
try:
|
|
try:
|
|
|
- get_sold_list(log, sql_pool)
|
|
|
|
|
|
|
+ run_incremental(log, sql_pool)
|
|
|
except Exception as e:
|
|
except Exception as e:
|
|
|
- log.error(f'Error getting sold list: {e}')
|
|
|
|
|
|
|
+ log.error(f'增量抓取失败: {e}')
|
|
|
|
|
|
|
|
- # 更新详情页
|
|
|
|
|
- log.debug('Updating detail pages........................... started')
|
|
|
|
|
- # sql_result = sql_pool.select_all('select id, detail_url from lelands_record where state = 0')
|
|
|
|
|
- sql_result = sql_pool.select_all('select id, detail_url from lelands_record where state != 1')
|
|
|
|
|
- # sql_result = sql_pool.select_all('select id, detail_url from lelands_record where imgs is null')
|
|
|
|
|
- for row in sql_result:
|
|
|
|
|
- sql_id = row[0]
|
|
|
|
|
- detail_url = row[1]
|
|
|
|
|
- try:
|
|
|
|
|
- get_details(log, detail_url, sql_pool, sql_id)
|
|
|
|
|
- except Exception as e:
|
|
|
|
|
- log.error(f'Error getting details for {detail_url}: {e}')
|
|
|
|
|
- # 更新数据和状态
|
|
|
|
|
- sql_pool.update_one_or_dict(
|
|
|
|
|
- table="lelands_record",
|
|
|
|
|
- data={"state": 2},
|
|
|
|
|
- condition={"id": sql_id}
|
|
|
|
|
- )
|
|
|
|
|
|
|
+ try:
|
|
|
|
|
+ update_details_for_pending(log, sql_pool)
|
|
|
|
|
+ except Exception as e:
|
|
|
|
|
+ log.error(f'详情补抓失败: {e}')
|
|
|
|
|
|
|
|
except Exception as e:
|
|
except Exception as e:
|
|
|
log.error(f'{inspect.currentframe().f_code.co_name} error: {e}')
|
|
log.error(f'{inspect.currentframe().f_code.co_name} error: {e}')
|
|
|
finally:
|
|
finally:
|
|
|
- log.info(f'爬虫程序 {inspect.currentframe().f_code.co_name} 运行结束,等待下一轮的采集任务............')
|
|
|
|
|
|
|
+ log.info(f'爬虫程序 {inspect.currentframe().f_code.co_name} 运行结束,等待下一轮采集 ...')
|
|
|
|
|
|
|
|
|
|
|
|
|
def schedule_task():
    """Run one crawl immediately, then again at 05:00 on the 1st and 15th.

    The scheduler job fires every day at 05:00; the callback itself decides
    whether today is a crawl day. This function polls the scheduler once per
    second in an endless loop and never returns.
    """
    # Imported once here rather than inside the callback, which runs daily.
    from datetime import date

    lds_main(log=logger)

    def run_semimonthly():
        # Only the 1st and 15th of the month trigger a crawl (twice a month).
        if date.today().day in (1, 15):
            lds_main(log=logger)

    schedule.every().day.at("05:00").do(run_semimonthly)
    while True:
        schedule.run_pending()
        time.sleep(1)
|
|
@@ -247,21 +126,3 @@ def schedule_task():
|
|
|
if __name__ == '__main__':
|
|
if __name__ == '__main__':
|
|
|
# lds_main(log=logger)
|
|
# lds_main(log=logger)
|
|
|
schedule_task()
|
|
schedule_task()
|
|
|
- # get_single_page(log=logger, page=1, sql_pool=None)
|
|
|
|
|
- # get_details(logger, "https://auction.lelands.com/bids/bidplace?itemid=133680", sql_pool=None, sql_id=None)
|
|
|
|
|
- """
|
|
|
|
|
- ['https://auction.lelands.com/images_items/item_133680_1_488429.jpg',
|
|
|
|
|
- 'https://auction.lelands.com/images_items/item_133680_1_488429.jpg',
|
|
|
|
|
- 'https://auction.lelands.com/images_items/item_133680_2_488430.jpg',
|
|
|
|
|
- 'https://auction.lelands.com/images_items/item_133680_3_488431.jpg',
|
|
|
|
|
- 'https://auction.lelands.com/images_items/item_133680_4_488432.jpg',
|
|
|
|
|
- 'https://auction.lelands.com/images_items/item_133680_5_488433.jpg',
|
|
|
|
|
- 'https://auction.lelands.com/images_items/item_133680_6_488434.jpg',
|
|
|
|
|
- 'https://auction.lelands.com/images_items/item_133680_7_488435.jpg',
|
|
|
|
|
- 'https://auction.lelands.com/images_items/item_133680_8_488436.jpg',
|
|
|
|
|
- 'https://auction.lelands.com/images_items/item_133680_9_488437.jpg',
|
|
|
|
|
- 'https://auction.lelands.com/images_items/item_133680_10_488438.jpg',
|
|
|
|
|
- 'https://auction.lelands.com/images_items/item_133680_11_488439.jpg',
|
|
|
|
|
- 'https://auction.lelands.com/images_items/item_133680_12_488440.jpg',
|
|
|
|
|
- 'https://auction.lelands.com/images_items/item_133680_13_488441.jpg']
|
|
|
|
|
- """
|
|
|