|
|
@@ -7,20 +7,24 @@ import inspect
|
|
|
import schedule
|
|
|
import requests
|
|
|
from loguru import logger
|
|
|
+
|
|
|
+from dpop_generator import DPoPGenerator
|
|
|
from mysql_pool import MySQLConnectionPool
|
|
|
from tenacity import retry, stop_after_attempt, wait_fixed
|
|
|
|
|
|
"""
|
|
|
请求网址:
|
|
|
https://jp.mercari.com/search?category_id=82&page_token=v1%3A1&status=sold_out%7Ctrading
|
|
|
+
|
|
|
+20260514 -> 只拿列表页 正面图
|
|
|
"""
|
|
|
|
|
|
# Mercari search API endpoint; fetch_page sends its request here and
# build_headers signs a DPoP proof for this same URL.
SEARCH_URL = "https://api.mercari.jp/v2/entities:search"
# NOTE(review): presumably the number of items requested per search page --
# confirm against build_payload.
PAGE_SIZE = 120
# Fixed client identifiers.
# NOTE(review): presumably sent with each search request -- confirm where
# build_headers / build_payload consume them.
LAPLACE_DEVICE_UUID = "a00429c5-ad26-4be4-83ae-60b7239e14d5"
SEARCH_SESSION_ID = "cfba38acec8cae78136c62441bbb267a"

# DPoP proofs are no longer stored as module constants: build_headers creates
# a fresh one per request through DPoPGenerator, so the previously pasted
# LIST_DPOP / DETAIL_DPOP tokens were dead code and have been removed.
|
|
|
|
|
|
logger.remove()
|
|
|
logger.add("./logs/{time:YYYYMMDD}.log", encoding='utf-8', rotation="00:00",
|
|
|
@@ -46,21 +50,20 @@ def after_log(retry_state):
|
|
|
log.info(f"Function '{retry_state.fn.__name__}', Attempt {retry_state.attempt_number} succeeded")
|
|
|
|
|
|
|
|
|
-@retry(stop=stop_after_attempt(5), wait=wait_fixed(1), after=after_log)
|
|
|
@retry(stop=stop_after_attempt(5), wait=wait_fixed(2), after=after_log)
def get_proxys(log):
    """Build the proxy mapping to hand to ``requests`` via ``proxies=``.

    The proxy URL is taken from the ``MERCARI_PROXY_URL`` environment variable
    when it is set and non-empty. Otherwise the previously hard-coded gateway
    (purchased account, North America exit) is used, so existing deployments
    behave exactly as before.

    :param log: logger object. Not used in the body; kept as the first
        positional parameter so existing callers keep working.
        NOTE(review): the ``after_log`` retry callback appears to log through
        it -- confirm before ever removing it.
    :return: dict with ``"http"`` and ``"https"`` keys, both mapped to the
        same proxy URL.
    """
    import os  # local import so this block does not depend on file-level imports

    # NOTE(review): this fallback embeds account credentials in source control.
    # Rotate them and drop the fallback once MERCARI_PROXY_URL is provisioned
    # everywhere. (Port 36927 on the same gateway was the earlier endpoint.)
    default_proxy_url = "http://u1952150085001297:sJMHl4qc4bM0@proxy.123proxy.cn:36931"
    proxy_url = os.environ.get("MERCARI_PROXY_URL") or default_proxy_url

    # Plain-HTTP and HTTPS traffic both go through the same HTTP proxy endpoint.
    return {
        "http": proxy_url,
        "https": proxy_url,
    }
|
|
|
@@ -68,11 +71,15 @@ def get_proxys(log):
|
|
|
|
|
|
def build_headers() -> dict:
|
|
|
"""构造 Mercari 搜索接口请求头。"""
|
|
|
+ gen = DPoPGenerator()
|
|
|
+ list_token = gen.generate(
|
|
|
+ htu="https://api.mercari.jp/v2/entities:search", htm="POST"
|
|
|
+ )
|
|
|
return {
|
|
|
"accept": "application/json, text/plain, */*",
|
|
|
"accept-language": "ja",
|
|
|
"content-type": "application/json",
|
|
|
- "dpop": LIST_DPOP,
|
|
|
+ "dpop": list_token,
|
|
|
# "dpop": "eyJ0eXAiOiJkcG9wK2p3dCIsImFsZyI6IkVTMjU2IiwiandrIjp7ImNydiI6IlAtMjU2Iiwia3R5IjoiRUMiLCJ4IjoiajlJNmtMS2VrZFNOZEh5SHNhWmw1Z2tiYkZoRGFBUDNEd3N1dlZqQ3JXZyIsInkiOiJOTHREa2RkWVZhZkZ5a1FHYmsteDZBYUp6QWpVblZlcFl0X2pzdmV3cGdJIn19.eyJpYXQiOjE3NzgwNDYyMTksImp0aSI6IjQ0YmM4MzZlLWFiYWEtNDI1OC1hMjQ4LTNlNjkxMTUzZjY2NSIsImh0dSI6Imh0dHBzOi8vYXBpLm1lcmNhcmkuanAvdjIvZW50aXRpZXM6c2VhcmNoIiwiaHRtIjoiUE9TVCIsInV1aWQiOiJhMDA0MjljNS1hZDI2LTRiZTQtODNhZS02MGI3MjM5ZTE0ZDUifQ.KqYWvIC42NYjNTewIfttuPMFHYAwJ4JZIXn4ulQye6s9c5zQutabWoOp8sKDjy-zvmbDCYA-6K7e7dW3bVu3cw",
|
|
|
"origin": "https://jp.mercari.com",
|
|
|
"priority": "u=1, i",
|
|
|
@@ -187,53 +194,13 @@ def fetch_page(
|
|
|
SEARCH_URL,
|
|
|
headers=build_headers(),
|
|
|
json=build_payload(page_token=page_token, category_id=category_id),
|
|
|
- timeout=timeout
|
|
|
+ timeout=timeout,
|
|
|
+ # proxies=get_proxys(log),
|
|
|
)
|
|
|
response.raise_for_status()
|
|
|
return response
|
|
|
|
|
|
|
|
|
-@retry(stop=stop_after_attempt(5), wait=wait_fixed(1), after=after_log)
|
|
|
-def get_detail_page(log, pid):
|
|
|
- """
|
|
|
- 获取商品详情。
|
|
|
- :param log: logger对象
|
|
|
- :param pid: 商品ID
|
|
|
- """
|
|
|
- log.info(f"获取商品详情 {pid}............")
|
|
|
- headers = {
|
|
|
- "accept": "application/json, text/plain, */*",
|
|
|
- # "accept-language": "ja",
|
|
|
- "dpop": DETAIL_DPOP,
|
|
|
- # "referer": "https://jp.mercari.com/",
|
|
|
- "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/147.0.0.0 Safari/537.36",
|
|
|
- "x-platform": "web"
|
|
|
- }
|
|
|
- url = "https://api.mercari.jp/items/get"
|
|
|
- params = {
|
|
|
- # "id": "m69042262006",
|
|
|
- "id": pid,
|
|
|
- "include_item_attributes": "true",
|
|
|
- "include_product_page_component": "true",
|
|
|
- "include_non_ui_item_attributes": "true",
|
|
|
- "include_donation": "true",
|
|
|
- "include_item_attributes_sections": "true",
|
|
|
- "include_auction": "true",
|
|
|
- "country_code": "JP"
|
|
|
- }
|
|
|
- response = requests.get(url, headers=headers, params=params, timeout=22)
|
|
|
- response.raise_for_status()
|
|
|
- resp_json = response.json()
|
|
|
- data = resp_json.get("data", {})
|
|
|
- tag_seller = data.get("seller", {})
|
|
|
- seller_id = tag_seller.get("id")
|
|
|
- seller_name = tag_seller.get("name")
|
|
|
- photos = data.get("photos", [])
|
|
|
- photos = ','.join(photos) if photos else None
|
|
|
- # print(seller_id, seller_name, photos)
|
|
|
- return seller_id, seller_name, photos
|
|
|
-
|
|
|
-
|
|
|
def parse_list(log, resp_json, sql_pool, category_id, category_name):
|
|
|
"""
|
|
|
解析商品列表数据。
|
|
|
@@ -243,36 +210,30 @@ def parse_list(log, resp_json, sql_pool, category_id, category_name):
|
|
|
:param category_id: 类别ID
|
|
|
:param category_name: 类别名称
|
|
|
"""
|
|
|
+ log.info(f"解析商品列表数据............")
|
|
|
items = resp_json.get("items", [])
|
|
|
+
|
|
|
+ data_list = []
|
|
|
for item in items:
|
|
|
pid = item.get("id")
|
|
|
- # sellerId = item.get("sellerId")
|
|
|
+ seller_id = item.get("sellerId")
|
|
|
status = item.get("status")
|
|
|
product_name = item.get("name")
|
|
|
price = item.get("price")
|
|
|
|
|
|
created_at = item.get("created") # 1777512645 时间戳
|
|
|
- created_at = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(created_at))) if int(created_at) else None
|
|
|
+ created_at = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(created_at))) if created_at else None
|
|
|
updated_at = item.get("updated") # 1777512645 时间戳
|
|
|
- updated_at = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(updated_at))) if int(updated_at) else None
|
|
|
+ updated_at = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(updated_at))) if updated_at else None
|
|
|
|
|
|
- # thumbnails = item.get("thumbnails", [])
|
|
|
- # img = thumbnails[0] if thumbnails else None
|
|
|
-
|
|
|
- # categoryId = item.get("categoryId")
|
|
|
-
|
|
|
- # 获取详情页多图
|
|
|
- try:
|
|
|
- seller_id, seller_name, photos = get_detail_page(log, pid)
|
|
|
- except Exception as e:
|
|
|
- log.error(f"Error getting detail page: {e}")
|
|
|
- seller_id, seller_name, photos = None, None, None
|
|
|
+ thumbnails = item.get("thumbnails", [])
|
|
|
+ img = thumbnails[0] if thumbnails else None
|
|
|
|
|
|
data_dict = {
|
|
|
"pid": pid,
|
|
|
"seller_id": seller_id,
|
|
|
- "seller_name": seller_name,
|
|
|
- "photos": photos,
|
|
|
+ # "seller_name": seller_name,
|
|
|
+ "photos": img,
|
|
|
"status": status,
|
|
|
"product_name": product_name,
|
|
|
"price": price,
|
|
|
@@ -282,8 +243,11 @@ def parse_list(log, resp_json, sql_pool, category_id, category_name):
|
|
|
"category_name": category_name
|
|
|
}
|
|
|
# log.info(data_dict)
|
|
|
+ data_list.append(data_dict)
|
|
|
|
|
|
- sql_pool.insert_one_or_dict(table="mercari_record", data=data_dict, ignore=True)
|
|
|
+ # 批量插入数据
|
|
|
+ if data_list:
|
|
|
+ sql_pool.insert_many(table="mercari_record", data_list=data_list, ignore=True)
|
|
|
|
|
|
|
|
|
def iter_pages(
|
|
|
@@ -291,8 +255,8 @@ def iter_pages(
|
|
|
sql_pool,
|
|
|
category_id: int,
|
|
|
category_name: str,
|
|
|
- start_page: int = 1,
|
|
|
- end_page: int = 15000,
|
|
|
+ start_page: int,
|
|
|
+ end_page: int,
|
|
|
):
|
|
|
"""
|
|
|
循环请求多页,返回页码和 Response。
|
|
|
@@ -303,9 +267,6 @@ def iter_pages(
|
|
|
:param start_page: 开始页码
|
|
|
:param end_page: 结束页码
|
|
|
"""
|
|
|
- if category_id == 1289:
|
|
|
- start_page = 42
|
|
|
-
|
|
|
if end_page < start_page:
|
|
|
raise ValueError("end_page 必须大于等于 start_page")
|
|
|
|
|
|
@@ -360,7 +321,7 @@ def mercari_main(log):
|
|
|
category_id = category["category_id"]
|
|
|
category_name = category["category_name"]
|
|
|
log.debug(f'开始爬取类别 {category_name}............')
|
|
|
- iter_pages(log, sql_pool, category_id, category_name, start_page=1, end_page=100)
|
|
|
+ iter_pages(log, sql_pool, category_id, category_name, start_page=1, end_page=200)
|
|
|
except Exception as e:
|
|
|
log.error(f'{inspect.currentframe().f_code.co_name} error: {e}')
|
|
|
|
|
|
@@ -374,7 +335,7 @@ def schedule_task():
|
|
|
"""
|
|
|
设置定时任务
|
|
|
"""
|
|
|
- mercari_main(log=logger)
|
|
|
+ # mercari_main(log=logger)
|
|
|
|
|
|
schedule.every().day.at("04:00").do(mercari_main, log=logger)
|
|
|
while True:
|
|
|
@@ -383,6 +344,6 @@ def schedule_task():
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: hand control to the scheduler, which registers the
    # daily 04:00 crawl and then loops forever.
    # For a one-off crawl without the scheduler, call mercari_main(log=logger).
    schedule_task()
|