hoopi_auction_premier_spider.py 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353
  1. # -*- coding: utf-8 -*-
  2. # Author : Charley
  3. # Python : 3.10.8
  4. # Date : 2025/8/14 14:27
  5. import inspect
  6. import requests
  7. from loguru import logger
  8. from mysql_pool import MySQLConnectionPool
  9. from tenacity import retry, stop_after_attempt, wait_fixed
  10. logger.remove()
  11. logger.add("./logs/{time:YYYYMMDD}.log", encoding='utf-8', rotation="00:00",
  12. format="[{time:YYYY-MM-DD HH:mm:ss.SSS}] {level} {message}",
  13. level="DEBUG", retention="7 day")
  14. category = "premier"
  15. max_page = 50
  16. country_name = 'Malaysia'
  17. headers = {
  18. "User-Agent": "okhttp/4.10.0",
  19. "Accept-Encoding": "gzip",
  20. "Content-Type": "application/json",
  21. # "x-access-token": "eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJsb2dpblR5cGUiOiIxIiwiZXhwIjoxNzU1MTkyNDI2LCJ1c2VybmFtZSI6ImNoYXJsZXlfbGVvQDE2My5jb20ifQ.byBS1zj-LyD1mKHrCx9eLy5X2d0QzTO0FwApj2egSVI",
  22. "country": "1",
  23. "lang": "zh",
  24. "platform": "Android",
  25. "content-type": "application/json; charset=UTF-8"
  26. }
  27. def after_log(retry_state):
  28. """
  29. retry 回调
  30. :param retry_state: RetryCallState 对象
  31. """
  32. # 检查 args 是否存在且不为空
  33. if retry_state.args and len(retry_state.args) > 0:
  34. log = retry_state.args[0] # 获取传入的 logger
  35. else:
  36. log = logger # 使用全局 logger
  37. if retry_state.outcome.failed:
  38. log.warning(
  39. f"Function '{retry_state.fn.__name__}', Attempt {retry_state.attempt_number} Times")
  40. else:
  41. log.info(f"Function '{retry_state.fn.__name__}', Attempt {retry_state.attempt_number} succeeded")
  42. @retry(stop=stop_after_attempt(5), wait=wait_fixed(1), after=after_log)
  43. def get_premier_single_page(log, page_no):
  44. url = "https://cp.hoopi.xyz/hoopiserver/hoopi/api/goods/listGoods"
  45. data = {
  46. "isTopStatus": "1",
  47. "sortType": "2",
  48. "containType": [
  49. "2"
  50. ],
  51. # "pageNo": 1,
  52. "pageNo": page_no,
  53. "premierGoods": "1",
  54. "pageSize": 15,
  55. "isSellOutShow": "1",
  56. "status": "2"
  57. }
  58. response = requests.post(url, headers=headers, json=data, timeout=22)
  59. # print(response.text)
  60. response.raise_for_status()
  61. if response.status_code == 200:
  62. result = response.json()
  63. if result["success"]:
  64. return result["result"]
  65. else:
  66. log.warning(f"result_message: {result['message']}")
  67. else:
  68. log.warning(f" {inspect.currentframe().f_code.co_name} Request failed with status code: {response.status_code}")
  69. return None
  70. def parse_list_items(log, items, sql_pool):
  71. log.info(f"{inspect.currentframe().f_code.co_name} Start parsing items")
  72. if items:
  73. info_list = []
  74. for item in items:
  75. item_id = item.get('id')
  76. data_dict = {
  77. "item_id": item_id,
  78. "category": category,
  79. "country_name": country_name
  80. }
  81. info_list.append(data_dict)
  82. if info_list:
  83. sql_pool.insert_many(table="hoopi_auction_record", data_list=info_list, ignore=True)
  84. else:
  85. log.warning(f" {inspect.currentframe().f_code.co_name} No items found")
  86. def get_premier_list(log, sql_pool):
  87. page = 1
  88. total_items = 0
  89. # while True:
  90. while page <= max_page:
  91. result = get_premier_single_page(log, page)
  92. if result is None:
  93. break
  94. items = result.get("list", [])
  95. if not items:
  96. log.debug("No items found on page %s", page)
  97. break
  98. try:
  99. parse_list_items(log, items, sql_pool)
  100. except Exception as e:
  101. log.error("Error parsing items on page %s: %s", page, e)
  102. total_items += len(items)
  103. pages = result.get("pages")
  104. total = result.get("total")
  105. # 判断条件 1: 根据 pages 判断
  106. if pages is not None and page >= pages:
  107. log.debug("已爬取 %s 页,共 %s 页" % (page, pages))
  108. break
  109. # 判断条件 2: 根据 list 的长度判断
  110. if len(items) < 15: # pageSize 为 15
  111. log.debug("已获取数据量小于15,停止爬取......................")
  112. break
  113. # 判断条件 3: 根据 total 和已获取数据量判断
  114. if total is not None and total_items >= total:
  115. log.debug("已获取数据量已满足要求,停止爬取......................")
  116. break
  117. page += 1
  118. # time.sleep(random.uniform(0.1, 0.5)) # 添加延时,避免频繁请求
  119. # ----------------------------------------------------------------------------------------------------------------------
  120. def parse_detail(log, item, sql_pool, item_id):
  121. log.debug("开始解析详情页数据........................")
  122. try:
  123. title = item.get('name')
  124. shopId = item.get('shopId')
  125. shopAppUserId = item.get('shopAppUserId')
  126. shopName = item.get('shopName')
  127. infoImgs = item.get('infoImgs') # 详情图片, 多图, 逗号分隔
  128. cardTypeName = item.get('cardTypeName') # 卡类型
  129. explainIntroduce = item.get('explainIntroduce') # 描述
  130. # 去除表情符号
  131. # if explainIntroduce:
  132. # explainIntroduce = emoji.replace_emoji(explainIntroduce, replace='')
  133. price = item.get('price')
  134. freightPrice = item.get('freightPrice') # 运费
  135. currency = item.get('currency') # 币种
  136. soldCount = item.get('soldCount') # 售出计数
  137. sellOffCount = item.get('sellOffCount') # 抛售计数
  138. status = item.get('status') # 2:售罄
  139. finishTime = item.get('finishTime')
  140. conditionTypeName = item.get('conditionTypeName') # 评级/状况
  141. countryName = item.get('countryName') # 国家
  142. shopSoldCount = item.get('shopSoldCount') # 店铺已售
  143. bidCount = item.get('bidCount') # 竞拍次数
  144. data_dict = {
  145. 'title': title,
  146. "shop_id": shopId,
  147. 'shop_name': shopName,
  148. 'shop_app_user_id': shopAppUserId,
  149. 'info_imgs': infoImgs,
  150. 'card_type_name': cardTypeName,
  151. 'explain_introduce': explainIntroduce,
  152. 'price': price,
  153. 'freight_price': freightPrice,
  154. 'currency': currency,
  155. 'sold_count': soldCount,
  156. 'sell_off_count': sellOffCount,
  157. 'status': status,
  158. 'finish_time': finishTime,
  159. 'condition_type_name': conditionTypeName,
  160. 'country_name': countryName,
  161. 'shop_sold_count': shopSoldCount,
  162. 'bid_count': bidCount,
  163. 'state': 1
  164. }
  165. # print('data_dict:',data_dict)
  166. try:
  167. sql_pool.update_one_or_dict(table='hoopi_auction_record', data=data_dict, condition={'item_id': item_id})
  168. log.success(f"----------------------- 更新成功, item_id: {item_id} -----------------------")
  169. except Exception as e:
  170. log.error(f'解析详情页数据 update_one_or_dict 报错:{e}')
  171. sql_pool.update_one_or_dict(table="hoopi_auction_record", data={"state": 3}, condition={"item_id": item_id})
  172. except Exception as e:
  173. log.error(f'解析详情页数据error, {e}')
  174. sql_pool.update_one_or_dict(table="hoopi_auction_record", data={"state": 3}, condition={"item_id": item_id})
  175. @retry(stop=stop_after_attempt(5), wait=wait_fixed(1), after=after_log)
  176. def get_detail(log, item_id, sql_pool):
  177. log.debug(f"开始获取详情页数据, item_id: {item_id}........................")
  178. # url = "https://cp.hoopi.xyz/hoopiserver/hoopi/api/goods/getGoodsInfo/1954822331293704194"
  179. url = f"https://cp.hoopi.xyz/hoopiserver/hoopi/api/goods/getGoodsInfo/{item_id}"
  180. response = requests.post(url, headers=headers, timeout=10)
  181. # print(response.text)
  182. response.raise_for_status()
  183. data = response.json()
  184. if data['code'] == 200:
  185. result = data.get("result", {})
  186. parse_detail(log, result, sql_pool, item_id)
  187. else:
  188. log.error(f"获取详情页数据失败, item_id: {item_id}, msg:{data['message']}")
  189. sql_pool.update_one_or_dict(table="hoopi_auction_record", data={"state": 3}, condition={"item_id": item_id})
  190. # -------------------------------------------------------------------------------------------------------------------
  191. @retry(stop=stop_after_attempt(5), wait=wait_fixed(1), after=after_log)
  192. def get_bid_list(log, item_id, sql_pool, token):
  193. log.debug(f"开始获取竞拍记录数据, item_id: {item_id}........................")
  194. headers_bid = {
  195. # "x-access-token": "eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJsb2dpblR5cGUiOiIxIiwiZXhwIjoxNzU3OTUwODI0LCJ1c2VybmFtZSI6InRpYW56aHUxMDA5QGdtYWlsLmNvbSJ9.PkSn4I2evvlF27OfrxGidT-IwuuTo9nNDukuHSHSs0w"
  196. "x-access-token": token
  197. }
  198. copy_headers = headers.copy()
  199. copy_headers.update(headers_bid)
  200. # print(copy_headers)
  201. # url = "https://cp.hoopi.xyz/hoopiserver/hoopi/api/goodsAuction/getBidRecordByGoodsId/1830661503251054593"
  202. url = f"https://cp.hoopi.xyz/hoopiserver/hoopi/api/goodsAuction/getBidRecordByGoodsId/{item_id}"
  203. response = requests.post(url, headers=copy_headers)
  204. # print(response.text)
  205. response.raise_for_status()
  206. if response.status_code == 200:
  207. biddings = response.json()["result"]
  208. """
  209. 获取 biddings 信息
  210. """
  211. # biddings = resp_json.get('biddings', [])
  212. # print(biddings)
  213. # 创建一个字典来存储每个用户的最高出价记录
  214. highest_bids = {}
  215. for record in biddings:
  216. username = record['appUserName']
  217. bid_price = float(record['bidPrice']) # 将出价转换为浮点数以便比较
  218. # 如果用户不在字典中,或者当前出价高于已存储的最高出价,则更新记录
  219. if username not in highest_bids or bid_price > float(highest_bids[username]['bidPrice']):
  220. highest_bids[username] = record
  221. bids_list = list(highest_bids.values())
  222. # print(highest_bids)
  223. # print(bids_list)
  224. biddings_list = [
  225. {
  226. 'item_id': item_id,
  227. 'bid_id': record['id'],
  228. 'user_id': record['appUserId'],
  229. 'username': record['appUserName'],
  230. 'bid_price': record['bidPrice'],
  231. 'bid_time': record['bidTime'],
  232. }
  233. for record in bids_list
  234. ]
  235. # print('biddings_list:', biddings_list)
  236. if biddings_list:
  237. sql_pool.insert_many(table='hoopi_auction_bid_record', data_list=biddings_list)
  238. sql_pool.update_one_or_dict(table="hoopi_auction_record", data={"state": 1}, condition={"item_id": item_id})
  239. log.success(f"----------------------- 添加成功, item_id: {item_id} -----------------------")
  240. else:
  241. log.warning(f"----------------------- 添加失败, item_id: {item_id} -----------------------")
  242. sql_pool.update_one_or_dict(table="hoopi_auction_record", data={"state": 2}, condition={"item_id": item_id})
  243. else:
  244. log.warning(f" {inspect.currentframe().f_code.co_name} Request failed with status code: {response.status_code}")
  245. sql_pool.update_one_or_dict(table="hoopi_auction_record", data={"state": 3}, condition={"item_id": item_id})
  246. @retry(stop=stop_after_attempt(100), wait=wait_fixed(3600), after=after_log)
  247. def hoopi_premier_main(log):
  248. """
  249. 主函数
  250. :param log: logger对象
  251. """
  252. log.info(
  253. f'开始运行 {inspect.currentframe().f_code.co_name} 爬虫任务....................................................')
  254. # 配置 MySQL 连接池
  255. sql_pool = MySQLConnectionPool(log=log)
  256. if not sql_pool.check_pool_health():
  257. log.error("数据库连接池异常")
  258. raise RuntimeError("数据库连接池异常")
  259. try:
  260. try:
  261. # 获取已售出商品列表
  262. log.debug(f"开始获取已售出商品列表, category: {category}........................")
  263. get_premier_list(log, sql_pool)
  264. # 获取商品详情
  265. log.debug(f"开始获取商品详情, category: {category}........................")
  266. sql_ietm_id_list = sql_pool.select_all(
  267. f"SELECT item_id FROM hoopi_auction_record WHERE state != 1 AND category = '{category}' AND country_name = '{country_name}'")
  268. sql_ietm_id_list = [item_id[0] for item_id in sql_ietm_id_list]
  269. for item_id in sql_ietm_id_list:
  270. try:
  271. get_detail(log, item_id, sql_pool)
  272. except Exception as e:
  273. log.error(f"Request get_detail error: {e}")
  274. # 获取商品出价列表
  275. log.debug(f"开始获取商品出价列表, category: {category}........................")
  276. # 获取 token
  277. token = sql_pool.select_one("SELECT token FROM hoopi_token")
  278. token = token[0]
  279. sql_bid_state_list = sql_pool.select_all(
  280. f"SELECT item_id FROM hoopi_auction_record WHERE bid_state != 1 AND category = '{category}' AND country_name = '{country_name}'")
  281. sql_bid_state_list = [item_id[0] for item_id in sql_bid_state_list]
  282. for item_id in sql_bid_state_list:
  283. try:
  284. get_bid_list(log, item_id, sql_pool, token)
  285. except Exception as e:
  286. log.error(f"Request get_bid_list error: {e}")
  287. except Exception as e:
  288. log.error(f"Request get_shop_data_list error: {e}")
  289. except Exception as e:
  290. log.error(f'{inspect.currentframe().f_code.co_name} error: {e}')
  291. finally:
  292. log.info(f'爬虫程序 {inspect.currentframe().f_code.co_name} 运行结束,等待下一轮的采集任务............')
  293. # EmailSender().send(subject="【千岛 拍卖 - 爬虫通知】今日任务已完成",
  294. # content="数据采集和处理已全部完成,请查收结果。\n\n ------ 来自 Python 爬虫系统。")
  295. if __name__ == '__main__':
  296. # get_general_list(logger, None)
  297. # get_bid_list(logger, '1830661503251054593', None)
  298. hoopi_premier_main(logger)