hoopi_auction_general_spider.py

# -*- coding: utf-8 -*-
# Author : Charley
# Python : 3.10.8
# Date : 2025/8/14 14:27
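"""
Hoopi auction spider for the "general" category (Malaysia):
fetches the sold-goods listing, then each item's detail page and bid
records, and writes everything to MySQL via MySQLConnectionPool.
"""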
import inspect

import requests
from loguru import logger
from tenacity import retry, stop_after_attempt, wait_fixed

from mysql_pool import MySQLConnectionPool

logger.remove()
logger.add("./logs/{time:YYYYMMDD}.log", encoding='utf-8', rotation="00:00",
           format="[{time:YYYY-MM-DD HH:mm:ss.SSS}] {level} {message}",
           level="DEBUG", retention="7 days")

category = "general"
max_page = 50
country_name = 'Malaysia'
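
# Base request headers shared by every endpoint; get_bid_list merges a
# per-request x-access-token on top of a copy of this dict.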
headers = {
    "User-Agent": "okhttp/4.10.0",
    "Accept-Encoding": "gzip",
    "Content-Type": "application/json",
    # "x-access-token": "eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJsb2dpblR5cGUiOiIxIiwiZXhwIjoxNzU1MTkyNDI2LCJ1c2VybmFtZSI6ImNoYXJsZXlfbGVvQDE2My5jb20ifQ.byBS1zj-LyD1mKHrCx9eLy5X2d0QzTO0FwApj2egSVI",
    "country": "1",
    "lang": "zh",
    "platform": "Android",
    "content-type": "application/json; charset=UTF-8"
}


def after_log(retry_state):
    """
    Retry callback for tenacity.
    :param retry_state: RetryCallState object
    """
    # Check whether positional args were passed to the wrapped function
    if retry_state.args and len(retry_state.args) > 0:
        log = retry_state.args[0]  # logger passed in as the first argument
    else:
        log = logger  # fall back to the global logger
    if retry_state.outcome.failed:
        log.warning(
            f"Function '{retry_state.fn.__name__}', attempt {retry_state.attempt_number} failed")
    else:
        log.info(f"Function '{retry_state.fn.__name__}', attempt {retry_state.attempt_number} succeeded")


@retry(stop=stop_after_attempt(5), wait=wait_fixed(1), after=after_log)
def get_general_single_page(log, page_no):
    log.debug(f"{inspect.currentframe().f_code.co_name} Start get general single page: {page_no}")
    url = "https://cp.hoopi.xyz/hoopiserver/hoopi/api/goods/listGoods"
    data = {
        "sortType": "2",
        "containType": [
            "2"
        ],
        # "pageNo": 1,
        "pageNo": page_no,
        "premierGoods": "0",
        "pageSize": 15,
        "isSellOutShow": "1",
        "status": "2"
    }
    response = requests.post(url, headers=headers, json=data, timeout=22)
    # print(response.text)
    response.raise_for_status()
    if response.status_code == 200:
        result = response.json()
        if result["success"]:
            return result["result"]
        else:
            log.warning(f"result_message: {result['message']}")
    else:
        log.warning(f"{inspect.currentframe().f_code.co_name} Request failed with status code: {response.status_code}")
    return None


def parse_list_items(log, items, sql_pool):
    log.info(f"{inspect.currentframe().f_code.co_name} Start parsing items")
    if items:
        info_list = []
        for item in items:
            item_id = item.get('id')
            log.debug(f'item_id: {item_id}')
            data_dict = {
                "item_id": item_id,
                "category": category,
                "country_name": country_name
            }
            info_list.append(data_dict)
        if info_list:
            sql_pool.insert_many(table="hoopi_auction_record", data_list=info_list, ignore=True)
    else:
        log.warning(f"{inspect.currentframe().f_code.co_name} No items found")


def get_general_list(log, sql_pool):
    page = 1
    total_items = 0
    # while True:
    while page <= max_page:
        result = get_general_single_page(log, page)
        if result is None:
            break
        items = result.get("list", [])
        if not items:
            log.debug(f"No items found on page {page}")
            break
        try:
            parse_list_items(log, items, sql_pool)
        except Exception as e:
            log.error(f"Error parsing items on page {page}: {e}")
        total_items += len(items)
        pages = result.get("pages")
        total = result.get("total")
        # Stop condition 1: the reported page count has been reached
        if pages is not None and page >= pages:
            log.debug(f"Crawled page {page} of {pages}, stopping")
            break
        # Stop condition 2: the returned list is shorter than a full page
        if len(items) < 15:  # pageSize is 15
            log.debug("Fewer than 15 items returned, stopping the crawl......................")
            break
        # Stop condition 3: the collected count has reached the reported total
        if total is not None and total_items >= total:
            log.debug("Collected item count has reached the total, stopping the crawl......................")
            break
        page += 1
        # time.sleep(random.uniform(0.1, 0.5))  # add a delay to avoid hammering the API


# ----------------------------------------------------------------------------------------------------------------------
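# Record state flags on hoopi_auction_record, as used below:
#   state:     1 = detail parsed and saved, 3 = detail fetch/parse failed
#   bid_state: 1 = bid records saved, 2 = no bid records returned, 3 = bid request failed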
def parse_detail(log, item, sql_pool, item_id):
    log.debug("Start parsing detail page data........................")
    try:
        title = item.get('name')
        shopId = item.get('shopId')
        shopAppUserId = item.get('shopAppUserId')
        shopName = item.get('shopName')
        infoImgs = item.get('infoImgs')  # detail images, multiple, comma-separated
        cardTypeName = item.get('cardTypeName')  # card type
        explainIntroduce = item.get('explainIntroduce')  # description
        # Strip emoji
        # if explainIntroduce:
        #     explainIntroduce = emoji.replace_emoji(explainIntroduce, replace='')
        price = item.get('price')
        freightPrice = item.get('freightPrice')  # shipping fee
        currency = item.get('currency')  # currency
        soldCount = item.get('soldCount')  # sold count
        sellOffCount = item.get('sellOffCount')  # sell-off count
        status = item.get('status')  # 2: sold out
        finishTime = item.get('finishTime')
        conditionTypeName = item.get('conditionTypeName')  # grading / condition
        countryName = item.get('countryName')  # country
        shopSoldCount = item.get('shopSoldCount')  # items sold by the shop
        bidCount = item.get('bidCount')  # number of bids
        data_dict = {
            'title': title,
            'shop_id': shopId,
            'shop_name': shopName,
            'shop_app_user_id': shopAppUserId,
            'info_imgs': infoImgs,
            'card_type_name': cardTypeName,
            'explain_introduce': explainIntroduce,
            'price': price,
            'freight_price': freightPrice,
            'currency': currency,
            'sold_count': soldCount,
            'sell_off_count': sellOffCount,
            'status': status,
            'finish_time': finishTime,
            'condition_type_name': conditionTypeName,
            'country_name': countryName,
            'shop_sold_count': shopSoldCount,
            'bid_count': bidCount,
            'state': 1
        }
        # print('data_dict:', data_dict)
        try:
            sql_pool.update_one_or_dict(table='hoopi_auction_record', data=data_dict, condition={'item_id': item_id})
            log.success(f"----------------------- Update succeeded, item_id: {item_id} -----------------------")
        except Exception as e:
            log.error(f'parse_detail update_one_or_dict error: {e}')
            sql_pool.update_one_or_dict(table="hoopi_auction_record", data={"state": 3}, condition={"item_id": item_id})
    except Exception as e:
        log.error(f'parse_detail error: {e}')
        sql_pool.update_one_or_dict(table="hoopi_auction_record", data={"state": 3}, condition={"item_id": item_id})


@retry(stop=stop_after_attempt(5), wait=wait_fixed(1), after=after_log)
def get_detail(log, item_id, sql_pool):
    log.debug(f"Start fetching detail page data, item_id: {item_id}........................")
    # url = "https://cp.hoopi.xyz/hoopiserver/hoopi/api/goods/getGoodsInfo/1954822331293704194"
    url = f"https://cp.hoopi.xyz/hoopiserver/hoopi/api/goods/getGoodsInfo/{item_id}"
    response = requests.post(url, headers=headers, timeout=10)
    # print(response.text)
    response.raise_for_status()
    data = response.json()
    if data['code'] == 200:
        result = data.get("result", {})
        parse_detail(log, result, sql_pool, item_id)
    else:
        log.error(f"Failed to fetch detail page data, item_id: {item_id}, msg: {data['message']}")
        sql_pool.update_one_or_dict(table="hoopi_auction_record", data={"state": 3}, condition={"item_id": item_id})


# -------------------------------------------------------------------------------------------------------------------
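# The bid-record endpoint requires a logged-in x-access-token; hoopi_general_main
# reads it from the hoopi_token table and passes it in as `token`.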
@retry(stop=stop_after_attempt(5), wait=wait_fixed(1), after=after_log)
def get_bid_list(log, item_id, sql_pool, token):
    log.debug(f"Start fetching bid records, item_id: {item_id}........................")
    headers_bid = {
        # "x-access-token": "eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJsb2dpblR5cGUiOiIxIiwiZXhwIjoxNzU3OTUwODI0LCJ1c2VybmFtZSI6InRpYW56aHUxMDA5QGdtYWlsLmNvbSJ9.PkSn4I2evvlF27OfrxGidT-IwuuTo9nNDukuHSHSs0w"
        "x-access-token": token
    }
    copy_headers = headers.copy()
    copy_headers.update(headers_bid)
    # print(copy_headers)
    # url = "https://cp.hoopi.xyz/hoopiserver/hoopi/api/goodsAuction/getBidRecordByGoodsId/1830661503251054593"
    url = f"https://cp.hoopi.xyz/hoopiserver/hoopi/api/goodsAuction/getBidRecordByGoodsId/{item_id}"
    response = requests.post(url, headers=copy_headers, timeout=10)
    # print(response.text)
    response.raise_for_status()
    if response.status_code == 200:
        # Extract the bid records from the response
        biddings = response.json()["result"]
        # Build a dict that keeps only each user's highest bid
        highest_bids = {}
        for record in biddings:
            username = record['appUserName']
            bid_price = float(record['bidPrice'])  # convert the bid to float for comparison
            # Update if the user is not stored yet, or this bid is higher than the stored one
            if username not in highest_bids or bid_price > float(highest_bids[username]['bidPrice']):
                highest_bids[username] = record
        bids_list = list(highest_bids.values())
        biddings_list = [
            {
                'item_id': item_id,
                'bid_id': record['id'],
                'user_id': record['appUserId'],
                'username': record['appUserName'],
                'bid_price': record['bidPrice'],
                'bid_time': record['bidTime'],
            }
            for record in bids_list
        ]
        # print('biddings_list:', biddings_list)
        if biddings_list:
            sql_pool.insert_many(table='hoopi_auction_bid_record', data_list=biddings_list)
            sql_pool.update_one_or_dict(table="hoopi_auction_record", data={"bid_state": 1},
                                        condition={"item_id": item_id})
            log.success(f"----------------------- Bid records saved, item_id: {item_id} -----------------------")
        else:
            log.warning(f"----------------------- No bid records, item_id: {item_id} -----------------------")
            sql_pool.update_one_or_dict(table="hoopi_auction_record", data={"bid_state": 2},
                                        condition={"item_id": item_id})
    else:
        log.warning(f"{inspect.currentframe().f_code.co_name} Request failed with status code: {response.status_code}")
        sql_pool.update_one_or_dict(table="hoopi_auction_record", data={"bid_state": 3}, condition={"item_id": item_id})
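

# The whole job only retries when an exception escapes hoopi_general_main (for example
# the RuntimeError raised on an unhealthy connection pool): up to 100 attempts, one hour apart.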
@retry(stop=stop_after_attempt(100), wait=wait_fixed(3600), after=after_log)
def hoopi_general_main(log):
    """
    Main entry point.
    :param log: logger object
    """
    log.info(
        f'Start running the {inspect.currentframe().f_code.co_name} crawler task....................................................')
    # Set up the MySQL connection pool
    sql_pool = MySQLConnectionPool(log=log)
    if not sql_pool.check_pool_health():
        log.error("Database connection pool is unhealthy")
        raise RuntimeError("Database connection pool is unhealthy")
    try:
        try:
            # Fetch the list of sold goods
            log.debug(f"Start fetching the sold goods list, category: {category}........................")
            get_general_list(log, sql_pool)
            # Fetch goods details
            log.debug(f"Start fetching goods details, category: {category}........................")
            sql_item_id_list = sql_pool.select_all(
                f"SELECT item_id FROM hoopi_auction_record WHERE state != 1 AND category = '{category}' AND country_name = '{country_name}'")
            sql_item_id_list = [item_id[0] for item_id in sql_item_id_list]
            for item_id in sql_item_id_list:
                try:
                    get_detail(log, item_id, sql_pool)
                except Exception as e:
                    log.error(f"Request get_detail error: {e}")
            # Fetch the bid list for each item
            log.debug(f"Start fetching item bid lists, category: {category}........................")
            # Fetch the access token
            token = sql_pool.select_one("SELECT token FROM hoopi_token")
            token = token[0]
            sql_bid_state_list = sql_pool.select_all(
                f"SELECT item_id FROM hoopi_auction_record WHERE bid_state != 1 AND category = '{category}' AND country_name = '{country_name}'")
            sql_bid_state_list = [item_id[0] for item_id in sql_bid_state_list]
            for item_id in sql_bid_state_list:
                try:
                    get_bid_list(log, item_id, sql_pool, token)
                except Exception as e:
                    log.error(f"Request get_bid_list error: {e}")
        except Exception as e:
            log.error(f"Crawl task error: {e}")
    except Exception as e:
        log.error(f'{inspect.currentframe().f_code.co_name} error: {e}')
    finally:
        log.info(f'Crawler {inspect.currentframe().f_code.co_name} finished; waiting for the next collection round............')
        # EmailSender().send(subject="[千岛 auction - crawler notice] Today's task is complete",
        #                    content="Data collection and processing are done, please check the results.\n\n ------ from the Python crawler system.")


if __name__ == '__main__':
    # get_general_list(logger, None)
    # get_bid_list(logger, '1830661503251054593', None)
    hoopi_general_main(logger)