# hoopi_box_spider.py
  1. # -*- coding: utf-8 -*-
  2. # Author : Charley
  3. # Python : 3.10.8
  4. # Date : 2025/8/14 15:55
  5. import inspect
  6. import requests
  7. from loguru import logger
  8. from mysql_pool import MySQLConnectionPool
  9. from tenacity import retry, stop_after_attempt, wait_fixed
  10. logger.remove()
  11. logger.add("./logs/{time:YYYYMMDD}.log", encoding='utf-8', rotation="00:00",
  12. format="[{time:YYYY-MM-DD HH:mm:ss.SSS}] {level} {message}",
  13. level="DEBUG", retention="7 day")
  14. country_name = "Malaysia"
  15. max_page = 50
  16. def after_log(retry_state):
  17. """
  18. retry 回调
  19. :param retry_state: RetryCallState 对象
  20. """
  21. # 检查 args 是否存在且不为空
  22. if retry_state.args and len(retry_state.args) > 0:
  23. log = retry_state.args[0] # 获取传入的 logger
  24. else:
  25. log = logger # 使用全局 logger
  26. if retry_state.outcome.failed:
  27. log.warning(
  28. f"Function '{retry_state.fn.__name__}', Attempt {retry_state.attempt_number} Times")
  29. else:
  30. log.info(f"Function '{retry_state.fn.__name__}', Attempt {retry_state.attempt_number} succeeded")
  31. @retry(stop=stop_after_attempt(5), wait=wait_fixed(1), after=after_log)
  32. def get_box_single_page(log, page_no, brand_id):
  33. log.debug(f"Start {inspect.currentframe().f_code.co_name}, page:{page_no}")
  34. headers = {
  35. "User-Agent": "okhttp/4.10.0",
  36. "Accept-Encoding": "gzip",
  37. "Content-Type": "application/json",
  38. # "x-access-token": "eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJsb2dpblR5cGUiOiIxIiwiZXhwIjoxNzU1MTkyNDI2LCJ1c2VybmFtZSI6ImNoYXJsZXlfbGVvQDE2My5jb20ifQ.byBS1zj-LyD1mKHrCx9eLy5X2d0QzTO0FwApj2egSVI",
  39. "country": "1",
  40. "lang": "zh",
  41. "platform": "Android",
  42. "content-type": "application/json; charset=UTF-8"
  43. }
  44. url = "https://cp.hoopi.xyz/hoopiserver/hoopi/api/box/listBox"
  45. data = {
  46. "sortType": "6",
  47. # "pageNo": 1,
  48. "pageNo": page_no,
  49. # "brandId": "1934927046253977602",
  50. "brandId": brand_id,
  51. "pageSize": 20
  52. }
  53. response = requests.post(url, headers=headers, json=data, timeout=22)
  54. # print(response.text)
  55. response.raise_for_status()
  56. if response.status_code == 200:
  57. result = response.json()
  58. if result["success"]:
  59. return result["result"]
  60. else:
  61. log.warning(f"result_message: {result['message']}")
  62. else:
  63. log.warning(f" {inspect.currentframe().f_code.co_name} Request failed with status code: {response.status_code}")
  64. return None
  65. def parse_list_items(log, items, sql_pool, brand_name):
  66. log.info(f"{inspect.currentframe().f_code.co_name} Start parsing items")
  67. if items:
  68. info_list = []
  69. for item in items:
  70. item_id = item.get("id")
  71. title = item.get("name")
  72. releaseTime = item.get("releaseTime")
  73. soldCount = item.get("soldCount")
  74. sellOffCount = item.get("sellOffCount")
  75. price = item.get("price") # 原价
  76. discountedPrice = item.get("discountedPrice") # 优惠价
  77. currency = item.get("currency") # 币种
  78. status = item.get("status")
  79. boxImg = item.get("boxImg")
  80. data_dict = {
  81. "item_id": item_id,
  82. "category": brand_name,
  83. "title": title,
  84. "release_time": releaseTime,
  85. "sold_count": soldCount,
  86. "sell_off_count": sellOffCount,
  87. "price": price,
  88. "discounted_price": discountedPrice,
  89. "currency": currency,
  90. "status": status,
  91. "box_img": boxImg,
  92. "country_name": country_name
  93. }
  94. # print('data_dict:', data_dict)
  95. info_list.append(data_dict)
  96. if info_list:
  97. sql_pool.insert_many(table="hoopi_box_record", data_list=info_list, ignore=True)
  98. else:
  99. log.warning(f" {inspect.currentframe().f_code.co_name} No items found")
  100. def get_box_list(log, sql_pool, brand_name, brand_id):
  101. page = 1
  102. total_items = 0
  103. # while True:
  104. while page <= max_page:
  105. result = get_box_single_page(log, page, brand_id)
  106. if result is None:
  107. break
  108. items = result.get("list", [])
  109. if not items:
  110. log.debug("No items found on page %s", page)
  111. break
  112. try:
  113. parse_list_items(log, items, sql_pool, brand_name)
  114. except Exception as e:
  115. log.error("Error parsing items on page %s: %s", page, e)
  116. total_items += len(items)
  117. pages = result.get("pages")
  118. total = result.get("total")
  119. # 判断条件 1: 根据 pages 判断
  120. if pages is not None and page >= pages:
  121. log.debug("已爬取 %s 页,共 %s 页" % (page, pages))
  122. break
  123. # 判断条件 2: 根据 list 的长度判断
  124. if len(items) < 20: # pageSize 为 20
  125. log.debug("已获取数据量小于 20,停止爬取......................")
  126. break
  127. # 判断条件 3: 根据 total 和已获取数据量判断
  128. if total is not None and total_items >= total:
  129. log.debug("已获取数据量已满足要求,停止爬取......................")
  130. break
  131. page += 1
  132. # time.sleep(random.uniform(0.1, 0.5)) # 添加延时,避免频繁请求
  133. # ----------------------------------------------------------------------------------------------------------------------
  134. def parse_detail(log, result, sql_pool, item_id):
  135. # log.debug(f"开始解析详情页数据, item_id:{item_id}........................")
  136. try:
  137. shopId = result.get("shopId")
  138. backImg = result.get("backImg")
  139. detailsImg = result.get("detailsImg")
  140. isRecycling = result.get("isRecycling") # 是否回收 str
  141. totalCount = result.get("totalCount") # 总数 int
  142. data_dict = {
  143. "shop_id": shopId,
  144. "back_img": backImg,
  145. "details_img": detailsImg,
  146. "is_recycling": isRecycling,
  147. "total_count": totalCount
  148. }
  149. # print('data_dict:',data_dict)
  150. try:
  151. sql_pool.update_one_or_dict(table='hoopi_box_record', data=data_dict, condition={'item_id': item_id})
  152. log.success(f"----------------------- 更新成功, item_id: {item_id} -----------------------")
  153. except Exception as e:
  154. log.error(f'解析详情页数据 update_one_or_dict 报错:{e}')
  155. sql_pool.update_one_or_dict(table="hoopi_box_record", data={"state": 3}, condition={"item_id": item_id})
  156. except Exception as e:
  157. log.error(f'解析详情页数据error, {e}')
  158. sql_pool.update_one_or_dict(table="hoopi_box_record", data={"state": 3}, condition={"item_id": item_id})
  159. @retry(stop=stop_after_attempt(5), wait=wait_fixed(1), after=after_log)
  160. def get_detail(log, item_id, sql_pool):
  161. log.debug(f"开始获取详情页数据, item_id: {item_id}........................")
  162. headers = {
  163. "User-Agent": "okhttp/4.10.0",
  164. "Accept-Encoding": "gzip",
  165. # "x-access-token": "eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJsb2dpblR5cGUiOiIxIiwiZXhwIjoxNzU1MTkyNDI2LCJ1c2VybmFtZSI6ImNoYXJsZXlfbGVvQDE2My5jb20ifQ.byBS1zj-LyD1mKHrCx9eLy5X2d0QzTO0FwApj2egSVI",
  166. "country": "1",
  167. "lang": "zh",
  168. "platform": "Android",
  169. "content-length": "0"
  170. }
  171. url = f"https://cp.hoopi.xyz/hoopiserver/hoopi/api/box/getBoxInfo/{item_id}"
  172. response = requests.post(url, headers=headers, timeout=10)
  173. # print(response.text)
  174. # time.sleep(11111)
  175. response.raise_for_status()
  176. data = response.json()
  177. if data['code'] == 200:
  178. result = data.get("result", {})
  179. parse_detail(log, result, sql_pool, item_id)
  180. else:
  181. log.error(f"获取详情页数据失败, item_id: {item_id}, msg:{data['message']}")
  182. sql_pool.update_one_or_dict(table="hoopi_box_record", data={"state": 3}, condition={"item_id": item_id})
  183. # ----------------------------------------------------------------------------------------------------------------------
  184. # def parse_carousel_list(log, result_json, sql_pool, item_id):
  185. # for item in result_json:
  186. # result_id = item.get("id")
  187. # boxGradeId = item.get("boxGradeId")
  188. # gradeType = item.get("gradeType")
  189. # title = item.get("name")
  190. # coverImg = item.get("coverImg")
  191. # priceText = item.get("priceText")
  192. # data_dict = {
  193. # "item_id": item_id,
  194. # "result_id": result_id,
  195. # "boxGradeId": boxGradeId,
  196. # "gradeType": gradeType,
  197. # "title": title,
  198. # "coverImg": coverImg,
  199. # "priceText": priceText
  200. # }
  201. # print('data_dict:', data_dict)
  202. #
  203. #
  204. # def get_carousel_list(log, item_id, sql_pool):
  205. # """
  206. # 获取轮播图列表
  207. # :param log:
  208. # :param item_id:
  209. # :param sql_pool:
  210. # :return:
  211. # """
  212. # headers = {
  213. # "User-Agent": "okhttp/4.10.0",
  214. # "Accept-Encoding": "gzip",
  215. # # "x-access-token": "eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJsb2dpblR5cGUiOiIxIiwiZXhwIjoxNzU1MTkyNDI2LCJ1c2VybmFtZSI6ImNoYXJsZXlfbGVvQDE2My5jb20ifQ.byBS1zj-LyD1mKHrCx9eLy5X2d0QzTO0FwApj2egSVI",
  216. # "country": "1",
  217. # "lang": "zh",
  218. # "platform": "Android",
  219. # "content-length": "0"
  220. # }
  221. # url = "https://cp.hoopi.xyz/hoopiserver/hoopi/api/box/listBoxGradePrize"
  222. # data = {
  223. # "boxId": "1953726337084772353"
  224. # }
  225. # response = requests.post(url, headers=headers, json=data)
  226. # print(response.text)
  227. # response.raise_for_status()
  228. #
  229. # if response.status_code == 200:
  230. # result = response.json()
  231. # if result["success"]:
  232. # result_json = result["result"]
  233. # parse_carousel_list(log, result_json, sql_pool, item_id)
  234. # else:
  235. # log.warning(f"result_message: {result['message']}")
  236. # else:
  237. # log.warning(f" {inspect.currentframe().f_code.co_name} Request failed with status code: {response.status_code}")
  238. @retry(stop=stop_after_attempt(100), wait=wait_fixed(3600), after=after_log)
  239. def hoopi_box_main(log):
  240. """
  241. 主函数
  242. :param log: logger对象
  243. """
  244. log.info(
  245. f'开始运行 {inspect.currentframe().f_code.co_name} 爬虫任务....................................................')
  246. # 配置 MySQL 连接池
  247. sql_pool = MySQLConnectionPool(log=log)
  248. if not sql_pool.check_pool_health():
  249. log.error("数据库连接池异常")
  250. raise RuntimeError("数据库连接池异常")
  251. try:
  252. try:
  253. # 获取已售出商品列表
  254. log.debug(f"开始获取已售出商品列表........................")
  255. brand_list = {
  256. "pokemon": "1934927046253977602",
  257. "mix": "1955185420300132353",
  258. "one piece": "1934927236365000706"
  259. }
  260. for brand_name, brand_id in brand_list.items():
  261. log.info(f"开始处理品牌: {brand_name}, ID: {brand_id}")
  262. get_box_list(log, sql_pool, brand_name, brand_id)
  263. log.success(f"获取已售出商品列表 Finished")
  264. # 获取商品详情
  265. log.debug(f"开始获取商品详情........................")
  266. sql_ietm_id_list = sql_pool.select_all(f"SELECT item_id FROM hoopi_box_record WHERE state != 1 AND country_name = '{country_name}'")
  267. # f"SELECT item_id FROM hoopi_box_record WHERE state != 1 AND category = '{category}'")
  268. sql_ietm_id_list = [item_id[0] for item_id in sql_ietm_id_list]
  269. for item_id in sql_ietm_id_list:
  270. try:
  271. get_detail(log, item_id, sql_pool)
  272. except Exception as e:
  273. log.error(f"Request get_detail error: {e}")
  274. log.success(f"获取商品详情 Finished")
  275. except Exception as e:
  276. log.error(f"Request get_shop_data_list error: {e}")
  277. except Exception as e:
  278. log.error(f'{inspect.currentframe().f_code.co_name} error: {e}')
  279. finally:
  280. log.info(f'爬虫程序 {inspect.currentframe().f_code.co_name} 运行结束,等待下一轮的采集任务............')
  281. # EmailSender().send(subject="【千岛 拍卖 - 爬虫通知】今日任务已完成",
  282. # content="数据采集和处理已全部完成,请查收结果。\n\n ------ 来自 Python 爬虫系统。")
  283. if __name__ == '__main__':
  284. # get_box_list(logger, None, None, None)
  285. hoopi_box_main(logger)