# hoopi_mall_toys_spider.py
  1. # -*- coding: utf-8 -*-
  2. # Author : Charley
  3. # Python : 3.10.8
  4. # Date : 2025/8/12 16:59
  5. import random
  6. import time
  7. import inspect
  8. import requests
  9. import schedule
  10. from loguru import logger
  11. from mysql_pool import MySQLConnectionPool
  12. from tenacity import retry, stop_after_attempt, wait_fixed
  13. # logger.remove()
  14. # logger.add("./logs/{time:YYYYMMDD}.log", encoding='utf-8', rotation="00:00",
  15. # format="[{time:YYYY-MM-DD HH:mm:ss.SSS}] {level} {message}",
  16. # level="DEBUG", retention="7 day")
# Crawl configuration
category = "潮玩"  # product category label ("trendy toys"); not referenced in this chunk — TODO confirm use
max_page = 50  # hard upper bound on listing pages fetched per run
  19. def after_log(retry_state):
  20. """
  21. retry 回调
  22. :param retry_state: RetryCallState 对象
  23. """
  24. # 检查 args 是否存在且不为空
  25. if retry_state.args and len(retry_state.args) > 0:
  26. log = retry_state.args[0] # 获取传入的 logger
  27. else:
  28. log = logger # 使用全局 logger
  29. if retry_state.outcome.failed:
  30. log.warning(
  31. f"Function '{retry_state.fn.__name__}', Attempt {retry_state.attempt_number} Times")
  32. else:
  33. log.info(f"Function '{retry_state.fn.__name__}', Attempt {retry_state.attempt_number} succeeded")
  34. def get_single_page(log, page_no):
  35. headers = {
  36. "User-Agent": "okhttp/4.10.0",
  37. "Accept-Encoding": "gzip",
  38. "Content-Type": "application/json",
  39. # "x-access-token": "eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJsb2dpblR5cGUiOiIxIiwiZXhwIjoxNzU1MTkyNDI2LCJ1c2VybmFtZSI6ImNoYXJsZXlfbGVvQDE2My5jb20ifQ.byBS1zj-LyD1mKHrCx9eLy5X2d0QzTO0FwApj2egSVI",
  40. "country": "1",
  41. "lang": "zh",
  42. "platform": "Android",
  43. "content-type": "application/json; charset=UTF-8"
  44. }
  45. url = "https://cp.hoopi.xyz/hoopiserver/hoopi/api/toy/listToys"
  46. data = {
  47. "sortType": "2",
  48. "containType": [
  49. "0"
  50. ],
  51. "pageNo": page_no,
  52. "pageSize": 10
  53. }
  54. response = requests.post(url, headers=headers, json=data)
  55. print(response.text)
  56. response.raise_for_status()
  57. if response.status_code == 200:
  58. result = response.json()
  59. if result["success"]:
  60. return result["result"]
  61. else:
  62. log.warning(f"result_message: {result['message']}")
  63. else:
  64. log.warning(f" {inspect.currentframe().f_code.co_name} Request failed with status code: {response.status_code}")
  65. return None
  66. def parse_list_items(log, items, sql_pool):
  67. log.info(f"{inspect.currentframe().f_code.co_name} Start parsing items")
  68. if items:
  69. info_list = []
  70. for item in items:
  71. item_id = item.get('id')
  72. info_list.append(item_id)
  73. if info_list:
  74. sql_pool.insert_many(table="hoopi_mall_record", data_list=info_list, ignore=True)
  75. else:
  76. log.warning(f" {inspect.currentframe().f_code.co_name} No items found")
  77. def get_mall_sold_list(log, sql_pool):
  78. page = 1
  79. total_items = 0
  80. # while True:
  81. while page <= max_page:
  82. result = get_single_page(log, page)
  83. if result is None:
  84. break
  85. items = result.get("list", [])
  86. if not items:
  87. log.warning("No items found on page %s", page)
  88. break
  89. try:
  90. parse_list_items(log, items, sql_pool)
  91. except Exception as e:
  92. log.error("Error parsing items on page %s: %s", page, e)
  93. total_items += len(items)
  94. pages = result.get("pages")
  95. total = result.get("total")
  96. # 判断条件 1: 根据 pages 判断
  97. if pages is not None and page >= pages:
  98. log.debug("已爬取 %s 页,共 %s 页" % (page, pages))
  99. break
  100. # 判断条件 2: 根据 list 的长度判断
  101. if len(items) < 10: # pageSize 为 10
  102. log.debug("已获取数据量小于15,停止爬取......................")
  103. break
  104. # 判断条件 3: 根据 total 和已获取数据量判断
  105. if total is not None and total_items >= total:
  106. log.debug("已获取数据量已满足要求,停止爬取......................")
  107. break
  108. page += 1
  109. # time.sleep(random.uniform(0.1, 0.5)) # 添加延时,避免频繁请求
  110. def parse_detail(log, item, sql_pool, item_id):
  111. log.debug("开始解析详情页数据........................")
  112. try:
  113. title = item.get('name')
  114. shopId = item.get('shopId')
  115. shopAppUserId = item.get('shopAppUserId')
  116. shopName = item.get('shopName')
  117. infoImgs = item.get('infoImgs') # 详情图片, 多图, 逗号分隔
  118. cardTypeName = item.get('cardTypeName') # 卡类型
  119. explainIntroduce = item.get('explainIntroduce') # 描述
  120. price = item.get('price')
  121. freightPrice = item.get('freightPrice') # 运费
  122. currency = item.get('currency') # 币种
  123. soldCount = item.get('soldCount') # 售出计数
  124. sellOffCount = item.get('sellOffCount') # 抛售计数
  125. status = item.get('status') # 2:售罄
  126. finishTime = item.get('finishTime')
  127. conditionTypeName = item.get('conditionTypeName') # 评级/状况
  128. countryName = item.get('countryName') # 国家
  129. shopSoldCount = item.get('shopSoldCount') # 店铺已售
  130. data_dict = {
  131. 'title': title,
  132. "shop_id": shopId,
  133. 'shop_name': shopName,
  134. 'shop_app_user_id': shopAppUserId,
  135. 'info_imgs': infoImgs,
  136. 'card_type_name': cardTypeName,
  137. 'explain_introduce': explainIntroduce,
  138. 'price': price,
  139. 'freight_price': freightPrice,
  140. 'currency': currency,
  141. 'sold_count': soldCount,
  142. 'sell_off_count': sellOffCount,
  143. 'status': status,
  144. 'finish_time': finishTime,
  145. 'condition_type_name': conditionTypeName,
  146. 'country_name': countryName,
  147. 'shop_sold_count': shopSoldCount
  148. }
  149. print(data_dict)
  150. # try:
  151. # sql_pool.update_one_or_dict(table='hoopi_mall_record', data=data_dict, condition={'item_id': item_id})
  152. # except Exception as e:
  153. # log.error(f'解析详情页数据 update_one_or_dict 报错:{e[:500]}')
  154. except Exception as e:
  155. log.error(f'解析详情页数据error, {e[:500]}')
  156. def get_detail(log, item_id, sql_pool):
  157. headers = {
  158. "User-Agent": "okhttp/4.10.0",
  159. "Accept-Encoding": "gzip",
  160. # "x-access-token": "eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJsb2dpblR5cGUiOiIxIiwiZXhwIjoxNzU1MTkyNDI2LCJ1c2VybmFtZSI6ImNoYXJsZXlfbGVvQDE2My5jb20ifQ.byBS1zj-LyD1mKHrCx9eLy5X2d0QzTO0FwApj2egSVI",
  161. "country": "1",
  162. "lang": "zh",
  163. "platform": "Android",
  164. "content-length": "0"
  165. }
  166. url = "https://cp.hoopi.xyz/hoopiserver/hoopi/api/goods/getGoodsInfo/1954822331293704194"
  167. # url = f"https://cp.hoopi.xyz/hoopiserver/hoopi/api/goods/getGoodsInfo/{item_id}"
  168. response = requests.post(url, headers=headers)
  169. print(response.text)
  170. response.raise_for_status()
  171. data = response.json()
  172. result = data.get("result", {})
  173. parse_detail(log, result, sql_pool, item_id)
  174. @retry(stop=stop_after_attempt(100), wait=wait_fixed(3600), after=after_log)
  175. def hoopi_mall_main(log):
  176. """
  177. 主函数
  178. :param log: logger对象
  179. """
  180. log.info(
  181. f'开始运行 {inspect.currentframe().f_code.co_name} 爬虫任务....................................................')
  182. # 配置 MySQL 连接池
  183. sql_pool = MySQLConnectionPool(log=log)
  184. if not sql_pool.check_pool_health():
  185. log.error("数据库连接池异常")
  186. raise RuntimeError("数据库连接池异常")
  187. try:
  188. try:
  189. pass
  190. except Exception as e:
  191. log.error(f"Request get_shop_data_list error: {e}")
  192. except Exception as e:
  193. log.error(f'{inspect.currentframe().f_code.co_name} error: {e}')
  194. finally:
  195. log.info(f'爬虫程序 {inspect.currentframe().f_code.co_name} 运行结束,等待下一轮的采集任务............')
  196. # EmailSender().send(subject="【千岛 拍卖 - 爬虫通知】今日任务已完成",
  197. # content="数据采集和处理已全部完成,请查收结果。\n\n ------ 来自 Python 爬虫系统。")
def schedule_task():
    """
    Entry point for the scheduled crawl: register the daily job and run
    the dispatcher loop forever (this function never returns).
    """
    # Run one crawl immediately on startup (currently disabled):
    # hoopi_mall_main(log=logger)
    # Register the crawl to run every day at 01:06
    schedule.every().day.at("01:06").do(hoopi_mall_main, log=logger)
    while True:
        schedule.run_pending()
        time.sleep(1)  # poll the scheduler once per second
if __name__ == '__main__':
    # Ad-hoc manual run: fetch a single detail page with no DB pool.
    # NOTE(review): sql_pool=None only works while the DB write in
    # parse_detail stays commented out — confirm before re-enabling it.
    # get_mall_sold_list(logger, None)
    get_detail(logger, "1954822331293704194", None)