# in_mall_card_spider.py (10 KB)
  1. # -*- coding: utf-8 -*-
  2. # Author : Charley
  3. # Python : 3.10.8
  4. # Date : 2025/9/12 16:05
  5. import time
  6. import inspect
  7. import requests
  8. import schedule
  9. from loguru import logger
  10. from mysql_pool import MySQLConnectionPool
  11. from tenacity import retry, stop_after_attempt, wait_fixed
# Drop the handlers that are currently registered on the loguru logger, then
# log to a date-named file under ./logs instead. The sink is configured with
# rotation="00:00", retention="7 day" and a minimum level of DEBUG.
logger.remove()
logger.add("./logs/{time:YYYYMMDD}.log", encoding='utf-8', rotation="00:00",
           format="[{time:YYYY-MM-DD HH:mm:ss.SSS}] {level} {message}",
           level="DEBUG", retention="7 day")
  16. category = "卡牌"
  17. max_page = 50
  18. country_name = 'Indonesia'
  19. """
  20. 印度尼西亚
  21. """
  22. headers = {
  23. "User-Agent": "okhttp/4.10.0",
  24. "Accept-Encoding": "gzip",
  25. "Content-Type": "application/json",
  26. # "x-access-token": "eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJsb2dpblR5cGUiOiIxIiwiZXhwIjoxNzU3OTUwODI0LCJ1c2VybmFtZSI6InRpYW56aHUxMDA5QGdtYWlsLmNvbSJ9.PkSn4I2evvlF27OfrxGidT-IwuuTo9nNDukuHSHSs0w",
  27. "country": "1875086144853712897",
  28. "lang": "zh",
  29. "platform": "Android",
  30. "content-type": "application/json; charset=UTF-8"
  31. }
  32. def after_log(retry_state):
  33. """
  34. retry 回调
  35. :param retry_state: RetryCallState 对象
  36. """
  37. # 检查 args 是否存在且不为空
  38. if retry_state.args and len(retry_state.args) > 0:
  39. log = retry_state.args[0] # 获取传入的 logger
  40. else:
  41. log = logger # 使用全局 logger
  42. if retry_state.outcome.failed:
  43. log.warning(
  44. f"Function '{retry_state.fn.__name__}', Attempt {retry_state.attempt_number} Times")
  45. else:
  46. log.info(f"Function '{retry_state.fn.__name__}', Attempt {retry_state.attempt_number} succeeded")
  47. @retry(stop=stop_after_attempt(5), wait=wait_fixed(1), after=after_log)
  48. def get_single_page(log, page_no):
  49. log.debug(f"{inspect.currentframe().f_code.co_name} Start get single page, page:{page_no}")
  50. url = "https://cp.hoopi.xyz/hoopiserver/hoopi/api/goods/listGoods"
  51. data = {
  52. "sortType": "2",
  53. "containType": ["0", "1"],
  54. "pageNo": page_no,
  55. "pageSize": 15,
  56. "isSellOutShow": "1",
  57. "status": "2"
  58. }
  59. response = requests.post(url, headers=headers, json=data)
  60. # print(response.text)
  61. response.raise_for_status()
  62. if response.status_code == 200:
  63. result = response.json()
  64. if result["success"]:
  65. return result["result"]
  66. else:
  67. log.warning(f"result_message: {result['message']}")
  68. else:
  69. log.warning(f" {inspect.currentframe().f_code.co_name} Request failed with status code: {response.status_code}")
  70. return None
  71. def parse_list_items(log, items, sql_pool):
  72. log.info(f"{inspect.currentframe().f_code.co_name} Start parsing items")
  73. if items:
  74. info_list = []
  75. for item in items:
  76. item_id = item.get('id')
  77. data_dict = {
  78. "item_id": item_id,
  79. "category": category,
  80. "country_name": country_name
  81. }
  82. info_list.append(data_dict)
  83. if info_list:
  84. sql_pool.insert_many(table="hoopi_mall_record", data_list=info_list, ignore=True)
  85. else:
  86. log.warning(f" {inspect.currentframe().f_code.co_name} No items found")
  87. def get_mall_sold_list(log, sql_pool):
  88. page = 1
  89. total_items = 0
  90. # while True:
  91. while page <= max_page:
  92. result = get_single_page(log, page)
  93. if result is None:
  94. break
  95. items = result.get("list", [])
  96. if not items:
  97. log.debug("No items found on page %s", page)
  98. break
  99. try:
  100. parse_list_items(log, items, sql_pool)
  101. except Exception as e:
  102. log.error("Error parsing items on page %s: %s", page, e)
  103. total_items += len(items)
  104. pages = result.get("pages")
  105. total = result.get("total")
  106. # 判断条件 1: 根据 pages 判断
  107. if pages is not None and page >= pages:
  108. log.debug("已爬取 %s 页,共 %s 页" % (page, pages))
  109. break
  110. # 判断条件 2: 根据 list 的长度判断
  111. if len(items) < 15: # pageSize 为 15
  112. log.debug("已获取数据量小于15,停止爬取......................")
  113. break
  114. # 判断条件 3: 根据 total 和已获取数据量判断
  115. if total is not None and total_items >= total:
  116. log.debug("已获取数据量已满足要求,停止爬取......................")
  117. break
  118. page += 1
  119. # time.sleep(random.uniform(0.1, 0.5)) # 添加延时,避免频繁请求
  120. def parse_detail(log, item, sql_pool, item_id):
  121. log.debug("开始解析详情页数据........................")
  122. try:
  123. title = item.get('name')
  124. shopId = item.get('shopId')
  125. shopAppUserId = item.get('shopAppUserId')
  126. shopName = item.get('shopName')
  127. infoImgs = item.get('infoImgs') # 详情图片, 多图, 逗号分隔
  128. cardTypeName = item.get('cardTypeName') # 卡类型
  129. explainIntroduce = item.get('explainIntroduce') # 描述
  130. # 去除表情符号
  131. # if explainIntroduce:
  132. # explainIntroduce = emoji.replace_emoji(explainIntroduce, replace='')
  133. price = item.get('price')
  134. freightPrice = item.get('freightPrice') # 运费
  135. currency = item.get('currency') # 币种
  136. soldCount = item.get('soldCount') # 售出计数
  137. sellOffCount = item.get('sellOffCount') # 抛售计数
  138. status = item.get('status') # 2:售罄
  139. finishTime = item.get('finishTime')
  140. conditionTypeName = item.get('conditionTypeName') # 评级/状况
  141. countryName = item.get('countryName') # 国家
  142. if not countryName:
  143. countryName = country_name
  144. shopSoldCount = item.get('shopSoldCount') # 店铺已售
  145. data_dict = {
  146. 'title': title,
  147. "shop_id": shopId,
  148. 'shop_name': shopName,
  149. 'shop_app_user_id': shopAppUserId,
  150. 'info_imgs': infoImgs,
  151. 'card_type_name': cardTypeName,
  152. 'explain_introduce': explainIntroduce,
  153. 'price': price,
  154. 'freight_price': freightPrice,
  155. 'currency': currency,
  156. 'sold_count': soldCount,
  157. 'sell_off_count': sellOffCount,
  158. 'status': status,
  159. 'finish_time': finishTime,
  160. 'condition_type_name': conditionTypeName,
  161. 'country_name': countryName,
  162. 'shop_sold_count': shopSoldCount,
  163. 'state': 1
  164. }
  165. # print('data_dict:',data_dict)
  166. try:
  167. sql_pool.update_one_or_dict(table='hoopi_mall_record', data=data_dict, condition={'item_id': item_id})
  168. log.success(f"----------------------- 更新成功, item_id: {item_id} -----------------------")
  169. except Exception as e:
  170. log.error(f'解析详情页数据 update_one_or_dict 报错:{e}')
  171. sql_pool.update_one_or_dict(table="hoopi_mall_record", data={"state": 3}, condition={"item_id": item_id})
  172. except Exception as e:
  173. log.error(f'解析详情页数据error, {e}')
  174. sql_pool.update_one_or_dict(table="hoopi_mall_record", data={"state": 3}, condition={"item_id": item_id})
  175. @retry(stop=stop_after_attempt(5), wait=wait_fixed(1), after=after_log)
  176. def get_detail(log, item_id, sql_pool):
  177. log.debug(f"开始获取详情页数据, item_id: {item_id}........................")
  178. # url = "https://cp.hoopi.xyz/hoopiserver/hoopi/api/goods/getGoodsInfo/1954822331293704194"
  179. url = f"https://cp.hoopi.xyz/hoopiserver/hoopi/api/goods/getGoodsInfo/{item_id}"
  180. response = requests.post(url, headers=headers, timeout=10)
  181. # print(response.text)
  182. # time.sleep(11111)
  183. response.raise_for_status()
  184. data = response.json()
  185. if data['code'] == 200:
  186. result = data.get("result", {})
  187. parse_detail(log, result, sql_pool, item_id)
  188. else:
  189. log.error(f"获取详情页数据失败, item_id: {item_id}, msg:{data['message']}")
  190. sql_pool.update_one_or_dict(table="hoopi_mall_record", data={"state": 3}, condition={"item_id": item_id})
  191. @retry(stop=stop_after_attempt(100), wait=wait_fixed(3600), after=after_log)
  192. def in_mall_card_main(log):
  193. """
  194. 主函数
  195. :param log: logger对象
  196. """
  197. log.info(
  198. f'开始运行 {inspect.currentframe().f_code.co_name} 爬虫任务....................................................')
  199. # 配置 MySQL 连接池
  200. sql_pool = MySQLConnectionPool(log=log)
  201. if not sql_pool.check_pool_health():
  202. log.error("数据库连接池异常")
  203. raise RuntimeError("数据库连接池异常")
  204. try:
  205. try:
  206. # 获取已售出商品列表
  207. get_mall_sold_list(log, sql_pool)
  208. # 获取商品详情
  209. sql_ietm_id_list = sql_pool.select_all(f"SELECT item_id FROM hoopi_mall_record WHERE state != 1 AND category = '{category}' AND country_name = '{country_name}'")
  210. sql_ietm_id_list = [item_id[0] for item_id in sql_ietm_id_list]
  211. for item_id in sql_ietm_id_list:
  212. try:
  213. get_detail(log, item_id, sql_pool)
  214. except Exception as e:
  215. log.error(f"Request get_detail error: {e}")
  216. except Exception as e:
  217. log.error(f"Request get_shop_data_list error: {e}")
  218. except Exception as e:
  219. log.error(f'{inspect.currentframe().f_code.co_name} error: {e}')
  220. finally:
  221. log.info(f'爬虫程序 {inspect.currentframe().f_code.co_name} 运行结束,等待下一轮的采集任务............')
  222. # EmailSender().send(subject="【千岛 拍卖 - 爬虫通知】今日任务已完成",
  223. # content="数据采集和处理已全部完成,请查收结果。\n\n ------ 来自 Python 爬虫系统。")
  224. # def schedule_task():
  225. # """
  226. # 爬虫模块 定时任务 的启动文件
  227. # """
  228. # # 立即运行一次任务
  229. # # in_mall_card_main(log=logger)
  230. #
  231. # # 设置定时任务
  232. # schedule.every().day.at("01:06").do(in_mall_card_main, log=logger)
  233. #
  234. # while True:
  235. # schedule.run_pending()
  236. # time.sleep(1)
if __name__ == '__main__':
    # Manual debugging entry points, kept for reference:
    # get_mall_sold_list(logger, None)
    # sql_pool = MySQLConnectionPool(log=logger)
    # get_detail(logger, "1954757558547980290", sql_pool)

    # Run one full crawl with the module-level loguru logger.
    in_mall_card_main(logger)