jhs_app_spider.py 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502
  1. # -*- coding: utf-8 -*-
  2. # Author : Charley
  3. # Python : 3.10.8
  4. # Date : 2025/3/17 17:22
  5. import inspect
  6. import schedule
  7. from airtest.core.api import *
  8. from poco.drivers.android.uiautomation import AndroidUiautomationPoco
  9. from loguru import logger
  10. from mysql_pool import MySQLConnectionPool
  11. from tenacity import retry, stop_after_attempt, wait_fixed
  12. # from poco.exceptions import PocoNoSuchNodeException
  13. logger.remove()
  14. logger.add("./logs/{time:YYYYMMDD}.log", encoding='utf-8', rotation="00:00",
  15. format="[{time:YYYY-MM-DD HH:mm:ss.SSS}] {level} {message}",
  16. level="DEBUG", retention="7 day")
  17. PACKAGE_NAME = 'com.jihuanshe'
  18. init_device("Android")
  19. # 在Airtest中, init_device 和 connect_device 是两个用于初始化设备的函数。
  20. #
  21. # init_device 函数用于初始化设备连接,它会自动检测并连接可用的设备。你可以通过传递设备的参数来指定要连接的设备,例如设备的序列号、设备类型等。
  22. #
  23. # connect_device 函数用于手动连接设备。你需要提供设备的参数,例如设备的序列号、设备类型等,来指定要连接的设备。
  24. #
  25. # 总结起来, init_device 函数会自动检测并连接可用的设备,而 connect_device 函数需要手动指定要连接的设备。
  26. stop_app(PACKAGE_NAME)
  27. # time.sleep(11)
  28. start_app(PACKAGE_NAME)
  29. """
  30. # 如果ADB无法找到设备,尝试重启ADB服务
  31. D:\投屏\QtScrcpy-win-x64-v3.3.1\adb.exe kill-server
  32. """
  33. category_list = [
  34. {'游戏王': ['日文', '简中']},
  35. {'宝可梦': ['简中', '日文', '英文']},
  36. {'航海王': ['简中', '日文']},
  37. {'符文战场': ['简中']},
  38. {'WS黑白双翼': ['日文', '简中']},
  39. {'数码宝贝': ['简中']},
  40. {'迪士尼 洛卡纳': ['简中']},
  41. {'携站之境UA': ['简中', '日文']},
  42. {'VG卡片战斗先导者': ['简中', '日文']},
  43. {'高达GCG': ['简中']},
  44. {'影之诗': ['简中']}
  45. ]
# Poco driver for the Android device initialised above. Input is routed
# through Airtest (use_airtest_input=True) and the per-action screenshot
# option is switched off.
poco = AndroidUiautomationPoco(
    use_airtest_input=True, screenshot_each_action=False)
# Screen size as reported by Poco. Only commented-out code in this file refers
# to these two names.
window_width, window_height = poco.get_screen_size()
  49. def scrape_index():
  50. # 等待进去首页
  51. time.sleep(10)
  52. # 判断是否有广告 需要点击
  53. tag_adv = poco(f'{PACKAGE_NAME}:id/ivClose')
  54. for _ in range(3):
  55. if tag_adv.exists():
  56. logger.warning("广告元素已找到")
  57. tag_adv.click()
  58. time.sleep(1)
  59. else:
  60. logger.debug("广告元素未找到, 退出")
  61. break
  62. elements = poco(f'{PACKAGE_NAME}:id/imageIv').wait(60)
  63. # elements.wait_for_appearance(timeout=60)
  64. elements.click()
  65. def after_log(retry_state):
  66. """
  67. retry 回调
  68. :param retry_state: RetryCallState 对象
  69. """
  70. # 检查 args 是否存在且不为空
  71. if retry_state.args and len(retry_state.args) > 0:
  72. log = retry_state.args[0] # 获取传入的 logger
  73. else:
  74. log = logger # 使用全局 logger
  75. if retry_state.outcome.failed:
  76. log.warning(
  77. f"Function '{retry_state.fn.__name__}', Attempt {retry_state.attempt_number} Times")
  78. else:
  79. log.info(f"Function '{retry_state.fn.__name__}', Attempt {retry_state.attempt_number} succeeded")
  80. def scrape_detail(element):
  81. try:
  82. # element.click()
  83. # panel = poco_(f'{PACKAGE_NAME}:id/content')
  84. # panel.wait_for_appearance(5)
  85. element_title = element.offspring(f'{PACKAGE_NAME}:id/tvName') # 标题
  86. if element_title.exists():
  87. title = element_title.attr('text')
  88. else:
  89. logger.warning("title元素未找到")
  90. return None
  91. # element_cardImage = element.offspring(f'{PACKAGE_NAME}:id/cardImage')
  92. # if element_cardImage.exists():
  93. # cardImage = element_cardImage.attr('text')
  94. # else:
  95. # logger.warning("cardImage元素未找到")
  96. # return None
  97. element_tvSalesDataNum = element.offspring(f'{PACKAGE_NAME}:id/tvSalesDataNum') # 销量
  98. if element_tvSalesDataNum.exists():
  99. tvSalesDataNum = element_tvSalesDataNum.attr('text')
  100. else:
  101. logger.warning("tvSalesDataNum元素未找到")
  102. ele_tvSalesData = element.offspring(f'{PACKAGE_NAME}:id/tvSalesData')
  103. if ele_tvSalesData.exists():
  104. tvSalesDataNum = ele_tvSalesData.attr('text')
  105. if '张' in tvSalesDataNum:
  106. tvSalesDataNum = tvSalesDataNum.replace('张', 0)
  107. if '销量' in tvSalesDataNum:
  108. tvSalesDataNum = tvSalesDataNum.replace('销量', 0)
  109. logger.debug(f'tvSalesDataNum元素未找到,使用tvSalesData元素')
  110. else:
  111. tvSalesDataNum = None
  112. # return None
  113. element_tvNumber = element.offspring(f'{PACKAGE_NAME}:id/tvNumber') # 编号
  114. if element_tvNumber.exists():
  115. tvNumber = element_tvNumber.attr('text')
  116. else:
  117. logger.warning("tvNumber元素未找到")
  118. return None
  119. element_tvRarity = element.offspring(f'{PACKAGE_NAME}:id/tvRarity') # 稀有度
  120. if element_tvRarity.exists():
  121. tvRarity = element_tvRarity.attr('text')
  122. else:
  123. logger.warning("tvRarity元素未找到")
  124. return None
  125. element_tvCurSalesNum = element.offspring(f'{PACKAGE_NAME}:id/tvCurSalesNum') # 上周期销量
  126. if element_tvCurSalesNum.exists():
  127. tvCurSalesNum = element_tvCurSalesNum.attr('text')
  128. if '张' in tvCurSalesNum:
  129. tvCurSalesNum = tvCurSalesNum.replace('张', '')
  130. else:
  131. logger.warning("tvCurSalesNum元素未找到")
  132. return None
  133. element_tvLastSalesNum = element.offspring(f'{PACKAGE_NAME}:id/tvLastSalesNum') # 本周期销量涨跌幅
  134. if element_tvLastSalesNum.exists():
  135. tvLastSalesNum = element_tvLastSalesNum.attr('text')
  136. else:
  137. logger.warning("tvLastSalesNum元素未找到")
  138. return None
  139. element_tvPrice = element.offspring(f'{PACKAGE_NAME}:id/tvPrice') # 本周期平均集换价
  140. if element_tvPrice.exists():
  141. tvPrice = element_tvPrice.attr('text')
  142. else:
  143. logger.warning("tvPrice元素未找到")
  144. return None
  145. element_tvRank = element.offspring(f'{PACKAGE_NAME}:id/tvRank')
  146. if element_tvRank.exists():
  147. tvRank = element_tvRank.attr('text')
  148. else:
  149. logger.warning("tvRank元素未找到, 查找rankIv元素(前三名)")
  150. element_rankIv = element.offspring(f'{PACKAGE_NAME}:id/rankIv')
  151. if element_rankIv.exists():
  152. tvRank = element_rankIv.attr('text')
  153. logger.debug(f'tvRank元素未找到,使用rankIv元素(前三名)')
  154. logger.debug(tvRank)
  155. else:
  156. tvRank = None
  157. # keyevent('BACK')
  158. data_dict = {
  159. 'title': title,
  160. # 'card_image': cardImage,
  161. 'tv_sales_data_num': tvSalesDataNum,
  162. 'tv_number': tvNumber,
  163. 'tv_rarity': tvRarity,
  164. 'tv_cur_sales_num': tvCurSalesNum,
  165. 'tv_last_sales_num': tvLastSalesNum,
  166. 'tv_price': tvPrice,
  167. 'tv_rank': tvRank
  168. }
  169. # logger.debug(f'data_dict:{data_dict}')
  170. return data_dict
  171. except Exception as e:
  172. logger.error(f"抓取详情时发生错误: {e}")
  173. return None
  174. def parse_sales_data(raw_text):
  175. """
  176. 解析销售数据文本
  177. Args:
  178. raw_text (str): 原始文本数据
  179. Returns:
  180. dict: 解析后的数据字典
  181. """
  182. raw_list = raw_text.split("\n")
  183. # 确保第一个元素是数字,如果不是则插入空字符串
  184. if not raw_list[0].isdigit():
  185. raw_list.insert(0, '')
  186. # 检查列表长度是否足够
  187. if len(raw_list) < 11:
  188. raise ValueError("输入数据不完整")
  189. tv_rank = raw_list[0]
  190. title = raw_list[1]
  191. tv_number = raw_list[2]
  192. tv_rarity = raw_list[3]
  193. tv_cur_sales_num = raw_list[5]
  194. tv_cur_sales_num = tv_cur_sales_num.replace('张', '') if '张' in tv_cur_sales_num else tv_cur_sales_num
  195. # tv_last_sales_num = raw_list[7]
  196. # 增加判断:如果第7行和第8行连续是"本周期销量涨跌幅"和"本周期平均集换价",
  197. # 则在它们之间插入'-',并将tv_last_sales_num设为'-'
  198. if len(raw_list) >= 8 and raw_list[6] == "本周期销量涨跌幅" and raw_list[7] == "本周期平均集换价":
  199. # 在"本周期销量涨跌幅"后插入'-'
  200. raw_list.insert(7, '-')
  201. tv_last_sales_num = '-'
  202. else:
  203. try:
  204. tv_last_sales_num = raw_list[7]
  205. except IndexError:
  206. tv_last_sales_num = ''
  207. tv_price = raw_list[9]
  208. tv_price = tv_price.replace('¥', '') if '¥' in tv_price else tv_price
  209. tv_sales_data_num = raw_list[10]
  210. data_dict = {
  211. 'tv_rank': tv_rank,
  212. 'title': title,
  213. 'tv_number': tv_number,
  214. 'tv_rarity': tv_rarity,
  215. 'tv_cur_sales_num': tv_cur_sales_num,
  216. 'tv_last_sales_num': tv_last_sales_num,
  217. 'tv_price': tv_price,
  218. 'tv_sales_data_num': tv_sales_data_num
  219. }
  220. return data_dict
  221. def scroll_up():
  222. # swipe((window_width * 0.5, window_height * 0.8),
  223. # vector=[0, -0.5], duration=1)
  224. swipe(
  225. v1=(0.5, 0.9), # 起始点(底部)
  226. v2=(0.5, 0.7), # 结束点(顶部)
  227. duration=1.0 # 滑动持续 1 秒(单位:秒)
  228. )
  229. def is_page_changed(old_elements, new_elements):
  230. """检查页面是否成功翻页"""
  231. if not old_elements or not new_elements:
  232. return True
  233. # 比较多个元素来判断页面是否变化
  234. # min_len = min(len(old_elements), len(new_elements), 5) # 比较前5个元素
  235. # for i in range(min_len):
  236. old_title_element = old_elements[-1].offspring(f'{PACKAGE_NAME}:id/tvName')
  237. new_title_element = new_elements[-1].offspring(f'{PACKAGE_NAME}:id/tvName')
  238. if old_title_element.exists() and new_title_element.exists():
  239. old_title = old_title_element.attr('text')
  240. new_title = new_title_element.attr('text')
  241. logger.debug(f'old_title: {old_title}, new_title: {new_title}')
  242. if old_title != new_title:
  243. return True # 找到不同元素,页面已变化
  244. else:
  245. logger.debug(f'old_title: {old_title}, new_title: {new_title}')
  246. return False
  247. else:
  248. logger.warning("title元素未找到")
  249. return True
def get_data(sql_pool):
    """
    Crawl the leaderboard for every category/language pair in
    ``category_list`` and store the rows through ``sql_pool``.

    For each pair the function opens the category drop-down, picks the
    category and language, taps the best-seller tab, then scrolls the list
    page by page, parsing each row's text with ``parse_sales_data`` until the
    row ranked '100' is seen or 30 pages have been scrolled. The collected rows
    are de-duplicated and inserted into table ``jhs_bestseller_record``.

    NOTE(review): the nesting below was reconstructed from a paste that had
    lost its indentation - confirm that the language selection, scraping loop
    and insert belong inside the ``else`` branch of the drop-down check.

    :param sql_pool: MySQL connection pool object; only ``insert_many`` is called on it here
    """
    scrape_index()
    for ca_idx, category_value in enumerate(category_list):
        # Each entry is a single-key dict: key = category, value = languages.
        category = list(category_value.keys())[0]
        lang_list = list(category_value.values())[0]
        logger.debug(
            f'当前分类索引: {ca_idx}, 分类名称: {category}, 语言列表: {lang_list} >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>')
        # lang_idx is only referenced by commented-out code below.
        for lang_idx, language in enumerate(lang_list):
            # Click the drop-down button (second Button under the content view).
            # tag_drop_down = poco_(f'{PACKAGE_NAME}:id/tvName')
            tag_drop_down = \
                poco("android:id/content").child("android.widget.FrameLayout").child(
                    "android.widget.FrameLayout").child(
                    "android.view.View").child("android.view.View").child("android.view.View").child(
                    "android.widget.Button")[1]
            tag_drop_down.click()
            # Locate the ScrollView that holds the leaderboard category buttons.
            tag_bang_list = poco("android:id/content").child("android.widget.FrameLayout").child(
                "android.widget.FrameLayout").child("android.view.View").child("android.view.View").child(
                "android.view.View").child("android.widget.ScrollView")
            # tag_bang_list.wait_for_appearance(timeout=60)
            if not tag_bang_list.exists():
                logger.warning("tag_bang_list元素未找到")
            else:
                # Resume after a disconnect (disabled):
                # if tag_idx > 7:
                # logger.warning("已断连重跑")
                # For the later categories the drop-down has to be scrolled
                # before the button can be clicked.
                # NOTE(review): the original comment says "ca_idx > 7" but the
                # code tests "> 8"; after the swipe the button index used is
                # ca_idx - 3 - presumably the scroll shifts the visible buttons
                # by three. Confirm on a device.
                if ca_idx > 8:
                    logger.debug('滑动下拉菜单')
                    # tag_drop_down.click()
                    poco("android:id/content").child("android.widget.FrameLayout").child(
                        "android.widget.FrameLayout").child("android.view.View").child("android.view.View").child(
                        "android.view.View").child("android.widget.ScrollView").child("android.widget.Button")[
                        7].swipe(
                        [0.0035, -0.3152])
                    time.sleep(2)
                    poco("android:id/content").child("android.widget.FrameLayout").child(
                        "android.widget.FrameLayout").child("android.view.View").child("android.view.View").child(
                        "android.view.View").child("android.widget.ScrollView").child("android.widget.Button")[
                        ca_idx - 3].click()
                    logger.info(f'点击 集换榜分类 -> {category}')
                else:
                    # The category button sits at the same index as the
                    # category's position in category_list.
                    poco("android:id/content").child("android.widget.FrameLayout").child(
                        "android.widget.FrameLayout").child("android.view.View").child("android.view.View").child(
                        "android.view.View").child("android.widget.ScrollView").child("android.widget.Button")[
                        ca_idx].click()
                    logger.info(f'点击 集换榜分类 -> {category}')
                # tag_bang.wait_for_appearance(timeout=60)
                # time.sleep(2)
                # Select the language tab by its name.
                logger.info(f'当前语言名称: {language} >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>')
                # tag_lang_list = poco_("android:id/content").child("android.widget.FrameLayout").child(
                # "android.widget.FrameLayout").child("android.view.View").child("android.view.View").child(
                # "android.view.View").child("android.view.View")[1].child("android.widget.Button")[lang_idx]
                tag_lang_list = poco(language)
                tag_lang_list.click()
                # time.sleep(2)
                # Tap the best-seller tab, located by image template.
                touch(Template(r"tpl1761026966680.png", record_pos=(0.048, -0.781), resolution=(1080, 2340)))
                # try:
                # # Locate the scroll container
                # scroll_view = poco_("android:id/content").child("android.widget.FrameLayout").child("android.widget.FrameLayout").child("android.view.View").child("android.view.View").child("android.view.View").child("android.view.View")[0]
                # # print(scroll_view.attr("text"))
                #
                # # Option 1: match by text first (most direct)
                # target = scroll_view.offspring(descContains="畅销榜")
                # if target.exists():
                # target.click()
                # else:
                # print("未找到“畅销榜”标签")
                #
                # except PocoNoSuchNodeException as e:
                # print("找不到指定的 UI 路径:", e)
                time.sleep(2)
                # Row list -> paging state for this category/language pair.
                stop_page = False
                # previous_elements = None
                page_count = 1
                max_pages = 30
                data_list = []
                while page_count <= max_pages and not stop_page:
                    # NOTE: cannot be true here - the while condition already
                    # guarantees page_count <= max_pages.
                    if page_count > max_pages:
                        logger.warning("已达到最大翻页次数,停止翻页")
                        break
                    # Scroll: once before the first page, four times afterwards.
                    if page_count == 1:
                        logger.debug("第一页开始翻页")
                        scroll_up()
                        # scroll_up()
                    else:
                        logger.debug(f"{page_count}翻页.........")
                        scroll_up()
                        scroll_up()
                        scroll_up()
                        scroll_up()
                    tag_list_view = poco('android.widget.ScrollView')
                    # tag_list_view.wait_for_appearance(timeout=60)
                    elements = tag_list_view.offspring('android.widget.Button')
                    # current_elements = list(elements)[1:]
                    for element in elements:
                        element_title = element.child()
                        if not element_title.exists():
                            continue
                        # element_data = scrape_detail(element)
                        # The row's whole text is read from the child's 'name'
                        # attribute and parsed as newline-separated fields.
                        tag_element_data = element_title.attr('name')
                        # print(f'element_data:{tag_element_data}')
                        try:
                            # Entries containing the "add to wishlist" wording
                            # are skipped.
                            if '加入心愿单' not in tag_element_data:
                                element_data = parse_sales_data(tag_element_data)
                                logger.info(element_data)
                                if element_data:
                                    # The rank is used only as the stop signal
                                    # below; it is left out of the stored row.
                                    filtered_data = {k: v for k, v in element_data.items() if k != 'tv_rank'}
                                    filtered_data.update({
                                        'category': category,
                                        'language': language,
                                        'crawler_date': time.strftime("%Y-%m-%d", time.localtime())
                                    })
                                    logger.debug(f'scraped data: {filtered_data}')
                                    data_list.append(filtered_data)
                                    # Stop paging once the row ranked 100 is seen.
                                    if element_data.get('tv_rank') == '100':
                                        logger.success(
                                            f'已获取第 {page_count} 页数据, 第 {element_data["tv_rank"]} 名, 停止翻页....')
                                        stop_page = True
                                        break
                                else:
                                    logger.warning("未获取到数据")
                            else:
                                logger.debug('<加入心愿单> 字样在tag_element_data, 跳过')
                        except ValueError as e:
                            # parse_sales_data raises ValueError for incomplete rows.
                            logger.error(f"数据解析错误: {e}")
                    if stop_page:
                        logger.debug(f'已获取 {len(data_list)} 条数据, 停止翻页....')
                        break
                    page_count += 1
                    time.sleep(1)
                logger.success(
                    '---------------------------------------------------------------------------------------------')
                # Save the data.
                # Order-preserving de-duplication (replaces an earlier
                # list(set(...)) approach): successive scrolls overlap, so the
                # same row can be collected more than once.
                seen = set()
                unique_data_list = []
                for item in data_list:
                    frozen_item = frozenset(item.items())
                    if frozen_item not in seen:
                        seen.add(frozen_item)
                        unique_data_list.append(item)
                data_list = unique_data_list
                sql_pool.insert_many(table='jhs_bestseller_record', data_list=data_list, ignore=True)
  404. @retry(stop=stop_after_attempt(100), wait=wait_fixed(3600), after=after_log)
  405. def jhs_app_bestseller_main(log):
  406. """
  407. 主函数
  408. :param log: logger对象
  409. """
  410. log.info(
  411. f'开始运行 {inspect.currentframe().f_code.co_name} 爬虫任务....................................................')
  412. # 配置 MySQL 连接池
  413. sql_pool = MySQLConnectionPool(log=log)
  414. if not sql_pool:
  415. log.error("MySQL数据库连接失败")
  416. raise Exception("MySQL数据库连接失败")
  417. try:
  418. get_data(sql_pool)
  419. # stop_app(PACKAGE_NAME)
  420. except Exception as e:
  421. log.error(f'{inspect.currentframe().f_code.co_name} error: {e}')
  422. finally:
  423. log.info(f'爬虫程序 {inspect.currentframe().f_code.co_name} 运行结束,等待下一轮的采集任务............')
# Script entry point: run one crawl immediately with the module-level logger.
if __name__ == '__main__':
    # The scheduled entry point is disabled.
    # schedule_task()
    jhs_app_bestseller_main(logger)