# -*- coding: utf-8 -*-
# Author : Charley
# Python : 3.10.8
# Date : 2025/9/8 11:56
import requests
from parsel import Selector
import ninja_settings
import inspect
import time
import schedule
from loguru import logger
from tenacity import retry, stop_after_attempt, wait_fixed
from mysql_pool import MySQLConnectionPool

"""
XPath for the year list on the second-level page:
/html/body/div[3]/span[position() >= 4 and position() <= 18]
"""
# logger.remove()
# logger.add("./logs/{time:YYYYMMDD}.log", encoding='utf-8', rotation="00:00",
#            format="[{time:YYYY-MM-DD HH:mm:ss.SSS}] {level} {message}",
#            level="DEBUG", retention="7 days")


def after_log(retry_state):
    """
    Retry callback.
    :param retry_state: a tenacity RetryCallState object
    """
    # Check whether positional args exist and are non-empty
    if retry_state.args and len(retry_state.args) > 0:
        log = retry_state.args[0]  # the logger passed to the decorated function
    else:
        log = logger  # fall back to the global logger
    if retry_state.outcome.failed:
        log.warning(
            f"Function '{retry_state.fn.__name__}', attempt {retry_state.attempt_number} failed")
    else:
        log.info(f"Function '{retry_state.fn.__name__}', attempt {retry_state.attempt_number} succeeded")
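
# Note (illustrative): tenacity passes the ``after`` hook a RetryCallState whose
# ``args`` holds the decorated call's positional arguments. Every spider
# function below takes ``log`` as its first positional argument, which is why
# ``retry_state.args[0]`` recovers the right logger.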


@retry(stop=stop_after_attempt(5), wait=wait_fixed(1), after=after_log)
def get_proxys(log):
    """
    Fetch the proxy configuration.
    :param log: logger object
    :return: proxies dict in the requests format
    """
    tunnel = "x371.kdltps.com:15818"
    kdl_username = "t13753103189895"
    kdl_password = "o0yefv6z"
    try:
        proxies = {
            "http": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": kdl_username, "pwd": kdl_password, "proxy": tunnel},
            "https": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": kdl_username, "pwd": kdl_password, "proxy": tunnel}
        }
        return proxies
    except Exception as e:
        log.error(f"Error getting proxy: {e}")
        raise e
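
# The returned mapping uses the standard ``requests`` proxy format, e.g.
#   {"http": "http://<user>:<pwd>@x371.kdltps.com:15818/", "https": ...}
# so it can be passed directly: requests.get(url, proxies=get_proxys(log)).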


@retry(stop=stop_after_attempt(5), wait=wait_fixed(1), after=after_log)
def get_series(log, category_name, category_url, sql_pool):
    """
    Fetch the series list for one category.
    :param log: logger object
    :param category_name: category name
    :param category_url: category url
    :param sql_pool: database connection pool
    :return:
    """
    log.info(f"{inspect.currentframe().f_code.co_name} ->->-> category_name: {category_name}")
    # url = "https://www.breakninja.com/baseball_box_break_group_checklists.html"
    response = requests.get(category_url, headers=ninja_settings.headers, proxies=get_proxys(log), timeout=22)
    response.raise_for_status()
    selector = Selector(response.text)
    tag_detail_url_list = selector.xpath('//p[@align="left"]/a')
    # print(len(tag_detail_url_list))
    log.debug(f'{inspect.currentframe().f_code.co_name} tag_detail_url_list size: {len(tag_detail_url_list)}')
    # skip_until_target = True  # whether to skip entries until the target URL is reached
    # target_url = '2017-Topps-Inception-Baseball-Most-Popular-Cards-With-Large-Pictures.html'
    for tag_detail_url in tag_detail_url_list:
        series = tag_detail_url.xpath('./text()').get()
        series_url = tag_detail_url.xpath('./@href').get()
        # # Resume support: skip entries until the target URL is found
        # if skip_until_target and series_url != target_url:
        #     log.info(f"Skipping {series_url}")
        #     continue
        #
        # # Once the target URL is found, clear the skip flag
        # if series_url == target_url:
        #     skip_until_target = False
        #     log.info(f"Starting from {series_url}")
        #     continue  # keep this line to skip the target URL itself; delete it to include the target URL
        if 'http' not in series_url:
            series_url = ninja_settings.base_url + series_url
        # print(series, series_url)
        try:
            if 'page_no=' in series_url:
                log.debug('page_no in series_url, request turn_page.........')
                turn_page(log, category_name, series, series_url, sql_pool)
            else:
                get_table_list(log, category_name, series, series_url, sql_pool)
        except Exception as e:
            log.error(f"Error processing series: {e}")
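
# Note: the ``'http' not in series_url`` check above assumes relative hrefs
# never contain the substring "http"; ``urllib.parse.urljoin(ninja_settings.base_url,
# series_url)`` would be the stdlib alternative for absolutizing links.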


@retry(stop=stop_after_attempt(5), wait=wait_fixed(1), after=after_log)
def get_table_list(log, category_name, series, series_url, sql_pool):
    """
    Fetch the team list for one series.
    :param log: logger object
    :param category_name: category name
    :param series: series name
    :param series_url: series url
    :param sql_pool: database connection pool
    :return:
    """
    log.info(f"{inspect.currentframe().f_code.co_name} ->->-> series: {series}")
    # url = "https://www.breakninja.com/baseball/2025-Topps-All-Star-Game-Mega-Box-Baseball-Checklist.php"
    response = requests.get(series_url, headers=ninja_settings.headers, proxies=get_proxys(log), timeout=22)
    response.raise_for_status()
    selector = Selector(response.text)
    tag_team_list = selector.xpath('//div[@class="col-7 col-s-7"]/a | //div[@class="col-3 col-s-3 menu"]/a')
    log.debug(f'{inspect.currentframe().f_code.co_name} tag_team_list size: {len(tag_team_list)}')
    for tag_team in tag_team_list:
        team = tag_team.xpath('./text()').get()
        if team in ninja_settings.exclusion_words:
            log.debug(f"Skipping <{team}> due to exclusion word")
            continue
        team_url = tag_team.xpath('./@href').get()
        if 'http' not in team_url:
            team_url = ninja_settings.base_url + team_url
        # print(team, team_url)
        log.debug('no page_no in team_url, request get_table_list_2.........')
        get_table_list_2(log, category_name, series, sql_pool, team_url)


def turn_page(log, category_name, series, series_url, sql_pool):
    """
    Paginate through a series.
    :param log: logger object
    :param category_name: category name
    :param series: series name
    :param series_url: series url
    :param sql_pool: database connection pool
    :return:
    """
    log.info(f"{inspect.currentframe().f_code.co_name} ->->-> series: {series}")
    turn_base_url = series_url.split('page_no=')[0] + 'page_no='
    page = 1
    max_page = 50
    while page <= max_page:
        # url = "https://www.breakninja.com/baseball/2025-Topps-All-Star-Game-Mega-Box-Baseball-Checklist.php?page_no=2"
        # Build the URL for the current page by setting the page_no parameter
        current_url = turn_base_url + str(page)
        log.debug(f'{inspect.currentframe().f_code.co_name} ->->-> page: {page}')
        len_data, selector = turn_single_page(log, category_name, series, current_url, sql_pool)
        if len_data < 250:
            log.debug(f'{inspect.currentframe().f_code.co_name} ->->-> len_data: {len_data}, fewer than 250 rows, break!!!')
            break
        max_page_ = selector.xpath('/html/body/div[3]/div[4]/strong[1]/text()').get()
        if max_page_:
            max_page = int(max_page_) // 250 + 1
            log.debug(f'{inspect.currentframe().f_code.co_name} ->->-> max_page: {max_page}')
        page += 1
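
# Page-count sketch (assuming the first <strong> holds the total record
# count): at 250 rows per page, e.g. 620 records gives 620 // 250 + 1 = 3
# pages; a short final page also ends the loop via the len_data check.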


@retry(stop=stop_after_attempt(5), wait=wait_fixed(1), after=after_log)
def turn_single_page(log, category_name, series, current_url, sql_pool):
    """
    Fetch and parse a single page of a paginated series.
    :param log: logger object
    :param category_name: category name
    :param series: series name
    :param current_url: url of the current page
    :param sql_pool: database connection pool
    :return: (number of parsed rows, page Selector)
    """
    log.info(f"{inspect.currentframe().f_code.co_name} ->->-> series: {series}")
    response = requests.get(current_url, headers=ninja_settings.headers, proxies=get_proxys(log), timeout=60)
    # print(response.text)
    response.raise_for_status()
    selector = Selector(response.text)
    tag_table = selector.xpath('//table[@class="table table-striped table-bordered"]')  # this table class means the listing is paginated
    log.debug(f'{inspect.currentframe().f_code.co_name} tag_table size: {len(tag_table)}')
    title = selector.xpath('//h1/text()').get()
    if tag_table:
        log.debug(f'{inspect.currentframe().f_code.co_name} ->->-> tag_table size 111: {len(tag_table)}')
        # len_data = parse_one_team(log, tag_table, category_name, series, sql_pool)
        len_data = parse_turn_single_team(log, tag_table, category_name, series, current_url, sql_pool, title)
    else:
        len_data = 0
    return len_data, selector


def parse_turn_single_team(log, tag_table, category_name, series, series_url, sql_pool, title):
    # Grab the header row
    head = tag_table.xpath('./thead/tr/th/text()').getall()
    # Normalize header names: map "#" to "Number", "Name" to "Player", "Run" to "Print Run"
    head = ['Number' if h.strip() == '#' else h for h in head]
    head = ['Player' if h.strip() == 'Name' else h for h in head]
    head = ['Print Run' if h.strip() == 'Run' else h for h in head]
    # Collect all data rows
    rows = tag_table.xpath('./tbody/tr')
    data = []
    for row in rows:
        cells = row.xpath('./td/text()').getall()
        cells = [cell.strip() for cell in cells if cell.strip()]
        if not cells:
            cells = [''] * len(head)  # pad with empty strings when a row has no data
        # Zip the current row's cells with the header names
        row_data = dict(zip(head, cells))
        # Keep the row only if at least one field is non-empty
        if any(value.strip() for value in row_data.values()):
            data.append(row_data)
    log.info(f'{inspect.currentframe().f_code.co_name} ->->-> len data size: {len(data)}')
    for item in data:
        item.update({'category_name': category_name, 'series': series, 'series_url': series_url, 'title': title})
        # print(item)
        # Normalize field names: lowercase, and replace spaces and special characters
        processed_item = {}
        for key, value in item.items():
            new_key = key.lower().replace(' ', '_').replace('-', '_').replace('/', '_').replace('(', '').replace(')', '')
            processed_item[new_key] = value
        print(processed_item)
        # try:
        #     sql_pool.insert_one_or_dict(table='breakninja_checklist_record', data=processed_item)
        # except Exception as e:
        #     log.error(f"insert_one_or_dict data: {e}")
    return len(data)
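
# e.g. a header row ["#", "Name", "Team", "Run"] becomes the record keys
# ["number", "player", "team", "print_run"] after renaming + normalization.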


def parse_one_team(log, tag_table, category_name, series, sql_pool, team_url, title):
    # Grab the header row
    head = tag_table.xpath('./thead/tr/th/text()').getall()
    # Normalize header names: map "#" to "Number", "Name" to "Player", "Run" to "Print Run"
    head = ['Number' if h.strip() == '#' else h for h in head]
    head = ['Player' if h.strip() == 'Name' else h for h in head]
    head = ['Print Run' if h.strip() == 'Run' else h for h in head]
    # title = tag_table.xpath('./thead/tr/th/text()').getall()
    # Collect all data rows
    rows = tag_table.xpath('./tbody/tr')
    data = []
    for row in rows:
        cells = row.xpath('./td/text()').getall()
        cells = [cell.strip() for cell in cells if cell.strip()]
        if not cells:
            cells = [''] * len(head)  # pad with empty strings when a row has no data
        # Zip the current row's cells with the header names
        row_data = dict(zip(head, cells))
        # Keep the row only if at least one field is non-empty
        if any(value.strip() for value in row_data.values()):
            data.append(row_data)
    log.info(f'{inspect.currentframe().f_code.co_name} ->->-> len data size: {len(data)}')
    for item in data:
        item.update({'category_name': category_name, 'series': series, 'team_url': team_url, 'title': title})
        # print(item)
        # Normalize field names: lowercase, and replace spaces and special characters
        processed_item = {}
        for key, value in item.items():
            new_key = key.lower().replace(' ', '_').replace('-', '_').replace('/', '_').replace('(', '').replace(')', '')
            processed_item[new_key] = value
        try:
            # print(processed_item)
            sql_pool.insert_one_or_dict(table='breakninja_checklist_record', data=processed_item)
        except Exception as e:
            log.error(f"insert_one_or_dict data: {e}")
    return len(data)


def parse_two_team(log, tag_table, category_name, series, sql_pool, team_url, title):
    # Grab the header row (looser XPath that does not require <thead>/<tbody>)
    head = tag_table.xpath('.//th/text()').getall()
    # Normalize header names: map "#" to "Number", "Name" to "Player", "Run" to "Print Run"
    head = ['Number' if h.strip() == '#' else h for h in head]
    head = ['Player' if h.strip() == 'Name' else h for h in head]
    head = ['Print Run' if h.strip() == 'Run' else h for h in head]
    # Collect all data rows
    rows = tag_table.xpath('.//tr')
    data = []
    for row in rows:
        cells = row.xpath('./td/text()').getall()
        cells = [cell.strip() for cell in cells if cell.strip()]
        if not cells:
            cells = [''] * len(head)  # pad with empty strings when a row has no data
        # Zip the current row's cells with the header names
        row_data = dict(zip(head, cells))
        # Keep the row only if at least one field is non-empty
        if any(value.strip() for value in row_data.values()):
            data.append(row_data)
    log.info(f'{inspect.currentframe().f_code.co_name} ->->-> len data size: {len(data)}')
    for item in data:
        item.update({'category_name': category_name, 'series': series, 'team_url': team_url, 'title': title})
        # Normalize field names: lowercase, and replace spaces and special characters
        processed_item = {}
        for key, value in item.items():
            new_key = key.lower().replace(' ', '_').replace('-', '_').replace('/', '_').replace('(', '').replace(')', '')
            processed_item[new_key] = value
        try:
            sql_pool.insert_one_or_dict(table='breakninja_checklist_record', data=processed_item)
        except Exception as e:
            log.error(f"insert_one_or_dict data: {e}")
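
# The three parse_* functions above repeat the same header-key normalization
# inline; a minimal consolidation sketch (illustrative only, not wired in):
def _normalize_key(key):
    """Hypothetical helper, not part of the original script: the shared
    key-normalization rule from the parse_* functions in one place."""
    return (key.lower().replace(' ', '_').replace('-', '_')
            .replace('/', '_').replace('(', '').replace(')', ''))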


@retry(stop=stop_after_attempt(5), wait=wait_fixed(1), after=after_log)
def get_table_list_2(log, category_name, series, sql_pool, team_url):
    # url = 'https://www.breakninja.com/baseball/2025-Topps-All-Star-Game-Mega-Box-Baseball-Checklist-Arizona-Diamondbacks.php'
    # url = 'https://www.breakninja.com/baseball/2025-Topps-All-Star-Game-Mega-Box-Baseball-Checklist-One-Of-Ones.php'
    # url = 'https://www.breakninja.com/baseball/2025-Topps-All-Star-Game-Mega-Box-Baseball-Checklist-No-Team.php'
    # url = 'https://www.breakninja.com/basketballcards/21-22-Bowman-University-Basketball-Checklist.php?page_no=1'
    # url = 'https://www.breakninja.com/baseball/2018-Topps-Archives-Baseball-Checklist-Arizona-Diamondbacks.php'
    # url = 'https://www.breakninja.com/basketball/22-23-Panini-Contenders-Basketball-Checklist-Miami-Heat.php'
    # response = requests.get(url_, headers=ninja_settings.headers, proxies=get_proxys(log))
    log.info(f'{inspect.currentframe().f_code.co_name} ->->-> team_url: {team_url}')
    response = requests.get(team_url, headers=ninja_settings.headers, proxies=get_proxys(log), timeout=22)
    response.raise_for_status()
    selector = Selector(response.text)
    title = selector.xpath('//h1/text()').get()
    # print(title)
    # Several page layouts need to be handled
    tag_table = selector.xpath('//table[@class="table table-striped table-bordered"]')  # this table class means the listing is paginated
    if tag_table:
        log.debug(f'{inspect.currentframe().f_code.co_name} tag_table size 111: {len(tag_table)}')
        parse_one_team(log, tag_table, category_name, series, sql_pool, team_url, title)
    else:
        tag_table = selector.xpath('//table[@class="table-fill"]')
        if len(tag_table) != 0:
            log.debug(f'{inspect.currentframe().f_code.co_name} tag_table size 222: {len(tag_table)}')
            parse_one_team(log, tag_table, category_name, series, sql_pool, team_url, title)
    if not tag_table:
        tag_table = selector.xpath('//table[1]')
        if len(tag_table) != 0:
            log.debug(f'{inspect.currentframe().f_code.co_name} tag_table size 333: {len(tag_table)}')
            parse_two_team(log, tag_table, category_name, series, sql_pool, team_url, title)
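
# Layout fallback order above: (1) "table table-striped table-bordered"
# (the paginated layout), (2) "table-fill", (3) the first <table> on the
# page as a last resort, parsed by parse_two_team's looser row XPath.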


@retry(stop=stop_after_attempt(100), wait=wait_fixed(3600), after=after_log)
def ninja_main(log):
    """
    Main entry point.
    :param log: logger object
    """
    log.info(
        f'Starting the {inspect.currentframe().f_code.co_name} spider task....................................................')
    # Set up the MySQL connection pool
    sql_pool = MySQLConnectionPool(log=log)
    if not sql_pool.check_pool_health():
        log.error("Database connection pool is unhealthy")
        raise RuntimeError("Database connection pool is unhealthy")
    try:
        for ca in ninja_settings.category_list:
            category_name = ca.get('category_name')
            category_url = ca.get('category_url')
            try:
                get_series(log, category_name, category_url, sql_pool)
            except Exception as e:
                log.error(f"Request get_series error: {e}")
    except Exception as e:
        log.error(f'{inspect.currentframe().f_code.co_name} error: {e}')
    finally:
        log.info(f'Spider {inspect.currentframe().f_code.co_name} finished, waiting for the next crawl cycle............')
        # EmailSender().send(subject="[Spider notice] Today's task is complete",
        #                    content="Data collection and processing are all done, please check the results.\n\n ------ From the Python spider system.")


# def schedule_task():
#     """
#     Entry point for running the spider module on a schedule.
#     """
#     # Run the task once immediately
#     ninja_main(log=logger)
#
#     # Then schedule it daily
#     schedule.every().day.at("00:01").do(ninja_main, log=logger)
#
#     while True:
#         schedule.run_pending()
#         time.sleep(1)
#

if __name__ == '__main__':
    # get_table_list(logger)
    # get_table_list_2(logger,
    #                  'https://www.breakninja.com/basketball/22-23-Panini-Contenders-Basketball-Checklist-Miami-Heat.php')
    #
    # ninja_main(logger)
    # turn_page(logger, None, None,
    #           "https://www.breakninja.com/baseballcards/2024-Topps-Athletes-Unlimited-Pro-Softball-Checklist.php?page_no=4",
    #           None)
    turn_single_page(logger, None, None,
                     "https://www.breakninja.com/baseballcards/2024-Topps-Athletes-Unlimited-Pro-Softball-Checklist.php?page_no=4",
                     None)