psa_pop_detail_spider.py

# -*- coding: utf-8 -*-
# Author : Charley
# Python : 3.10.8
# Date : 2025/9/15 14:53
import inspect
import random
import time
from urllib.parse import urljoin

import schedule
import user_agent
from curl_cffi import requests
from DrissionPage import ChromiumPage, ChromiumOptions
from loguru import logger
from mysql_pool import MySQLConnectionPool
from parsel import Selector
from tenacity import retry, stop_after_attempt, wait_fixed

logger.remove()
logger.add("logs/pop_player_{time:YYYYMMDD}.log", encoding='utf-8', rotation="00:00",
           format="[{time:YYYY-MM-DD HH:mm:ss.SSS}] {level} {message}",
           level="DEBUG", retention="3 days")
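
# rotation="00:00" starts a new log file at midnight; retention="3 days" deletes
# files older than three days (standard loguru semantics for these string values).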

client_identifier_list = [
    "edge99", "edge101", "chrome99", "chrome100", "chrome101", "chrome104", "chrome107",
    "chrome110", "chrome116", "chrome119", "chrome120", "chrome123", "chrome124",
    "chrome99_android", "safari15_3", "safari15_5", "safari17_0", "safari17_2_ios"
]

BASE_URL = 'https://www.psacard.com'

headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    'User-Agent': user_agent.generate_user_agent()
}

category_link_list = {
    'Baseball Cards': 'https://www.psacard.com/pop/baseball-cards/20003',
    'Baseball Coins': 'https://www.psacard.com/pop/baseball-coins/82797',
    'Basketball Cards': 'https://www.psacard.com/pop/basketball-cards/20019',
    'Basketball Coins': 'https://www.psacard.com/pop/basketball-coins/83007',
    'Boxing / Wrestling Cards / MMA': 'https://www.psacard.com/pop/boxing-wrestling-cards-mma/20021',
    'Football Cards': 'https://www.psacard.com/pop/football-cards/20014',
    'Football Coins': 'https://www.psacard.com/pop/football-coins/83011',
    'Golf Cards': 'https://www.psacard.com/pop/golf-cards/20023',
    'Hockey Cards': 'https://www.psacard.com/pop/hockey-cards/20020',
    'Hockey Coins': 'https://www.psacard.com/pop/hockey-coins/83012',
    'Minor League Cards': 'https://www.psacard.com/pop/minor-league-cards/20031',
    'Misc Cards': 'https://www.psacard.com/pop/misc-cards/20033',
    'Multi-Sport Cards': 'https://www.psacard.com/pop/multi-sport-cards/20006',
    'Multi-Sport Coins': 'https://www.psacard.com/pop/multi-sport-coins/102825',
    'Non-Sport Cards': 'https://www.psacard.com/pop/non-sport-cards/20032',
    'Non-Sport Coins': 'https://www.psacard.com/pop/non-sport-coins/82981',
    'Packs': 'https://www.psacard.com/pop/packs/20017',
    'Pins': 'https://www.psacard.com/pop/pins/20013',
    'Soccer Cards': 'https://www.psacard.com/pop/soccer-cards/20004',
    'TCG Cards': 'https://www.psacard.com/pop/tcg-cards/156940',
    'Tickets': 'https://www.psacard.com/pop/tickets/20022',
    # 'Game-Used Bats': 'https://www.psacard.com/pop/bats'
}
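
# Note: the trailing numeric segment of each category URL doubles as the PSA
# category ID; pop_main() below extracts it with category_link.split('/')[-1].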


def after_log(retry_state):
    """
    Retry callback for tenacity.
    :param retry_state: RetryCallState object
    """
    # Use the logger passed as the first positional argument, if any;
    # otherwise fall back to the global logger.
    if retry_state.args and len(retry_state.args) > 0:
        log = retry_state.args[0]
    else:
        log = logger
    if retry_state.outcome.failed:
        log.warning(
            f"Function '{retry_state.fn.__name__}', attempt {retry_state.attempt_number} failed")
    else:
        log.info(f"Function '{retry_state.fn.__name__}', attempt {retry_state.attempt_number} succeeded")
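
# Convention note: every retried function in this module takes `log` as its first
# positional argument, which is what lets after_log recover the right logger from
# retry_state.args[0]; a call like get_proxys(logger) has retry_state.args == (logger,)
# inside the callback. Calls that pass log= as a keyword (as schedule_task does)
# leave args empty and fall back to the global logger.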


@retry(stop=stop_after_attempt(5), wait=wait_fixed(2), after=after_log)
def get_proxys(log):
    # Purchased account, North America
    # http_proxy = "http://u1952150085001297:sJMHl4qc4bM0@proxy.123proxy.cn:36927"
    # https_proxy = "http://u1952150085001297:sJMHl4qc4bM0@proxy.123proxy.cn:36927"
    http_proxy = "http://u1952150085001297:sJMHl4qc4bM0@proxy.123proxy.cn:36931"
    https_proxy = "http://u1952150085001297:sJMHl4qc4bM0@proxy.123proxy.cn:36931"
    try:
        proxy_settings = {
            "http": http_proxy,
            "https": https_proxy,
        }
        return proxy_settings
    except Exception as e:
        log.error(f"Error getting proxy: {e}")
        raise e
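
# Example usage (sketch): the returned dict plugs directly into curl_cffi's
# `proxies` keyword, which is how every request in this module uses it, e.g.
#   resp = requests.get(url, proxies=get_proxys(logger), timeout=22)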


@retry(stop=stop_after_attempt(5), wait=wait_fixed(1), after=after_log)
def get_sets_data(log, category, category_id, category_link, tag_year, tag_year_link, sql_pool):
    # Tunnel proxy, host:port
    # tunnel = "x371.kdltps.com:15818"
    tunnel = "proxy.123proxy.cn:36927"
    options = ChromiumOptions()
    # options.set_paths(local_port=9137, user_data_path=r'D:\Drissionpage_temp\topps1_port_9137')
    options.set_proxy("http://" + tunnel)
    options.auto_port(True)
    # options.headless(True)
    options.set_argument("--disable-gpu")
    options.set_argument("--accept-lang=en-US")
    options.set_argument('--start-maximized')
    tab = ChromiumPage(options)
    try:
        # tab_url = f"https://www.psacard.com/pop/baseball-cards/2024/260045"
        tab_url = tag_year_link
        tab.get(tab_url)
        log.debug(f'{inspect.currentframe().f_code.co_name} -> page loaded, url: {tab_url}')
        # Page through the sets table until the last page.
        page = 1
        while True:
            log.debug(f'{inspect.currentframe().f_code.co_name} -> current page: {page}')
            html = tab.html
            if not html:
                log.error(f'{inspect.currentframe().f_code.co_name} -> page failed to load...........')
                raise Exception('Page failed to load, reloading........')
            selector = Selector(text=html)
            # Skip the first row of the tbody (header row).
            tag_tr_list = selector.xpath('//table[@id="tableSets"]/tbody/tr[position() > 1]')
            info_list = []
            for tag_tr in tag_tr_list:
                set_name = tag_tr.xpath('./td[@class="text-left"]/a[1]/text()').get()
                set_name_url = tag_tr.xpath('./td[@class="text-left"]/a[1]/@href').get()
                set_name_url = urljoin(BASE_URL, set_name_url) if set_name_url else None
                set_id = set_name_url.split('/')[-1] if set_name_url else None
                # print(set_name, set_name_url)
                data_dict = {
                    'category': category,
                    'category_id': category_id,
                    'category_link': category_link,
                    'year': tag_year,
                    'year_link': tag_year_link,
                    'set_name': set_name,
                    'set_link': set_name_url,
                    'set_id': set_id
                }
                info_list.append(data_dict)
                # try:
                #     get_player_list(log, data_dict, sql_pool)
                # except Exception as e:
                #     log.error(f'get_player_list error: {e}')
            # Save the rows from this page.
            if info_list:
                sql_pool.insert_many(table='psa_pop_player_sets', data_list=info_list, ignore=True)
            # Check whether there is a next page.
            next_button = tab.ele('#tableSets_next')
            if next_button and 'disabled' not in (next_button.attr('class') or ''):
                # Click the next-page button.
                next_button.click()
                log.debug(f'{inspect.currentframe().f_code.co_name} -> clicked next-page button')
                # tab.wait.load_start()  # wait for the page to load
                # time.sleep(2)  # wait for the page to load
            else:
                # No next page; exit the loop.
                log.debug(f'{inspect.currentframe().f_code.co_name} -> no next page, exiting loop, last page: {page}')
                break
            page += 1
    except Exception as e:
        log.error(f'get_sets_data error: {e}')
        raise Exception('get_sets_data error')
    finally:
        tab.quit()
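
# Note: "tableSets" / "tableSets_next" follow the jQuery DataTables naming
# convention, which suggests the sets table is paginated client-side; that is
# why get_sets_data() pages by clicking the widget's next button instead of
# requesting new URLs.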


@retry(stop=stop_after_attempt(5), wait=wait_fixed(1), after=after_log)
def get_years_data(log, category, category_id, category_link, sql_pool):
    """
    Fetch the year links for a category, then crawl each year's sets.
    :param log: logger object
    :param category: category name
    :param category_id: category ID
    :param category_link: category URL
    :param sql_pool: database connection pool
    """
    try:
        with requests.Session() as session:
            resp = session.get(category_link, impersonate=random.choice(client_identifier_list), headers=headers,
                               proxies=get_proxys(log), timeout=22, allow_redirects=False)
            # log.debug(resp.text)
            if 'Just a moment' in resp.text:
                # Cloudflare challenge page; raise so the retry decorator tries again.
                log.debug('Just a moment, retrying.....')
                raise Exception('Just a moment')
            resp_selector = Selector(text=resp.text)
            tag_tr_list = resp_selector.xpath('//table[@id="tableCategory"]/tbody/tr')
            for tag_tr in tag_tr_list:
                tag_year = tag_tr.xpath('./td[1]/a/text()').get()
                tag_year_link = tag_tr.xpath('./td[1]/a/@href').get()
                tag_year_link = BASE_URL + tag_year_link if tag_year_link else tag_year_link
                try:
                    get_sets_data(log, category, category_id, category_link, tag_year, tag_year_link, sql_pool)
                except Exception as e1:
                    log.error(f"Error getting sets data: {e1}")
    except Exception as e:
        log.error(f"Error getting years data: {e}")
        raise e


@retry(stop=stop_after_attempt(5), wait=wait_fixed(3), after=after_log)
def get_player_single_page(log, category_id, set_id, start, length, draw):
    """
    Fetch a single page of player data.
    :param log: logger object
    :param category_id: category ID
    :param set_id: set ID
    :param start: start offset
    :param length: page size
    :param draw: request sequence number
    :return: response data dict (raises on failure)
    """
    player_headers = {
        "accept": "application/json, text/javascript, */*; q=0.01",
        "content-type": "application/x-www-form-urlencoded; charset=UTF-8",
        "referer": f"https://www.psacard.com/pop/baseball-cards/2024/bowman/{set_id}",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/137.0.0.0 Safari/537.36",
    }
    url = "https://www.psacard.com/Pop/GetSetItems"
    data = {
        "draw": str(draw),
        "start": str(start),
        "length": str(length),
        "search": "",
        "headingID": str(set_id),
        "categoryID": str(category_id),
        "isPSADNA": "false"
    }
    try:
        response = requests.post(
            url,
            impersonate=random.choice(client_identifier_list),
            headers=player_headers,
            data=data,
            timeout=22,
            proxies=get_proxys(log)
        )
        if response.status_code == 200:
            # print(response.json())
            return response.json()
        elif response.status_code == 403:
            log.error(f"Request rejected; check whether the IP address is banned, set_id: {set_id}")
            raise Exception('Request rejected')
        else:
            log.error(f"Request failed, status code: {response.status_code}, set_id: {set_id}")
            raise Exception('Request failed')
    except Exception as e:
        log.error(f"Error fetching single page of player data, set_id: {set_id}, start: {start}, error: {e}")
        raise Exception('Error fetching single page of player data')
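
# For reference: GetSetItems returns a DataTables-style JSON payload. The shape
# assumed by the code below (inferred from the fields it reads, not an official
# schema) is roughly:
#   {"data": [{"SpecID": ..., "CardNumber": ..., "SubjectName": ...,
#              "Variety": ..., "GradeTotal": ...}, ...]}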


def get_player_list(log, category_id, set_id, sql_pool):
    """
    Fetch the player list for a set (with pagination).
    :param log: logger object
    :param category_id: category ID
    :param set_id: set ID
    :param sql_pool: database connection pool
    """
    start = 0  # start offset
    length = 300  # rows per page
    draw = 1  # request sequence number
    while True:
        log.debug(f"Fetching player data, category_id: {category_id}, set_id: {set_id}, page: {draw}, start: {start}")
        # Fetch a single page of data.
        try:
            response_data = get_player_single_page(log, category_id, set_id, start, length, draw)
        except Exception as e:
            log.error(f"Error fetching single page of player data, category_id: {category_id}, set_id: {set_id}, start: {start}, error: {e}")
            response_data = None
        if response_data is None:
            log.error(f"Failed to fetch player data, category_id: {category_id}, set_id: {set_id}, page: {draw}, break !!!")
            # sql_pool.update_one_or_dict(
            #     table='psa_pop_player_sets',
            #     data={'player_state': 2},
            #     condition={'set_id': set_id}
            # )
            break
        # Check whether any rows were returned.
        player_data_list = response_data.get('data') or []
        # On the first page, drop the first row.
        if draw == 1 and player_data_list:
            player_data_list.pop(0)
        if player_data_list:
            log.debug(
                f"Got {len(player_data_list)} player rows, category_id: {category_id}, set_id: {set_id}, start: {start}")
            info_list = []
            for pl_data in player_data_list:
                spec_id = pl_data.get('SpecID')
                card_number = pl_data.get('CardNumber')
                subject_name = pl_data.get('SubjectName')
                card_set = pl_data.get('Variety')
                grade_total = pl_data.get('GradeTotal')
                data_dict = {
                    'category_id': category_id,
                    'set_id': set_id,
                    'spec_id': spec_id,
                    'card_number': card_number,
                    'subject_name': subject_name,
                    'card_set': card_set,
                    'grade_total': grade_total
                }
                # print(f'data_dict:{data_dict}')
                info_list.append(data_dict)
            # Save the rows to the database.
            if info_list:
                sql_pool.insert_many(table='psa_pop_player_record', data_list=info_list, ignore=True)
            # Fewer rows than requested means this was the last page.
            if len(player_data_list) < length:
                log.debug(f"Reached the last page, set_id: {set_id}")
                break
            # Advance the paging parameters.
            start += length
            draw += 1
            # Small delay to avoid hammering the endpoint.
            time.sleep(1)
        else:
            log.debug(f"No more data, set_id: {set_id}")
            break
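
# Example invocation (IDs borrowed from the commented test call in __main__,
# assuming sql_pool is a healthy MySQLConnectionPool):
#   get_player_list(logger, '21172', '279481', sql_pool)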


@retry(stop=stop_after_attempt(100), wait=wait_fixed(3600), after=after_log)
def pop_main(log):
    """
    Main entry point for the sets crawl: categories -> years -> sets.
    """
    log.info(
        f'Starting crawler task {inspect.currentframe().f_code.co_name} ....................................................')
    # Configure the MySQL connection pool.
    sql_pool = MySQLConnectionPool(log=log)
    if not sql_pool.check_pool_health():
        log.error("Database connection pool is unhealthy")
        raise RuntimeError("Database connection pool is unhealthy")
    try:
        log.debug(".......... starting data collection ..........")
        for category, category_link in category_link_list.items():
            log.debug(f"{category} first query, starting fetch.......")
            try:
                category_id = category_link.split('/')[-1]
                get_years_data(log, category, category_id, category_link, sql_pool)
            except Exception as e1:
                log.error(f"Error getting years data: {e1}")
    except Exception as e:
        log.error(f'{inspect.currentframe().f_code.co_name} error: {e}')
    finally:
        log.info(f'Crawler {inspect.currentframe().f_code.co_name} finished; waiting for the next collection run............')


@retry(stop=stop_after_attempt(100), wait=wait_fixed(3600), after=after_log)
def player_main(log):
    """
    Main entry point for the player crawl: iterate over stored sets.
    """
    log.info(
        f'Starting crawler task {inspect.currentframe().f_code.co_name} ....................................................')
    # Configure the MySQL connection pool.
    sql_pool = MySQLConnectionPool(log=log)
    if not sql_pool.check_pool_health():
        log.error("Database connection pool is unhealthy")
        raise RuntimeError("Database connection pool is unhealthy")
    try:
        log.debug(".......... starting data collection ..........")
        sql_sets_list = sql_pool.select_all(
            # "select category_id, set_id from psa_pop_player_sets where player_state = 0"
            "select category_id, set_id from psa_pop_player_sets")
        for category_set in sql_sets_list:
            category_id, set_id = category_set
            try:
                log.debug(f"category_id: {category_id} first query, starting fetch.......")
                get_player_list(log, category_id, set_id, sql_pool)
                # sql_pool.update_one_or_dict(
                #     table='psa_pop_player_sets',
                #     data={'player_state': 1},
                #     condition={'set_id': set_id}
                # )
            except Exception as e1:
                log.error(f"Error getting player data: {e1}")
                # sql_pool.update_one_or_dict(
                #     table='psa_pop_player_sets',
                #     data={'player_state': 2},
                #     condition={'set_id': set_id}
                # )
    except Exception as e:
        log.error(f'{inspect.currentframe().f_code.co_name} error: {e}')
    finally:
        log.info(f'Crawler {inspect.currentframe().f_code.co_name} finished; waiting for the next collection run............')


def schedule_task():
    """
    Launcher for the two crawler tasks in this module:
    pop_main (sets crawl, Saturday 08:00)
    player_main (player crawl, Wednesday 08:00)
    """
    # Run the tasks once immediately (disabled):
    # pop_main(log=logger)
    # player_main(log=logger)
    # Set up the scheduled jobs.
    schedule.every().saturday.at("08:00").do(pop_main, log=logger)
    schedule.every().wednesday.at("08:00").do(player_main, log=logger)
    # schedule.every().day.at("00:30").do(player_main, log=logger)
    while True:
        schedule.run_pending()
        time.sleep(1)


if __name__ == '__main__':
    # get_detail_data(logger, '', 'https://www.psacard.com/pop/bats', None)
    # get_sets_data(logger)
    # get_player_single_page(logger, '21172', '279481', 0, 300, 1)
    # aa_dict = {
    #     'category': 'baseball',
    #     'category_id': '20003',
    #     'category_link': 'https://www.psacard.com/pop/bats',
    #     'tag_year': '2004',
    #     'tag_year_link': '',
    #     'set_name': '',
    #     'set_name_url': '',
    #     'set_id': '279664'
    # }
    # get_player_list(logger, aa_dict, None)
    # pop_main(log=logger)
    player_main(log=logger)
    # schedule_task()