cgc_cards_spider.py 8.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231
  1. # -*- coding: utf-8 -*-
  2. # Author : Charley
  3. # Python : 3.10.8
  4. # Date : 2025/9/22 19:06
  5. import time
  6. import inspect
  7. import requests
  8. import schedule
  9. import user_agent
  10. from loguru import logger
  11. from parsel import Selector
  12. from tenacity import retry, stop_after_attempt, wait_fixed
  13. from mysql_pool import MySQLConnectionPool
  14. logger.remove()
  15. logger.add("./logs/new_{time:YYYYMMDD}.log", encoding='utf-8', rotation="00:00",
  16. format="[{time:YYYY-MM-DD HH:mm:ss.SSS}] {level} {message}",
  17. level="DEBUG", retention="7 day")
  18. """
  19. cgc先跑6000+6位数——6075+6位数
  20. """
  21. HEADERS = {
  22. "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
  23. "user-agent": user_agent.generate_user_agent()
  24. }
  25. COOKIES = {
  26. "CaptchaValue": "eyJhbGdvcml0aG0iOiJTSEEtMjU2IiwiY2hhbGxlbmdlIjoiNTZlNTBmMzU4MWRlMGYyNGQ5OGI2MTBhYTJlMTdmNjIzNzU4MDdmZjExOWM2MjNjYTRhNzUyY2MxMmU3ZDNmYyIsIm51bWJlciI6NDUxMTcsInNhbHQiOiJmNWU0ODBkZWNhMDNmYmJmNzdiN2UyYmYiLCJzaWduYXR1cmUiOiI3ZjlkMTc3NjllOTMwM2I3NTM5OTRlNzRlZDg0MzU3NTZkMzljMzU5YTFhMzBmODAzODNlMWI4YjA0MGZhZDVjIiwidG9vayI6ODJ9",
  27. "saved-language": "zh-CN",
  28. "SessionID": "dd60b7e4-f638-41cd-bb56-e2e4929aee27",
  29. "_ga": "GA1.1.706688626.1757584379",
  30. "AltchaSessionID": "1622a445-bf77-43e5-be67-13b76ff8e5ca",
  31. "_ga_55FF3CQQK2": "GS2.1.s1758539017$o6$g0$t1758539017$j60$l0$h0"
  32. }
  33. def after_log(retry_state):
  34. """
  35. retry 回调
  36. :param retry_state: RetryCallState 对象
  37. """
  38. # 检查 args 是否存在且不为空
  39. if retry_state.args and len(retry_state.args) > 0:
  40. log = retry_state.args[0] # 获取传入的 logger
  41. else:
  42. log = logger # 使用全局 logger
  43. if retry_state.outcome.failed:
  44. log.warning(
  45. f"Function '{retry_state.fn.__name__}', Attempt {retry_state.attempt_number} Times")
  46. else:
  47. log.info(f"Function '{retry_state.fn.__name__}', Attempt {retry_state.attempt_number} succeeded")
  48. @retry(stop=stop_after_attempt(5), wait=wait_fixed(1), after=after_log)
  49. def get_kdlproxys(log):
  50. """
  51. 获取代理
  52. :return: 代理
  53. """
  54. tunnel = "x371.kdltps.com:15818"
  55. kdl_username = "t13753103189895"
  56. kdl_password = "o0yefv6z"
  57. try:
  58. proxies = {
  59. "http": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": kdl_username, "pwd": kdl_password, "proxy": tunnel},
  60. "https": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": kdl_username, "pwd": kdl_password, "proxy": tunnel}
  61. }
  62. return proxies
  63. except Exception as e:
  64. log.error(f"Error getting proxy: {e}")
  65. raise e
  66. @retry(stop=stop_after_attempt(5), wait=wait_fixed(2), after=after_log)
  67. def get_proxys(log):
  68. # 已购买账户 北美
  69. # http_proxy = "http://u1952150085001297:sJMHl4qc4bM0@proxy.123proxy.cn:36927"
  70. # https_proxy = "http://u1952150085001297:sJMHl4qc4bM0@proxy.123proxy.cn:36927"
  71. http_proxy = "http://u1952150085001297:sJMHl4qc4bM0@proxy.123proxy.cn:36931"
  72. https_proxy = "http://u1952150085001297:sJMHl4qc4bM0@proxy.123proxy.cn:36931"
  73. # url = "https://ifconfig.me"
  74. try:
  75. proxySettings = {
  76. "http": http_proxy,
  77. "https": https_proxy,
  78. }
  79. return proxySettings
  80. except Exception as e:
  81. log.error(f"Error getting proxy: {e}")
  82. raise e
  83. @retry(stop=stop_after_attempt(5), wait=wait_fixed(1), after=after_log)
  84. def extract_card_info(log, url, cookies=None):
  85. """
  86. 从指定URL提取卡片信息
  87. :param log: logger对象
  88. :param url: 卡片详情页URL
  89. :param cookies: 请求cookies
  90. :return: 包含卡片信息的字典
  91. """
  92. log.debug(f"开始获取 {url} 页面信息.......................")
  93. if cookies is None:
  94. cookies = COOKIES
  95. try:
  96. response = requests.get(url, headers=HEADERS, cookies=cookies, timeout=10, proxies=get_kdlproxys(log))
  97. log.debug(f'status_code: {response.status_code}')
  98. response.raise_for_status() # 检查请求是否成功
  99. except requests.RequestException as e:
  100. logger.error(f"请求失败: {e}")
  101. return 2
  102. # 经排查 正常号码段 返回源码中也有此字样
  103. # if '找不到这个项目。请检查CGC评级号码是否正确。' in response.text:
  104. # log.warning(f"{url} 获取失败, 找不到这个项目。请检查CGC评级号码是否正确。")
  105. # return 3
  106. selector = Selector(response.text)
  107. tag_dl_list = selector.xpath('//div[@class="results-pane"]/div[@class="certlookup-intro"]//dl')
  108. # print(tag_dl_list)
  109. if not tag_dl_list:
  110. log.warning(f"{url} 获取失败, 找不到dl标签")
  111. return 3
  112. # 初始化变量
  113. data_dict = {
  114. "rating_number": None,
  115. "card_name":None,
  116. "year": None,
  117. "manufacturer": None,
  118. "card_set": None,
  119. "card_no": None,
  120. "player": None,
  121. # "belonging": None,
  122. "grade": None,
  123. "image_front": None,
  124. "image_back": None
  125. }
  126. # 提取详细信息
  127. for tag_dl in tag_dl_list:
  128. dt_text = tag_dl.xpath('./dt/text()').get()
  129. dd_text = tag_dl.xpath('./dd/text()').get()
  130. # print(dt_text, dd_text)
  131. if dt_text == "评级号码":
  132. data_dict["rating_number"] = dd_text.strip() if dd_text else None
  133. elif dt_text == "卡牌名称":
  134. data_dict["card_name"] = dd_text.strip() if dd_text else None
  135. elif dt_text == "年份":
  136. data_dict["year"] = dd_text
  137. elif dt_text == "制造商":
  138. data_dict["manufacturer"] = dd_text
  139. elif dt_text == "套装":
  140. data_dict["card_set"] = dd_text
  141. elif dt_text == "卡牌编号":
  142. data_dict["card_no"] = dd_text
  143. elif dt_text == "球员":
  144. data_dict["player"] = dd_text
  145. # elif dt_text == "归属":
  146. # data_dict["belonging"] = dd_text
  147. elif dt_text == "评级等级":
  148. data_dict["grade"] = dd_text.strip() if dd_text else None
  149. # 提取图片链接
  150. tag_img_list = selector.xpath('//div[@class="results-pane"]//div[@class="certlookup-images-item"]/a/@href').getall()
  151. if len(tag_img_list) >= 2:
  152. data_dict["image_front"] = tag_img_list[0]
  153. data_dict["image_back"] = tag_img_list[1]
  154. elif len(tag_img_list) == 1:
  155. data_dict["image_front"] = tag_img_list[0]
  156. # print(data_dict)
  157. return data_dict
  158. @retry(stop=stop_after_attempt(100), wait=wait_fixed(3600), after=after_log)
  159. def cgc_card_main(log):
  160. """
  161. 主函数
  162. :param log: logger对象
  163. """
  164. log.info(
  165. f'开始运行 {inspect.currentframe().f_code.co_name} 爬虫任务....................................................')
  166. # 配置 MySQL 连接池
  167. sql_pool = MySQLConnectionPool(log=log)
  168. if not sql_pool.check_pool_health():
  169. log.error("数据库连接池异常")
  170. raise RuntimeError("数据库连接池异常")
  171. try:
  172. while True:
  173. sql_cert_id_list = sql_pool.select_all("select cert_id from cgc_task where state = 0 limit 10000")
  174. sql_cert_id_list = [item[0] for item in sql_cert_id_list]
  175. if not sql_cert_id_list:
  176. log.info("没有需要处理的数据,等待下一轮处理....")
  177. time.sleep(3600)
  178. continue
  179. for cert_id in sql_cert_id_list:
  180. try:
  181. url = f"https://cards.cgccards.cn/certlookup/{cert_id}/"
  182. card_info = extract_card_info(log, url)
  183. if card_info and isinstance(card_info, dict):
  184. sql_pool.insert_one_or_dict(table="cgc_record", data=card_info)
  185. sql_pool.update_one_or_dict(table="cgc_task", data={"state": 1}, condition={"cert_id": cert_id})
  186. elif card_info == 3:
  187. sql_pool.update_one_or_dict(table="cgc_task", data={"state": 3}, condition={"cert_id": cert_id})
  188. elif card_info == 2:
  189. sql_pool.update_one_or_dict(table="cgc_task", data={"state": 2}, condition={"cert_id": cert_id})
  190. except Exception as e:
  191. log.error(f"Error processing card: {e}")
  192. time.sleep(10)
  193. except Exception as e:
  194. log.error(f'{inspect.currentframe().f_code.co_name} error: {e}')
  195. finally:
  196. log.info(f'爬虫程序 {inspect.currentframe().f_code.co_name} 运行结束,等待下一轮的采集任务............')
  197. if __name__ == "__main__":
  198. cgc_card_main(logger)
  199. # 6060110001