Jelajahi Sumber

update ins cookies 6.13.2

lei.chen 6 bulan lalu
induk
melakukan
d7abafbd4a

+ 10 - 8
ins_img_video_spider/ins_history_spider.py

@@ -5,8 +5,6 @@
 import inspect
 import random
 import time
-
-import schedule
 from loguru import logger
 from datetime import datetime
 # from tls_client import Session
@@ -17,7 +15,7 @@ from mysql_pool import MySQLConnectionPool
 
 USER_NAME_LIST = ['hobbysbestcards']  # 查询的用户名列表 hobbysbestcards
 
-cookie = r'ig_did=8D2CD910-0CBD-41CD-A5B4-9EB7E2F8BC91; ps_l=1; ps_n=1; datr=0aYZaGecXnDrIALr4HPo5O0h; mid=aBmm0QALAAFOBiNIagQ4prL9V4Zg; dpr=1.5; csrftoken=1Eeolr1d8t3VMjwNQIeMMQx9JTlyUsGu; sessionid=50762414324%3Af7LRzwBjb06Q7U%3A6%3AAYfUreTnqm7V_o3Pvqt0Tej1vwMQDGjOKw_Zm8TOqA; ds_user_id=50762414324; rur="RVA\05450762414324\0541778817145:01f75b26510d73b461bb75b0f907b2ec268507a83f95fb7c5a8571ced3b614c68af0d6b5"; wd=1707x247'
+# cookie = r'ig_did=8D2CD910-0CBD-41CD-A5B4-9EB7E2F8BC91; ps_l=1; ps_n=1; datr=0aYZaGecXnDrIALr4HPo5O0h; mid=aBmm0QALAAFOBiNIagQ4prL9V4Zg; dpr=1.5; csrftoken=1Eeolr1d8t3VMjwNQIeMMQx9JTlyUsGu; sessionid=50762414324%3Af7LRzwBjb06Q7U%3A6%3AAYfUreTnqm7V_o3Pvqt0Tej1vwMQDGjOKw_Zm8TOqA; ds_user_id=50762414324; rur="RVA\05450762414324\0541778817145:01f75b26510d73b461bb75b0f907b2ec268507a83f95fb7c5a8571ced3b614c68af0d6b5"; wd=1707x247'
 
 PARAMS = r'("app_id":\s*"[^"]+")|("claim":\s*"[^"]+")|("csrf_token":\s*"[^"]+")|(["LSD",[],{"token":\s*"[^"]+")'
 
@@ -71,11 +69,12 @@ def get_proxys(log):
 
 
 @retry(stop=stop_after_attempt(5), wait=wait_fixed(15), after=after_log)
-def ajax_request(log, url: str, params=None):
+def ajax_request(log, url: str, cookies, params=None):
     """
     请求封装
     :param log: logger对象
     :param url: api url
+    :param cookies: value sent as the 'cookie' request header
     :param params: api params
     :return: json object
     """
@@ -96,7 +95,7 @@ def ajax_request(log, url: str, params=None):
             'x-instagram-ajax': '1006400593',
             'sec-fetch-dest': 'empty',
             'user-agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.102 Safari/537.36',
-            'cookie': cookie
+            'cookie': cookies
         }
         resp = session.get(url, headers=headers, params=params)
         # print(resp.text)
@@ -108,12 +107,13 @@ def ajax_request(log, url: str, params=None):
         raise
 
 
-def get_userPosts(log, userName: str, sql_uid_list: list):
+def get_userPosts(log, userName: str, sql_uid_list: list, ins_cookies):
     """
     从用户名获取所有帖子
     :param log: logger对象
     :param userName:  用户名
     :param sql_uid_list: sql_uid_list 列表
+    :param ins_cookies: cookie value forwarded to ajax_request for each page request
     :return: generator
     """
     page = 1
@@ -126,7 +126,7 @@ def get_userPosts(log, userName: str, sql_uid_list: list):
         log.info(f"The page number currently requested is: {page}.........")
         # Url将在第二次请求时更改
         url = 'https://i.instagram.com/api/v1/feed/user' + f'/{temp}'
-        resp = ajax_request(log, url, params=continuation)
+        resp = ajax_request(log, url, ins_cookies, params=continuation)
 
         if not resp:
             log.error("API请求失败,跳过当前分页")
@@ -234,6 +234,8 @@ def ins_posts_main(log):
         raise RuntimeError("数据库连接池异常")
 
     try:
+        ins_cookies = sql_pool.select_one('select cookies from instagram_cookies')
+        ins_cookies = ins_cookies[0] if ins_cookies else None
         for user_name in USER_NAME_LIST:
             log.info(
                 f'-------------------------------- 开始爬取用户 {user_name} 的所有帖子 --------------------------------')
@@ -241,7 +243,7 @@ def ins_posts_main(log):
                                                (user_name,))
             sql_uid_list = [_[0] for _ in sql_uid_list]
             log.debug(f'查询到 uid 列表sql_uid_list的长度为: {len(sql_uid_list)}')
-            items_ = get_userPosts(log, user_name, sql_uid_list)
+            items_ = get_userPosts(log, user_name, sql_uid_list, ins_cookies)
             for item_ in items_:
                 # print(item_)
                 sql_pool.insert_one_or_dict('instagram_posts_record', item_)

+ 10 - 6
ins_img_video_spider/ins_posts_spider.py

@@ -17,7 +17,7 @@ from mysql_pool import MySQLConnectionPool
 
 USER_NAME_LIST = ['fanatics', 'hobbysbestcards']  # 查询的用户名列表 hobbysbestcards
 
-cookie = r'ig_did=8D2CD910-0CBD-41CD-A5B4-9EB7E2F8BC91; ps_l=1; ps_n=1; datr=0aYZaGecXnDrIALr4HPo5O0h; mid=aBmm0QALAAFOBiNIagQ4prL9V4Zg; dpr=1.5; csrftoken=1Eeolr1d8t3VMjwNQIeMMQx9JTlyUsGu; sessionid=50762414324%3Af7LRzwBjb06Q7U%3A6%3AAYfUreTnqm7V_o3Pvqt0Tej1vwMQDGjOKw_Zm8TOqA; ds_user_id=50762414324; rur="RVA\05450762414324\0541778817145:01f75b26510d73b461bb75b0f907b2ec268507a83f95fb7c5a8571ced3b614c68af0d6b5"; wd=1707x247'
+# cookie = r'ig_did=8D2CD910-0CBD-41CD-A5B4-9EB7E2F8BC91; ps_l=1; ps_n=1; datr=0aYZaGecXnDrIALr4HPo5O0h; mid=aBmm0QALAAFOBiNIagQ4prL9V4Zg; dpr=1.5; csrftoken=1Eeolr1d8t3VMjwNQIeMMQx9JTlyUsGu; sessionid=50762414324%3Af7LRzwBjb06Q7U%3A6%3AAYfUreTnqm7V_o3Pvqt0Tej1vwMQDGjOKw_Zm8TOqA; ds_user_id=50762414324; rur="RVA\05450762414324\0541778817145:01f75b26510d73b461bb75b0f907b2ec268507a83f95fb7c5a8571ced3b614c68af0d6b5"; wd=1707x247'
 
 PARAMS = r'("app_id":\s*"[^"]+")|("claim":\s*"[^"]+")|("csrf_token":\s*"[^"]+")|(["LSD",[],{"token":\s*"[^"]+")'
 
@@ -71,11 +71,12 @@ def get_proxys(log):
 
 
 @retry(stop=stop_after_attempt(5), wait=wait_fixed(15), after=after_log)
-def ajax_request(log, url: str, params=None):
+def ajax_request(log, url: str, cookies, params=None):
     """
     请求封装
     :param log: logger对象
     :param url: api url
+    :param cookies: value sent as the 'cookie' request header
     :param params: api params
     :return: json object
     """
@@ -96,7 +97,7 @@ def ajax_request(log, url: str, params=None):
             'x-instagram-ajax': '1006400593',
             'sec-fetch-dest': 'empty',
             'user-agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.102 Safari/537.36',
-            'cookie': cookie
+            'cookie': cookies
         }
         resp = session.get(url, headers=headers, params=params)
         # print(resp.text)
@@ -108,12 +109,13 @@ def ajax_request(log, url: str, params=None):
         raise
 
 
-def get_userPosts(log, userName: str, sql_uid_list: list):
+def get_userPosts(log, userName: str, sql_uid_list: list, ins_cookies):
     """
     从用户名获取所有帖子
     :param log: logger对象
     :param userName:  用户名
     :param sql_uid_list: sql_uid_list 列表
+    :param ins_cookies: cookie value forwarded to ajax_request for each page request
     :return: generator
     """
     page = 1
@@ -126,7 +128,7 @@ def get_userPosts(log, userName: str, sql_uid_list: list):
         log.info(f"The page number currently requested is: {page}.........")
         # Url将在第二次请求时更改
         url = 'https://i.instagram.com/api/v1/feed/user' + f'/{temp}'
-        resp = ajax_request(log, url, params=continuation)
+        resp = ajax_request(log, url, ins_cookies, params=continuation)
 
         if not resp:
             log.error("API请求失败,跳过当前分页")
@@ -234,6 +236,8 @@ def ins_posts_main(log):
         raise RuntimeError("数据库连接池异常")
 
     try:
+        ins_cookies = sql_pool.select_one('select cookies from instagram_cookies')
+        ins_cookies = ins_cookies[0] if ins_cookies else None
         for user_name in USER_NAME_LIST:
             log.info(
                 f'-------------------------------- 开始爬取用户 {user_name} 的所有帖子 --------------------------------')
@@ -241,7 +245,7 @@ def ins_posts_main(log):
                                                (user_name,))
             sql_uid_list = [_[0] for _ in sql_uid_list]
             log.debug(f'查询到 uid 列表sql_uid_list的长度为: {len(sql_uid_list)}')
-            items_ = get_userPosts(log, user_name, sql_uid_list)
+            items_ = get_userPosts(log, user_name, sql_uid_list, ins_cookies)
             for item_ in items_:
                 # print(item_)
                 sql_pool.insert_one_or_dict('instagram_posts_record', item_)