|
@@ -5,8 +5,6 @@
|
|
|
import inspect
|
|
import inspect
|
|
|
import random
|
|
import random
|
|
|
import time
|
|
import time
|
|
|
-
|
|
|
|
|
-import schedule
|
|
|
|
|
from loguru import logger
|
|
from loguru import logger
|
|
|
from datetime import datetime
|
|
from datetime import datetime
|
|
|
# from tls_client import Session
|
|
# from tls_client import Session
|
|
@@ -17,7 +15,7 @@ from mysql_pool import MySQLConnectionPool
|
|
|
|
|
|
|
|
USER_NAME_LIST = ['hobbysbestcards'] # 查询的用户名列表 hobbysbestcards
|
|
USER_NAME_LIST = ['hobbysbestcards'] # 查询的用户名列表 hobbysbestcards
|
|
|
|
|
|
|
|
-cookie = r'ig_did=8D2CD910-0CBD-41CD-A5B4-9EB7E2F8BC91; ps_l=1; ps_n=1; datr=0aYZaGecXnDrIALr4HPo5O0h; mid=aBmm0QALAAFOBiNIagQ4prL9V4Zg; dpr=1.5; csrftoken=1Eeolr1d8t3VMjwNQIeMMQx9JTlyUsGu; sessionid=50762414324%3Af7LRzwBjb06Q7U%3A6%3AAYfUreTnqm7V_o3Pvqt0Tej1vwMQDGjOKw_Zm8TOqA; ds_user_id=50762414324; rur="RVA\05450762414324\0541778817145:01f75b26510d73b461bb75b0f907b2ec268507a83f95fb7c5a8571ced3b614c68af0d6b5"; wd=1707x247'
|
|
|
|
|
|
|
+# cookie = r'ig_did=8D2CD910-0CBD-41CD-A5B4-9EB7E2F8BC91; ps_l=1; ps_n=1; datr=0aYZaGecXnDrIALr4HPo5O0h; mid=aBmm0QALAAFOBiNIagQ4prL9V4Zg; dpr=1.5; csrftoken=1Eeolr1d8t3VMjwNQIeMMQx9JTlyUsGu; sessionid=50762414324%3Af7LRzwBjb06Q7U%3A6%3AAYfUreTnqm7V_o3Pvqt0Tej1vwMQDGjOKw_Zm8TOqA; ds_user_id=50762414324; rur="RVA\05450762414324\0541778817145:01f75b26510d73b461bb75b0f907b2ec268507a83f95fb7c5a8571ced3b614c68af0d6b5"; wd=1707x247'
|
|
|
|
|
|
|
|
PARAMS = r'("app_id":\s*"[^"]+")|("claim":\s*"[^"]+")|("csrf_token":\s*"[^"]+")|(["LSD",[],{"token":\s*"[^"]+")'
|
|
PARAMS = r'("app_id":\s*"[^"]+")|("claim":\s*"[^"]+")|("csrf_token":\s*"[^"]+")|(["LSD",[],{"token":\s*"[^"]+")'
|
|
|
|
|
|
|
@@ -71,11 +69,12 @@ def get_proxys(log):
|
|
|
|
|
|
|
|
|
|
|
|
|
@retry(stop=stop_after_attempt(5), wait=wait_fixed(15), after=after_log)
|
|
@retry(stop=stop_after_attempt(5), wait=wait_fixed(15), after=after_log)
|
|
|
-def ajax_request(log, url: str, params=None):
|
|
|
|
|
|
|
+def ajax_request(log, url: str, cookies, params=None):
|
|
|
"""
|
|
"""
|
|
|
请求封装
|
|
请求封装
|
|
|
:param log: logger对象
|
|
:param log: logger对象
|
|
|
:param url: api url
|
|
:param url: api url
|
|
|
|
|
+ :param cookies: cookies
|
|
|
:param params: api params
|
|
:param params: api params
|
|
|
:return: json object
|
|
:return: json object
|
|
|
"""
|
|
"""
|
|
@@ -96,7 +95,7 @@ def ajax_request(log, url: str, params=None):
|
|
|
'x-instagram-ajax': '1006400593',
|
|
'x-instagram-ajax': '1006400593',
|
|
|
'sec-fetch-dest': 'empty',
|
|
'sec-fetch-dest': 'empty',
|
|
|
'user-agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.102 Safari/537.36',
|
|
'user-agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.102 Safari/537.36',
|
|
|
- 'cookie': cookie
|
|
|
|
|
|
|
+ 'cookie': cookies
|
|
|
}
|
|
}
|
|
|
resp = session.get(url, headers=headers, params=params)
|
|
resp = session.get(url, headers=headers, params=params)
|
|
|
# print(resp.text)
|
|
# print(resp.text)
|
|
@@ -108,12 +107,13 @@ def ajax_request(log, url: str, params=None):
|
|
|
raise
|
|
raise
|
|
|
|
|
|
|
|
|
|
|
|
|
-def get_userPosts(log, userName: str, sql_uid_list: list):
|
|
|
|
|
|
|
+def get_userPosts(log, userName: str, sql_uid_list: list, ins_cookies):
|
|
|
"""
|
|
"""
|
|
|
从用户名获取所有帖子
|
|
从用户名获取所有帖子
|
|
|
:param log: logger对象
|
|
:param log: logger对象
|
|
|
:param userName: 用户名
|
|
:param userName: 用户名
|
|
|
:param sql_uid_list: sql_uid_list 列表
|
|
:param sql_uid_list: sql_uid_list 列表
|
|
|
|
|
+ :param ins_cookies: ins_cookies
|
|
|
:return: generator
|
|
:return: generator
|
|
|
"""
|
|
"""
|
|
|
page = 1
|
|
page = 1
|
|
@@ -126,7 +126,7 @@ def get_userPosts(log, userName: str, sql_uid_list: list):
|
|
|
log.info(f"The page number currently requested is: {page}.........")
|
|
log.info(f"The page number currently requested is: {page}.........")
|
|
|
# Url将在第二次请求时更改
|
|
# Url将在第二次请求时更改
|
|
|
url = 'https://i.instagram.com/api/v1/feed/user' + f'/{temp}'
|
|
url = 'https://i.instagram.com/api/v1/feed/user' + f'/{temp}'
|
|
|
- resp = ajax_request(log, url, params=continuation)
|
|
|
|
|
|
|
+ resp = ajax_request(log, url, ins_cookies, params=continuation)
|
|
|
|
|
|
|
|
if not resp:
|
|
if not resp:
|
|
|
log.error("API请求失败,跳过当前分页")
|
|
log.error("API请求失败,跳过当前分页")
|
|
@@ -234,6 +234,8 @@ def ins_posts_main(log):
|
|
|
raise RuntimeError("数据库连接池异常")
|
|
raise RuntimeError("数据库连接池异常")
|
|
|
|
|
|
|
|
try:
|
|
try:
|
|
|
|
|
+ ins_cookies = sql_pool.select_one('select cookies from instagram_cookies')
|
|
|
|
|
+ ins_cookies = ins_cookies[0] if ins_cookies else None
|
|
|
for user_name in USER_NAME_LIST:
|
|
for user_name in USER_NAME_LIST:
|
|
|
log.info(
|
|
log.info(
|
|
|
f'-------------------------------- 开始爬取用户 {user_name} 的所有帖子 --------------------------------')
|
|
f'-------------------------------- 开始爬取用户 {user_name} 的所有帖子 --------------------------------')
|
|
@@ -241,7 +243,7 @@ def ins_posts_main(log):
|
|
|
(user_name,))
|
|
(user_name,))
|
|
|
sql_uid_list = [_[0] for _ in sql_uid_list]
|
|
sql_uid_list = [_[0] for _ in sql_uid_list]
|
|
|
log.debug(f'查询到 uid 列表sql_uid_list的长度为: {len(sql_uid_list)}')
|
|
log.debug(f'查询到 uid 列表sql_uid_list的长度为: {len(sql_uid_list)}')
|
|
|
- items_ = get_userPosts(log, user_name, sql_uid_list)
|
|
|
|
|
|
|
+ items_ = get_userPosts(log, user_name, sql_uid_list, ins_cookies)
|
|
|
for item_ in items_:
|
|
for item_ in items_:
|
|
|
# print(item_)
|
|
# print(item_)
|
|
|
sql_pool.insert_one_or_dict('instagram_posts_record', item_)
|
|
sql_pool.insert_one_or_dict('instagram_posts_record', item_)
|