python - BeautifulSoup Instagram 帖子 html 抓取
问题描述
我想从特定的 Instagram 帖子页面(例如 https://www.instagram.com/p/BoFlrM7gwnK/)中抓取帖子描述。我已有一段代码可以从 Instagram 个人主页获取最近的帖子,但它会输出很多不需要的信息,比如页面中的一些脚本。
from random import choice
import json
from pprint import pprint
import requests
from bs4 import BeautifulSoup
# Default User-Agent pool used when the scraper is not given its own list.
# The literal is split with implicit string concatenation so that it stays a
# single valid header value (a raw line break inside the quotes is a
# SyntaxError).
_user_agents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/65.0.3325.181 Safari/537.36',
]
class InstagramScraper:
    """Fetch Instagram pages and read the JSON embedded in their HTML.

    The page is expected to carry a ``<script>`` inside ``<body>`` of the form
    ``window._sharedData = {...};``; that JSON object is what the public
    methods navigate.

    Args:
        user_agents: Optional list of User-Agent strings to pick from at
            random. Anything that is not a non-empty list falls back to the
            module-level ``_user_agents`` pool.
        proxy: Optional proxy URL used for both ``http`` and ``https``.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Added so a stalled connection cannot hang the caller forever.
    """

    # Prefix of the JavaScript assignment that wraps the embedded JSON.
    _SHARED_DATA_MARKER = 'window._sharedData'

    def __init__(self, user_agents=None, proxy=None, timeout=10):
        self.user_agents = user_agents
        self.proxy = proxy
        self.timeout = timeout

    def __random_agent(self):
        """Return a random User-Agent from the instance list or the default pool."""
        if self.user_agents and isinstance(self.user_agents, list):
            return choice(self.user_agents)
        return choice(_user_agents)

    def __request_url(self, url):
        """GET ``url`` and return the response body as text.

        Raises:
            requests.HTTPError: The server answered with an error status.
            requests.RequestException: Any other transport failure; the
                original exception propagates untouched so its details
                (timeout, connection error, ...) are not lost.
        """
        try:
            response = requests.get(
                url,
                headers={'User-Agent': self.__random_agent()},
                # Both keys are always passed, even when the proxy is None,
                # exactly as the original call did.
                proxies={'http': self.proxy, 'https': self.proxy},
                timeout=self.timeout,
            )
            response.raise_for_status()
        except requests.HTTPError as err:
            raise requests.HTTPError(
                'Received non 200 status code from Instagram') from err
        return response.text

    @staticmethod
    def _parse_shared_data(script_text):
        """Decode the JSON object from a ``window._sharedData = {...};`` script.

        Only the assignment prefix and the trailing ``;`` are removed.
        Semicolons inside the JSON (for example in captions) are preserved.
        """
        payload = script_text.strip()
        marker = InstagramScraper._SHARED_DATA_MARKER
        if payload.startswith(marker):
            payload = payload[len(marker):].lstrip()
            if payload.startswith('='):
                payload = payload[1:]
        payload = payload.strip().rstrip(';').strip()
        return json.loads(payload)

    @staticmethod
    def extract_json_data(html):
        """Return the shared-data JSON embedded in ``html`` as Python objects.

        Raises:
            ValueError: No ``<script>`` tag exists inside ``<body>``, or the
                script content is not valid JSON (``json.JSONDecodeError`` is
                a ``ValueError`` subclass).
        """
        soup = BeautifulSoup(html, 'html.parser')
        body = soup.find('body')
        scripts = body.find_all('script') if body is not None else []
        if not scripts:
            raise ValueError('No <script> tag found in page body')
        marker = InstagramScraper._SHARED_DATA_MARKER
        # Prefer the script that actually holds the shared data; fall back to
        # the first script, which is what earlier versions always used.
        script_tag = next(
            (tag for tag in scripts if marker in tag.text), scripts[0])
        return InstagramScraper._parse_shared_data(script_tag.text)

    def profile_page_metrics(self, profile_url):
        """Return the truthy top-level fields of a profile's ``user`` object.

        Dict-valued fields are collapsed to their ``'count'`` entry, and the
        ``edge_owner_to_timeline_media`` field is skipped.
        """
        response = self.__request_url(profile_url)
        json_data = self.extract_json_data(response)
        metrics = json_data['entry_data']['ProfilePage'][0]['graphql']['user']

        results = {}
        for key, value in metrics.items():
            if key == 'edge_owner_to_timeline_media' or not value:
                continue
            results[key] = value['count'] if isinstance(value, dict) else value
        return results

    def profile_page_recent_posts(self, profile_url):
        """Return the ``node`` dicts of the posts listed on a profile page."""
        response = self.__request_url(profile_url)
        json_data = self.extract_json_data(response)
        user = json_data['entry_data']['ProfilePage'][0]['graphql']['user']
        edges = user['edge_owner_to_timeline_media']["edges"]

        nodes = (edge.get('node') for edge in edges)
        return [node for node in nodes if node and isinstance(node, dict)]
# Example usage: build a scraper with the default User-Agent pool and no proxy,
# then fetch the post nodes listed on one public profile page.
k = InstagramScraper()
results=k.profile_page_recent_posts('https://www.instagram.com/selenagomez/')
# Each result is the raw 'node' dict taken from the page's embedded JSON.
pprint(results)
有没有办法从它的网址获取特定帖子的信息?任何帮助,将不胜感激。
解决方案
只需仿照 profile_page_recent_posts() 方法,在 InstagramScraper 类中再添加一个类似的方法,例如:
def get_single_posts(self, post_url):
    """Return the caption text and shortcode of a single post page.

    Meant to be added as a method of ``InstagramScraper``: it relies on that
    class's private ``__request_url`` helper and on ``extract_json_data``.

    Args:
        post_url: URL of one post, e.g. ``https://www.instagram.com/p/<code>/``.

    Returns:
        A one-element list ``[{'text': ..., 'shortcode': ...}]``. ``'text'``
        is an empty string when the post has no caption.
    """
    response = self.__request_url(post_url)
    json_data = self.extract_json_data(response)
    media = json_data['entry_data']['PostPage'][0]['graphql']['shortcode_media']

    # A post without a caption has an empty 'edges' list; indexing [0]
    # unconditionally would raise IndexError.
    caption_edges = media['edge_media_to_caption']['edges']
    post_text = caption_edges[0]['node']['text'] if caption_edges else ''

    return [{'text': post_text, 'shortcode': media['shortcode']}]
输出:
[{'text': 'Mood lol....', 'shortcode': 'BoFlrM7gwnK'}]
要找到所需的值,可以先把 json_data 保存到文件,然后用 JSON 查看器找出正确的键。
推荐阅读
- php - SQL查询从数据库中的一个表中获取所有帖子,但来自特定用户的最多5个条目?
- python - Module Not Found Error with one version of Python, but not the other
- python - KivyMD | 列表项打印其“文本”on_press
- python - 更高效的 Django (SQL) 查询
- c++ - 将 std::enable_if 与外联成员函数和模板化静态成员条件一起使用
- r - 为什么权限被拒绝从 R 写入此文件?
- java - jersey 单路径多类实现(以编程方式选择一个实例)类似于条件依赖注入
- typescript - 如何创建一个部分应用另一个函数的泛型的函数
- r - 带有 R 打印代码的 org-mode 不是图形
- linux - tsch 中的算术