python - 如何将超过 200 条下载的推文附加到数据框?
问题描述
我正在使用循环下载超过 twitter 的速率上限;但是,当我尝试附加列表时,它会返回一个空数据框。
我的功能如下:
在:
import pandas as pd
import numpy as np
import tweepy
from datetime import timedelta
def get_tweets(handle):
batch_count_for_tweet_downloads = 200
try:
alltweets = []
tweets = api_twitter.user_timeline(screen_name=handle,
count=batch_count_for_tweet_downloads,
exclude_replies=True,
include_rts=False,
lang="en",
tweet_mode="extended")
# ---GET MORE THAN 200 TWEETS
alltweets.extend(tweets)
oldest = alltweets[-1].id - 1
oldest_datetime = pd.to_datetime(str(pd.to_datetime(oldest))[:-10]).strftime("%Y-%m-%d %H:%M:%S")
print(f"Getting Tweets For " + handle + ", After: " + oldest_datetime)
while len(tweets) > 0:
tweets = api_twitter.user_timeline(screen_name=handle, count=batch_count_for_tweet_downloads, max_id=oldest)
alltweets.extend(tweets)
oldest = alltweets[-1].id - 1
print("Count: " + f"...{len(alltweets)} " + handle + " Tweets Downloaded")
#---
df = pd.DataFrame(data=[tweets.user.screen_name for tweets in alltweets], columns=['Handle'])
df['Tweets'] = np.array([tweets.full_text for tweets in alltweets])
df['Date'] = np.array([tweets.created_at - timedelta(hours=4) for tweets in alltweets])
df['Len'] = np.array([len(tweets.full_text) for tweets in alltweets])
df['Like_count'] = np.array([tweets.favorite_count for tweets in alltweets])
df['RT_count'] = np.array([tweets.retweet_count for tweets in alltweets])
total_tweets.extend(alltweets)
print(handle + " Total Tweets Extracted: {}".format(len(alltweets)))
except:
pass
return df
如您所见,我需要一些帮助来将循环合并到函数中。
这样做的最佳方法是什么?
提前谢谢你的帮助。
编辑 1:(我的代码现在的样子)
在:
import tweepy
import pandas as pd
import numpy as np
from datetime import timedelta
handles = ['@MrML16419203', '@d00tn00t']
consumerKey = 'x'
consumerSecret = 'x'
accessToken = 'x'
accessTokenSecret = 'x'
authenticate = tweepy.OAuthHandler(consumerKey, consumerSecret)
authenticate.set_access_token(accessToken, accessTokenSecret)
api_twitter = tweepy.API(authenticate, wait_on_rate_limit=True)
total_tweets = []
def get_tweets(handle):
batch_count_for_tweet_downloads = 200
try:
alltweets = []
tweets = api_twitter.user_timeline(screen_name=handle,
count=batch_count_for_tweet_downloads,
exclude_replies=True,
include_rts=False,
lang="en",
tweet_mode="extended")
alltweets.extend(tweets)
oldest = alltweets[-1].id - 1
oldest_datetime = pd.to_datetime(str(pd.to_datetime(oldest))[:-10]).strftime("%Y-%m-%d %H:%M:%S")
print(f"Getting Tweets For " + handle + ", After: " + oldest_datetime)
while len(tweets) > 0:
tweets = api_twitter.user_timeline(screen_name=handle, count=batch_count_for_tweet_downloads, max_id=oldest)
alltweets.extend(tweets)
if len(alltweets) > 0:
oldest = alltweets[-1].id - 1
else:
pass
print("Count: " + f"...{len(alltweets)} " + handle + " Tweets Downloaded")
print('---Total Downloaded: ' + str(len(alltweets)) + ' for ' + handle + '---')
df = pd.DataFrame(data=[tweets.user.screen_name for tweets in alltweets], columns=['Handle'])
df['Tweets'] = np.array([tweets.full_text for tweets in alltweets])
df['Date'] = np.array([tweets.created_at - timedelta(hours=4) for tweets in alltweets])
df['Len'] = np.array([len(tweets.full_text) for tweets in alltweets])
df['Like_count'] = np.array([tweets.favorite_count for tweets in alltweets])
df['RT_count'] = np.array([tweets.retweet_count for tweets in alltweets])
print([tweets.favorite_count for tweets in alltweets])
print(np.array([tweets.favorite_count for tweets in alltweets]))
total_tweets.extend(alltweets)
print("----------Total Tweets Extracted: {}".format(df.shape[0]) + "----------")
except:
pass
return df
df = pd.DataFrame()
for handle in handles:
df_new = get_tweets(handle)
df = pd.concat((df, df_new))
print(df)
出去:
Getting Tweets For @MrML16419203, After: 2011-03-19 07:03:53
Count: ...136 @MrML16419203 Tweets Downloaded
---Total Downloaded: 136 for @MrML16419203---
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
----------Total Tweets Extracted: 136----------
Getting Tweets For @d00tn00t, After: 2009-11-27 19:18:58
Count: ...338 @d00tn00t Tweets Downloaded
Count: ...530 @d00tn00t Tweets Downloaded
Count: ...546 @d00tn00t Tweets Downloaded
Count: ...546 @d00tn00t Tweets Downloaded
---Total Downloaded: 546 for @d00tn00t---
Handle Tweets Date Len Like_count RT_count
0 MrML16419203 132716 2020-09-02 02:18:28 6.0 0.0 0.0
1 MrML16419203 432881 2020-09-02 02:04:23 6.0 0.0 0.0
2 MrML16419203 973625 2020-09-02 02:04:09 6.0 0.0 0.0
3 MrML16419203 1234567 2020-09-02 01:55:10 7.0 0.0 0.0
4 MrML16419203 225865 2020-09-02 01:27:11 6.0 0.0 0.0
.. ... ... ... ... ... ...
541 d00tn00t NaN NaT NaN NaN NaN
542 d00tn00t NaN NaT NaN NaN NaN
543 d00tn00t NaN NaT NaN NaN NaN
544 d00tn00t NaN NaT NaN NaN NaN
545 d00tn00t NaN NaT NaN NaN NaN
[682 rows x 6 columns]
正如您所看到的,对于具有少于 200 条推文的句柄,数据框会被填充。但是,不适用于包含超过 200 条推文的句柄。
解决方案
对于任何偶然发现这一点的人,我让它工作:
def get_tweets(screen_name):
batch_count_for_tweet_downloads = 200
try:
alltweets = []
tweets = api_twitter.user_timeline(screen_name=screen_name,
count=batch_count_for_tweet_downloads,
exclude_replies=True,
include_rts=False,
lang="en")
alltweets.extend(tweets)
oldest = alltweets[-1].id - 1
oldest_datetime = pd.to_datetime(str(pd.to_datetime(oldest))[:-10]).strftime("%Y-%m-%d %H:%M:%S")
print(f"Getting Tweets For " + handle + ", After: " + oldest_datetime)
while len(tweets) > 0:
tweets = api_twitter.user_timeline(screen_name=screen_name, count=batch_count_for_tweet_downloads,
max_id=oldest)
alltweets.extend(tweets)
if len(alltweets) > 0:
oldest = alltweets[-1].id - 1
else:
pass
print("Count: " + f"...{len(alltweets)} " + handle + " Tweets Downloaded")
outtweets = [
[tweet.user.screen_name, tweet.text, tweet.created_at, len(tweet.text),
tweet.favorite_count, tweet.retweet_count] for tweet in alltweets]
df_tweet_function = pd.DataFrame(outtweets,
columns=['Handle', 'Tweets', 'Date', 'Len', 'Like_count', 'RT_count'])
print('----------Total Downloaded: ' + str(len(alltweets)) + ' for ' + handle + '----------')
except tweepy.error.TweepError:
pass
return df_tweet_function
df = pd.DataFrame() if name == ' main ': 句柄中的句柄:get_tweets(handle) df = df.append(get_tweets(handle)) print("------------ ---提取的推文总数:{}".format(df.shape[0]) + "----------------")
推荐阅读
- http - 使用其 IP 地址从浏览器访问 GCE Windows Sever 2019 虚拟机
- javascript - 如何将数据参数添加到 HTML 字符串并返回字符串?
- c - 关于Producer-Consumer程序写入文件的问题
- python - 用于注册和登录的 Django 不同的 USERNAME 字段
- shell - 有没有办法在 Ubuntu WSL 中使用“Ctrl-Shift-(箭头键)”突出显示整个单词?
- apache - 如何在同一个 apache 网络服务器上运行 mod-mono 和 php
- continuous-integration - GitLab CI 登录到没有 DinD 的私人仓库
- html - 如何在 Angular 中动态更改环境变量?
- javascript - 未从 omdbapi.com 获取数据
- python - 根据一帧的行值和另一帧的列值合并两个数据帧