[英]How do I scrape twitter using tweepy?
我尝试用下面的代码通过 tweepy 抓取 Twitter 数据。代码运行时没有报错,但我找不到任何生成的推文文件。我分别在 Google Colab、Jupyter Notebook、IBM Watson Pad 和 Spyder 中运行过这段代码,但每次都是直接执行结束,既没有连接到 Twitter,也没有任何有意义的输出。
#Needed Libraries
from tweepy import OAuthHandler
from tweepy.streaming import StreamListener
import tweepy
import json
import pandas as pd
import csv
import re
from textblob import TextBlob
import string
import preprocessor as p
import os
import time
# Twitter credentials: placeholder values, to be replaced with the keys
# issued for your Twitter developer account.
consumer_key, consumer_secret = 'Consumer_Key', 'Secret'
access_key, access_secret = 'access key', 'access secret'

# Build the OAuth handler from the consumer pair, attach the access token,
# and create the API client that the rest of the script uses as `api`.
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_key, access_secret)
api = tweepy.API(auth)
# Function for scraping tweets
# Column order of the output CSV; _tweet_to_row builds its rows in this order.
_TWEET_COLUMNS = ['acctdesc', 'location', 'following', 'followers',
                  'totaltweets', 'usercreatedts', 'tweetcreatedts',
                  'retweetcount', 'text', 'hashtags']

# Pause between two consecutive runs, in seconds (a little over 15 minutes).
_SECONDS_BETWEEN_RUNS = 920


def _tweet_to_row(tweet):
    """Flatten one status object into a list ordered like _TWEET_COLUMNS."""
    user = tweet.user
    try:
        # A retweet carries the original status; use its untruncated text.
        text = tweet.retweeted_status.full_text
    except AttributeError:  # not a retweet
        text = tweet.full_text
    return [
        user.description,           # acctdesc
        user.location,              # location
        user.friends_count,         # following
        user.followers_count,       # followers
        user.statuses_count,        # totaltweets
        user.created_at,            # usercreatedts
        tweet.created_at,           # tweetcreatedts
        tweet.retweet_count,        # retweetcount
        text,                       # text
        tweet.entities['hashtags'],  # hashtags
    ]


def scrapetweets(search_words, date_since, end_date, numTweets, numRuns):
    """Search for tweets in several runs and save them to one CSV file.

    Each run issues the same search through the module-level ``api`` client
    and collects up to ``numTweets`` results. After all runs the collected
    rows are written to ``./data/<YYYYmmdd_HHMMSS>.csv``; the ``data``
    directory is created when it does not exist yet.

    Args:
        search_words: Query string passed to the search call as ``q``.
        date_since: Passed to the search call as ``since``.
        end_date: Passed to the search call as ``until``.
        numTweets: Maximum number of tweets requested per run.
        numRuns: Number of runs. The function pauses between runs (not
            after the last one) because a large request cannot be made in
            one go.

    Returns:
        None. The result is the CSV file plus progress messages on stdout.

    NOTE(review): every run repeats the identical query, so several runs
    may store the same tweets more than once -- confirm whether
    de-duplication is wanted.
    NOTE(review): ``api.search`` is the Tweepy 3.x name; Tweepy 4 renamed it
    to ``search_tweets`` -- confirm against the installed version.
    """
    from datetime import datetime

    # Rows are gathered in a plain list and turned into a DataFrame once,
    # instead of growing the DataFrame one row at a time.
    rows = []
    program_start = time.time()
    for i in range(numRuns):
        # Time how long each run takes.
        start_run = time.time()
        # Cursor() pages through the search results; items(numTweets) caps
        # how many statuses are yielded in this run.
        tweets = tweepy.Cursor(api.search, q=search_words, lang="en",
                               since=date_since, until=end_date,
                               tweet_mode='extended').items(numTweets)
        run_rows = [_tweet_to_row(tweet) for tweet in tweets]
        rows.extend(run_rows)
        noTweets = len(run_rows)
        # Run ended:
        duration_run = round((time.time() - start_run) / 60, 2)
        print('no. of tweets scraped for run {} is {}'.format(i + 1, noTweets))
        print('time take for {} run to complete is {} mins'.format(i+1, duration_run))
        # Wait only when another run follows; sleeping after the final run
        # would just delay writing the file.
        if i < numRuns - 1:
            time.sleep(_SECONDS_BETWEEN_RUNS)

    db_tweets = pd.DataFrame(rows, columns=_TWEET_COLUMNS)

    # Once all runs have completed, save them to a single csv file named
    # after the current timestamp.
    to_csv_timestamp = datetime.today().strftime('%Y%m%d_%H%M%S')
    path = "./"
    out_dir = os.path.join(path, 'data')
    # to_csv does not create missing directories, so make sure it exists.
    os.makedirs(out_dir, exist_ok=True)
    filename = os.path.join(out_dir, to_csv_timestamp + '.csv')
    db_tweets.to_csv(filename, index = False)
    program_end = time.time()
    print('Scraping has completed!')
    # Convert to minutes first, then round to two decimals.
    print('Total time taken to scrape is {} minutes.'.format(round((program_end - program_start) / 60, 2)))
# Initialise variables:
# The query must not be empty; the original line had a stray quote that left
# search_words as '' and turned the intended term into a comment.
search_words = '#Trump'
# NOTE(review): Twitter's standard search endpoint is documented to cover
# only about the last 7 days, so a 2019 date range will likely return no
# tweets -- confirm and move these dates into the current window.
date_since = '2019-07-30'
end_date = '2019-08-04'
numTweets = 1000
numRuns = 1
# Call the function scrapetweets
scrapetweets(search_words, date_since, end_date, numTweets, numRuns)
如何解决?
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.