I am scraping tweets from Twitter. Because of Twitter's rate limits, I scrape 2500 tweets every 15 minutes, but I observe that each run after 15 minutes returns the same tweets. Is there any way to skip the previously scraped tweets using some kind of offset? Thank you!
Here is my code:
# Import libraries
from tweepy import OAuthHandler
#from tweepy.streaming import StreamListener
import tweepy
import csv
import pandas as pd
#import re
#from textblob import TextBlob
#import string
#import preprocessor as p
#import os
import time
# Twitter credentials
consumer_key = ''
consumer_secret = ''
access_key = ''
access_secret = ''
# Pass your twitter credentials to tweepy via its OAuthHandler
auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_key, access_secret)
api = tweepy.API(auth)
def extract_tweets(search_words, date_since, numTweets):
    return tweepy.Cursor(api.search, q=search_words, lang="en", since=date_since, tweet_mode='extended').items(numTweets)
def scrapetweets(search_words, date_since, numTweets, numRuns):
    # Define a pandas dataframe to store the data:
    db_tweets = pd.DataFrame(columns = ['username', 'acctdesc', 'location', 'following', 'followers', 'totaltweets', 'usercreatedts', 'tweetcreatedts', 'retweetcount', 'text', 'hashtags'])
    #db_tweets = pd.DataFrame()
    for i in range(numRuns):
        tweets = extract_tweets(search_words, date_since, numTweets)
        # Store these tweets into a python list
        tweet_list = [tweet for tweet in tweets]
        print(len(tweet_list))
        noTweets = 0
        for tweet in tweet_list:
            username = tweet.user.screen_name
            acctdesc = tweet.user.description
            location = tweet.user.location
            following = tweet.user.friends_count
            followers = tweet.user.followers_count
            totaltweets = tweet.user.statuses_count
            usercreatedts = tweet.user.created_at
            tweetcreatedts = tweet.created_at
            retweetcount = tweet.retweet_count
            hashtags = tweet.entities['hashtags']
            lst = []
            for h in hashtags:
                lst.append(h['text'])
            try:
                text = tweet.retweeted_status.full_text
            except AttributeError:  # Not a Retweet
                text = tweet.full_text
            itweet = [username, acctdesc, location, following, followers, totaltweets, usercreatedts, tweetcreatedts, retweetcount, text, lst]
            db_tweets.loc[len(db_tweets)] = itweet
            noTweets += 1
            print(noTweets, itweet)
            #filename = "tweets.csv"
            #with open(filename, "a", newline='') as fp:
            #    wr = csv.writer(fp, dialect='excel')
            #    wr.writerow(itweet)
        print('no. of tweets scraped for run {} is {}'.format(i + 1, noTweets))
        if i + 1 != numRuns:
            time.sleep(920)
    filename = "tweets.csv"
    # Append the dataframe to the csv file
    db_tweets.to_csv(filename, mode='a', index=False)
# Initialise these variables:
search_words = "#India OR #COVID-19"
date_since = "2020-04-29"
#date_until = "2020-05-01"
numTweets = 2500
numRuns = 10
# Call the function scrapetweets
program_start = time.time()
scrapetweets(search_words, date_since, numTweets, numRuns)
program_end = time.time()
print('Scraping has completed!')
print('Total time taken to scrape is {} minutes.'.format(round((program_end - program_start) / 60, 2)))
I referred to a blog on Medium for this purpose.
You can add a variable as a validator and store it in a file, e.g. tweetid.txt.
Each time you run the script, open tweetid.txt.
If a tweet's id matches an id already stored in the file, skip that tweet.
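A minimal sketch of that idea, assuming an authenticated tweepy.API object called api as in the question (the file name last_tweet_id.txt and the helper names are illustrative, not from the original code): persist the newest tweet id from each run and pass it back to api.search via its since_id parameter, so the search only returns tweets newer than the ones already scraped.

import os
import tweepy

ID_FILE = "last_tweet_id.txt"  # hypothetical file used to remember the newest id seen

def read_last_id():
    # Return the newest tweet id saved by the previous run, or None on the first run.
    if os.path.exists(ID_FILE):
        with open(ID_FILE) as f:
            content = f.read().strip()
            if content:
                return int(content)
    return None

def write_last_id(tweet_id):
    # Persist the newest id so the next run can skip everything up to and including it.
    with open(ID_FILE, "w") as f:
        f.write(str(tweet_id))

def extract_new_tweets(api, search_words, numTweets):
    since_id = read_last_id()
    kwargs = dict(q=search_words, lang="en", tweet_mode='extended')
    if since_id is not None:
        # since_id makes the search endpoint return only tweets newer than this id.
        kwargs['since_id'] = since_id
    tweets = [t for t in tweepy.Cursor(api.search, **kwargs).items(numTweets)]
    if tweets:
        write_last_id(max(t.id for t in tweets))
    return tweets

With this in place, each run would call extract_new_tweets(api, search_words, numTweets) instead of extract_tweets, and only tweets that were not returned by a previous run are processed.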