Tweepy returns same tweets when scraping data repeatedly

I am scraping tweets from Twitter. Since Twitter limits how many requests you can make, I scrape 2,500 tweets every 15 minutes. However, each run after the 15-minute wait returns the same tweets. Is there any way to skip the previously scraped tweets using some kind of offset? Thank you!

Here is my code:

# Import libraries
from tweepy import OAuthHandler
#from tweepy.streaming import StreamListener
import tweepy
import csv
import pandas as pd
#import re
#from textblob import TextBlob
#import string
#import preprocessor as p
#import os
import time

# Twitter credentials
consumer_key = ''
consumer_secret = ''
access_key = ''
access_secret = ''

# Pass your twitter credentials to tweepy via its OAuthHandler
auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_key, access_secret)
api = tweepy.API(auth)

        
def extract_tweets(search_words,date_since,numTweets):
    return(tweepy.Cursor(api.search, q=search_words, lang="en", since=date_since, tweet_mode='extended').items(numTweets))

def scrapetweets(search_words, date_since, numTweets, numRuns):
    # Define a pandas dataframe to store the date:
    db_tweets = pd.DataFrame(columns = ['username', 'acctdesc', 'location', 'following', 'followers', 'totaltweets', 'usercreatedts', 'tweetcreatedts', 'retweetcount', 'text', 'hashtags'])
    #db_tweets = pd.DataFrame()
    
    for i in range(numRuns):
        
        tweets = extract_tweets(search_words,date_since,numTweets)
        # Store these tweets into a python list
        tweet_list = [tweet for tweet in tweets]
        print(len(tweet_list))
        noTweets = 0
        
        for tweet in tweet_list:
            username = tweet.user.screen_name
            acctdesc = tweet.user.description
            location = tweet.user.location
            following = tweet.user.friends_count
            followers = tweet.user.followers_count
            totaltweets = tweet.user.statuses_count
            usercreatedts = tweet.user.created_at
            tweetcreatedts = tweet.created_at
            retweetcount = tweet.retweet_count
            hashtags = tweet.entities['hashtags']
            lst=[]
            for h in hashtags:
                lst.append(h['text'])
            try:
                text = tweet.retweeted_status.full_text
            except AttributeError:  # Not a Retweet
                text = tweet.full_text
            
            itweet = [username,acctdesc,location,following,followers,totaltweets,usercreatedts,tweetcreatedts,retweetcount,text,lst]
            db_tweets.loc[len(db_tweets)] = itweet
        
            noTweets += 1
            print(noTweets,itweet)
            
            #filename = "tweets.csv"
            #with open(filename, "a", newline='') as fp:
             #   wr = csv.writer(fp, dialect='excel')
              #  wr.writerow(itweet)
                
        print('no. of tweets scraped for run {} is {}'.format(i + 1, noTweets))
        if i+1 != numRuns:
            time.sleep(920)
        
        filename = "tweets.csv"
        # Store dataframe in csv with creation date timestamp
        db_tweets.to_csv(filename, mode='a', index = False)
        
# Initialise these variables:
search_words = "#India OR #COVID-19"
date_since = "2020-04-29"
#date_until = "2020-05-01"
numTweets = 2500
numRuns = 10
# Call the function scrapetweets
program_start = time.time()
scrapetweets(search_words, date_since, numTweets, numRuns)
program_end = time.time()
print('Scraping has completed!')
print('Total time taken to scrape is {} minutes.'.format(round((program_end - program_start) / 60, 2)))

I referred to a blog on Medium for this purpose.

You can add a variable as a validator and store it in a file, for example tweetid.txt.

Each time you run the script, open tweetid.txt.

If a tweet's ID is already in tweetid.txt, skip that tweet.
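
Here is a minimal sketch of that idea, reusing the extract_tweets helper and the api object from the question. The file name tweetid.txt and the function names below are only illustrative, not part of Tweepy:

import os

SEEN_IDS_FILE = "tweetid.txt"   # illustrative file name, as suggested above

def load_seen_ids():
    # Return the set of tweet IDs stored by previous runs (empty on the first run)
    if not os.path.exists(SEEN_IDS_FILE):
        return set()
    with open(SEEN_IDS_FILE) as fp:
        return {line.strip() for line in fp if line.strip()}

def save_seen_ids(seen_ids):
    # Overwrite the file with every ID seen so far, one per line
    with open(SEEN_IDS_FILE, "w") as fp:
        fp.write("\n".join(seen_ids))

def scrape_new_tweets(search_words, date_since, numTweets):
    # Fetch tweets as before, but keep only the ones whose ID has not been stored yet
    seen_ids = load_seen_ids()
    new_tweets = []
    for tweet in extract_tweets(search_words, date_since, numTweets):
        if tweet.id_str in seen_ids:
            continue            # already scraped in an earlier run, skip it
        seen_ids.add(tweet.id_str)
        new_tweets.append(tweet)
    save_seen_ids(seen_ids)
    return new_tweets

Alternatively, if you keep track of the largest tweet ID returned by the previous run, you can pass it as since_id to the search call; the standard search API then only returns tweets newer than that ID, so you do not have to re-download the old ones and filter them yourself.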
