
Problem with Crawling Mechanism of Twitter Python Crawler

Below is a small snippet of code from my Twitter crawler mechanism:

from BeautifulSoup import BeautifulSoup
import re
import urllib2

url = 'http://mobile.twitter.com/NYTimesKrugman'

def gettweets(soup):
    tags = soup.findAll('div', {'class' : "list-tweet"})#to obtain tweet of a follower
    for tag in tags: 
        print tag.renderContents()
        print ('\n\n')

def are_more_tweets(soup): #to check whether there is more than one page on mobile Twitter
    links = soup.findAll('a', {'href': True}, {id: 'more_link'})
    for link in links:
        b = link.renderContents()
        test_b = str(b)
        if test_b.find('more'):
            return True
        else:
            return False

def getnewlink(soup): #to get the link to go to the next page of tweets on twitter 
    links = soup.findAll('a', {'href': True}, {id : 'more_link'})
    for link in links:
        b = link.renderContents()
        if str(b) == 'more':
            c = link['href']
            d = 'http://mobile.twitter.com' +c
            return d

def checkforstamp(soup): # the parser scans a webpage to check if any of the tweets are older than 3 months
    times = soup.findAll('a', {'href': True}, {'class': 'status_link'})
    for time in times:
        stamp = time.renderContents()
        test_stamp = str(stamp)
        if test_stamp == '3 months ago':  
            print test_stamp
            return True
        else:
            return False


response = urllib2.urlopen(url)
html = response.read()
soup = BeautifulSoup(html)
gettweets(soup)
stamp = checkforstamp(soup)
tweets = are_more_tweets(soup)
print 'stamp' + str(stamp)
print 'tweets' +str (tweets)
while (stamp is False) and (tweets is True): 
    b = getnewlink(soup)
    print b
    red = urllib2.urlopen(b)
    html = red.read()
    soup = BeautifulSoup(html)
    gettweets(soup)
    stamp = checkforstamp(soup)
    tweets = are_more_tweets(soup)
print 'done' 

The problem is that I would like my Twitter crawler to stop going to a user's next page once it hits tweets that are roughly 3 months old. However, it doesn't seem to do that: it just keeps looking for the next page of tweets. I believe this is because checkforstamp keeps evaluating to False. Does anyone have suggestions on how to modify the code so that the crawler keeps fetching the next page of tweets only as long as there are more tweets (verified by the are_more_tweets mechanism) AND it has not yet reached tweets that are 3 months old? Thanks!
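To see why checkforstamp might keep returning False, it can help to dump what its status_link selector actually matches on the first page. A minimal debugging sketch, reusing the exact findAll call from checkforstamp above (same BeautifulSoup 3 / urllib2 setup as the script):

from BeautifulSoup import BeautifulSoup
import urllib2

url = 'http://mobile.twitter.com/NYTimesKrugman'
soup = BeautifulSoup(urllib2.urlopen(url).read())

# Print the raw contents of every element the checkforstamp selector matches,
# to check whether the first match is really a timestamp such as '3 months ago'.
for time in soup.findAll('a', {'href': True}, {'class': 'status_link'}):
    print repr(str(time.renderContents()))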

EDIT - please see the following:

from BeautifulSoup import BeautifulSoup
import re
import urllib

url = 'http://mobile.twitter.com/cleversallie'
output = open(r'C:\Python28\testrecursion.txt', 'a') 

def gettweets(soup):
    tags = soup.findAll('div', {'class' : "list-tweet"})#to obtain tweet of a follower
    for tag in tags: 
        a = tag.renderContents()
        b = str (a)
        print(b)
        print('\n\n')

def are_more_tweets(soup):#to check whether there is more than one page on mobile twitter 
    links = soup.findAll('a', {'href': True}, {id: 'more_link'})
    for link in links:
        b = link.renderContents()
        test_b = str(b)
        if test_b.find('more'):
            return True
        else:
            return False

def getnewlink(soup): #to get the link to go to the next page of tweets on twitter 
    links = soup.findAll('a', {'href': True}, {id : 'more_link'})
    for link in links:
        b = link.renderContents()
        if str(b) == 'more':
            c = link['href']
            d = 'http://mobile.twitter.com' +c
            return d

def checkforstamp(soup): # the parser scans a webpage to check if any of the tweets are older than 3 months
    times = soup.findAll('a', {'href': True}, {'class': 'status_link'})
    for time in times:
        stamp = time.renderContents()
        test_stamp = str(stamp)
        if not (test_stamp[0]) in '0123456789':
            continue
        if test_stamp == '3 months ago':
            print test_stamp
            return True
        else:
            return False


response = urllib.urlopen(url)
html = response.read()
soup = BeautifulSoup(html)
gettweets(soup)
stamp = checkforstamp(soup)
tweets = are_more_tweets(soup)
while (not stamp) and (tweets): 
    b = getnewlink(soup)
    print b
    red = urllib.urlopen(b)
    html = red.read()
    soup = BeautifulSoup(html)
    gettweets(soup)
    stamp = checkforstamp(soup)
    tweets = are_more_tweets(soup)
print 'done' 

Your soup.findAll() is picking up image tags in the links that match your pattern (those with an href attribute and class status_link).

Instead of always returning on the first link, try:

for time in times:
    stamp = time.renderContents()
    test_stamp = str(stamp)
    print test_stamp
    if not test_stamp[0] in '0123456789':
        continue
    if test_stamp == '3 months ago':  
        return True
    else:
        return False

If the link doesn't start with a digit, it skips that link, so you may actually find the right one. The print statement is left in so you can see whether you are hitting some other link that starts with a digit and also needs to be filtered out.

EDIT: What you were doing always returned on the first item in times. I changed it so that it ignores any link that doesn't start with a digit.

However, that causes it to return None if it doesn't find any link starting with a digit. That would work fine, except that you changed while not stamp and tweets to while stamp is False and tweets is True. Change it back to while not stamp and tweets, which correctly treats None and False the same, and it should work.
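For reference, a minimal sketch of the None/False distinction the loop condition depends on (plain Python 2, using the same stamp and tweets names as the script):

stamp = None    # what checkforstamp returns when no timestamp link is found
tweets = True

print stamp is False            # False: 'is False' matches only the literal False
print (not stamp) and tweets    # True:  'not' treats None the same as False

# So 'while not stamp and tweets:' keeps paging when checkforstamp returns None,
# whereas 'while (stamp is False) and (tweets is True):' exits as soon as it sees None.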
