[英]Scrapy only going through first 5 links with next_page_url
我的代碼似乎只爬取了前 5 個鏈接,在請求第 6 個鏈接時就停止了。我嘗試過使用 start_urls 和 next_page_url 兩種方式,但兩者都只抓取到前 5 頁的內容。
import scrapy
from scrapy.crawler import CrawlerProcess
import time
class finvizSpider(scrapy.Spider):
    """Spider that pages through a Finviz screener listing and collects link texts.

    The collected values are the text of every ``screener-link-primary``
    anchor (presumably ticker symbols -- confirm against the page markup).
    """

    # `global` inside a class body makes the assignments below bind
    # module-level names, which is why the methods can use bare `tickers`.
    global tickers
    global urlcheck
    urlcheck = 1  # not read anywhere in this snippet
    tickers = []  # one inner list is appended per crawled page
    name = "finviz"
    start_urls = ["https://finviz.com/screener.ashx?v=111&f=cap_small,geo_usa,sh_avgvol_o300,sh_opt_option,sh_short_low&ft=4&o=change"]

    def parse(self, response):
        """Record this page's link texts, then request the page behind the arrow link.

        Yields:
            A ``scrapy.Request`` for the next page, handled by this same method.
        """
        tickers.append(response.xpath('//a[@class="screener-link-primary"]/text()').extract())
        print(tickers)
        next_page_url = "https://finviz.com/"
        # extract() returns a list; indexing [0] raises IndexError when no
        # `screener_arrow` anchor is matched on the page.
        html = response.xpath(
            '//a[@class="screener_arrow"]/@href').extract()[0]
        print(html)
        next_page_url += html
        print(next_page_url)
        # next_page_url was assigned a string above, so this test is always true.
        if next_page_url is not None:
            yield scrapy.Request(next_page_url, callback=self.parse)

    def returnTickers(self):
        """Flatten the per-page lists in the module-level `tickers` into one list.

        Returns:
            A new flat list; empty per-page lists contribute nothing.
        """
        newTickerList= []
        for lists in tickers:
            if lists:
                for t in lists:
                    newTickerList.append(t)
        return newTickerList
任何幫助表示贊賞。
編輯:
我已經更新了代碼,但似乎仍然出現錯誤。
import scrapy
from scrapy.crawler import CrawlerProcess
import time
from bs4 import BeautifulSoup
class finvizSpider(scrapy.Spider):
    """Edited spider that tries to parse pages with BeautifulSoup instead of XPath."""

    # `global` inside a class body makes the assignments below bind
    # module-level names, which is why the methods can use bare `tickers`.
    global tickers
    global urlcheck
    urlcheck = 1  # not read anywhere in this snippet
    tickers = []  # one inner list is appended per parsed page
    name = "finviz"
    start_urls = [
        "https://finviz.com/screener.ashx?v=111&f=cap_small,geo_usa,sh_avgvol_o300,sh_opt_option,sh_short_low&ft=4&o=-change"]

    def parse(self, url):
        """Collect link texts from the page at `url` and recurse into the next page.

        NOTE(review): Scrapy invokes ``parse`` with a Response object, so `url`
        would not be a plain URL string when called by the framework -- confirm.
        """
        # NOTE(review): scrapy.Request(url) only constructs a request object;
        # nothing is downloaded here, so `raw_html` is not HTML markup -- confirm.
        raw_html = scrapy.Request(url)
        good_html = BeautifulSoup(raw_html, 'html.parser')
        first_part = "https://finviz.com/"
        tickers.append([x.text for x in good_html.findAll('a', {'class': 'screener-link-primary'})])
        # NOTE(review): find() presumably returns None when nothing matches, in
        # which case the ['href'] subscript raises before the check below runs.
        second_part = good_html.find('a', {'class': 'screener_arrow'})['href']
        # Check if there is next page
        if second_part:
            next_url = first_part + second_part
            # Direct recursive call: one extra stack frame per page.
            self.parse(next_url)

    def returnTickers(self):
        """Flatten the per-page lists in the module-level `tickers` into one list.

        Returns:
            A new flat list; empty per-page lists contribute nothing.
        """
        newTickerList= []
        for lists in tickers:
            if lists:
                for t in lists:
                    newTickerList.append(t)
        return newTickerList
# Driver script: run the spider in-process, then read back what it collected.
# This instance is only used to call returnTickers() after the crawl.
stock_list = finvizSpider()
process = CrawlerProcess()
# The crawler is handed the class, not `stock_list`, so it works with its own
# spider instance.
process.crawl(finvizSpider)
# NOTE(review): start() is expected to block until crawling finishes -- confirm
# against the Scrapy CrawlerProcess documentation.
process.start()
# returnTickers() reads the module-level `tickers` list, so results gathered by
# the crawler's instance are visible through `stock_list` as well.
list2 = stock_list.returnTickers()
運行時出現以下錯誤。
看起來scrapy只能回調5次,所以我建議不要回調,我建議迭代一個包含所有鏈接的列表,你可以用BeautifulSoup來做,這會非常簡單。
pip install BeautifulSoup4
from bs4 import BeautifulSoup
def parse(self, url):
    """Collect link texts from the page at `url` and recurse into the next page.

    Appends one list of ``screener-link-primary`` link texts to the bare name
    `tickers`, which must already exist in an enclosing scope.
    """
    # NOTE(review): scrapy.Request(url) only constructs a request object;
    # nothing is downloaded here, so `raw_html` is not HTML markup -- confirm.
    raw_html = scrapy.Request(url)
    good_html = BeautifulSoup(raw_html, 'html.parser')
    first_part = "https://finviz.com/"
    tickers.append([x.text for x in good_html.findAll('a', {'class':'screener-link-primary'})])
    # NOTE(review): find() presumably returns None when nothing matches, in
    # which case the ['href'] subscript raises before the check below runs.
    second_part = good_html.find('a', {'class':'screener_arrow'})['href']
    # Check if there is next page
    if second_part:
        next_url = first_part + second_part
        # Direct recursive call: one extra stack frame per page.
        self.parse(next_url)
`if next_page_url is not None:` 這個條件永遠成立:next_page_url 一開始就被賦值為字串 "https://finviz.com/",所以它永遠不會是 None。您需要檢查的是 html 是否為 None。
當 html 為 None 時,`next_page_url += html` 這一行會拋出錯誤(字串無法與 None 相加),所以必須先檢查 html 是否為 None。
如果頁面上找不到該鏈接,extract() 會返回空列表,此時無法取 [0](會拋出 IndexError);請用 extract_first() 替換 extract()[0](我用的是等價的 get()),找不到時它會返回 None。
這是修正後的代碼:
import scrapy
from scrapy.crawler import CrawlerProcess
import time
class FinvizSpider(scrapy.Spider):
    """Crawl every page of a Finviz screener listing and collect link texts.

    Each page's ``screener-link-primary`` anchor texts (presumably ticker
    symbols -- confirm against the page markup) are stored as one inner list
    in ``tickers``; ``returnTickers`` flattens them.
    """

    name = "finviz"
    urlcheck = 1  # kept for backward compatibility; not read by the spider
    # Deliberately a class attribute: the list is shared by every instance, so
    # results gathered by the instance the crawler creates can be read back
    # through any other FinvizSpider instance after the crawl.
    tickers = []
    start_urls = ["https://finviz.com/screener.ashx?v=111&f=cap_small,geo_usa,sh_avgvol_o300,sh_opt_option,sh_short_low&ft=4&o=change"]

    def parse(self, response):
        """Record this page's link texts and follow the pagination arrow, if any.

        Args:
            response: The Scrapy response for one screener page.

        Yields:
            A ``scrapy.Request`` for the next page, handled by this same
            method. Nothing is yielded when no arrow link is found, which
            ends the crawl.
        """
        # getall() is the list-returning counterpart of the get() used below.
        self.tickers.append(
            response.xpath('//a[@class="screener-link-primary"]/text()').getall()
        )
        print(self.tickers)

        # get() returns None instead of raising when nothing matches.
        html = response.xpath('//a[@class="screener_arrow"]/@href').get()
        print(html)
        if html is not None:
            # urljoin resolves relative, root-relative and absolute hrefs
            # against the current page, unlike a hard-coded prefix.
            next_page_url = response.urljoin(html)
            print(next_page_url)
            yield scrapy.Request(next_page_url, callback=self.parse)

    def returnTickers(self):
        """Return all collected link texts as one flat list.

        Returns:
            A new list with the items of every per-page list in ``tickers``,
            in crawl order; empty pages contribute nothing.
        """
        return [ticker for page in self.tickers for ticker in page]
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.