[英]Scrapy crawl with next page
我有這個用於scrapy框架的代碼:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.contrib.spiders import Rule
from scrapy.linkextractors import LinkExtractor
from lxml import html
class Scrapy1Spider(scrapy.Spider):
name = "scrapy1"
allowed_domains = ["sfbay.craigslist.org"]
start_urls = (
'http://sfbay.craigslist.org/search/npo',
)
rules = (Rule(LinkExtractor(allow=(), restrict_xpaths=('//a[@class="button next"]',)), callback="parse", follow= True),)
def parse(self, response):
site = html.fromstring(response.body_as_unicode())
titles = site.xpath('//div[@class="content"]/p[@class="row"]')
print len(titles), 'AAAA'
但問題是我得到了 100 個結果,它不會轉到下一頁。
這里有什么問題?
沒有使用您的rule
因為您沒有使用CrawlSpider
。
所以你必須像這樣手動創建下一頁requests
:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.contrib.spiders import Rule
from scrapy.linkextractors import LinkExtractor
from lxml import html
class Scrapy1Spider(scrapy.Spider):
name = "craiglist"
allowed_domains = ["sfbay.craigslist.org"]
start_urls = (
'http://sfbay.craigslist.org/search/npo',
)
Rules = (Rule(LinkExtractor(allow=(), restrict_xpaths=('//a[@class="button next"]',)), callback="parse", follow= True),)
def parse(self, response):
site = html.fromstring(response.body_as_unicode())
titles = site.xpath('//div[@class="content"]/p[@class="row"]')
print len(titles), 'AAAA'
# follow next page links
next_page = response.xpath('.//a[@class="button next"]/@href').extract()
if next_page:
next_href = next_page[0]
next_page_url = 'http://sfbay.craigslist.org' + next_href
request = scrapy.Request(url=next_page_url)
yield request
或者像這樣使用CrawlSpider
:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from lxml import html
class Scrapy1Spider(CrawlSpider):
name = "craiglist"
allowed_domains = ["sfbay.craigslist.org"]
start_urls = (
'http://sfbay.craigslist.org/search/npo',
)
rules = (Rule(LinkExtractor(allow=(), restrict_xpaths=('//a[@class="button next"]',)), callback="parse_page", follow= True),)
def parse_page(self, response):
site = html.fromstring(response.body_as_unicode())
titles = site.xpath('//div[@class="content"]/p[@class="row"]')
print len(titles), 'AAAA'
嘗試這個
next_page_url = response.xpath('//a[@class="button next"]').extract_first()
if next_page_url is not None:
yield scrapy.Request(response.urljoin(next_page_url))
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.