[英]How to use scrapy.Request to load an element from another page into an item
我使用Scrapy創建了一個網絡爬蟲,它能夠從這個網站上的每個票務條目中抓取元素,但由於票價不在該頁面上,所以無法抓取票價。 當我嘗試請求下一頁來抓取價格時,我會得到錯誤:exceptions.TypeError: 'XPathItemLoader' object has no attribute '__getitem__'。 我只會使用項目加載器(item loader)來抓取元素,這就是我目前正在使用的方式,而且我不確定把另一個頁面上抓取到的元素傳遞給項目加載器的正確流程(我看到過一種用項目數據類型實現的方法,但它不適用於此處)。 我想我可能在將元素提取到項目對象時遇到了問題,因為我正在通過管道(pipeline)寫入數據庫,但我不確定。 如果可以修改我下面發布的代碼,使其正確爬取到下一頁、抓取價格並將其添加到項目加載器中,我認為問題就能解決。 任何幫助將不勝感激。 謝謝!
class MySpider(CrawlSpider):
    """Scrape event listings, then follow each ticket link to fetch the price.

    parse() builds one item loader per event on the listing page and forwards
    it (via request meta) to parse_price(), which adds the price found on the
    detail page and emits the completed item.
    """

    handle_httpstatus_list = [416]
    name = 'comparator'
    allowed_domains = ["www.vividseats.com"]
    start_urls = [vs_url]
    # XPath locating each event row on the listing page.
    tickets_list_xpath = './/*[@itemtype="http://schema.org/Event"]'

    def parse_price(self, response):
        """Second-step callback: complete the item started in parse().

        The partially-filled loader travels in response.meta; add the price
        from this page and return the finished item.
        """
        loader = response.meta['loader']
        loader.add_xpath('ticketPrice', '//*[@class="eventTickets lastChild"]/div/div/@data-origin-price')
        return loader.load_item()

    def parse(self, response):
        """Yield one price-page request per event found on the listing."""
        # Local import so this snippet stands alone (Python 2 stdlib).
        from urlparse import urljoin

        selector = HtmlXPathSelector(response)
        # One loader per event row; each is completed in parse_price().
        for ticket in selector.select(self.tickets_list_xpath):
            loader = XPathItemLoader(ComparatorItem(), selector=ticket)
            loader.default_input_processor = MapCompose(unicode.strip)
            loader.default_output_processor = Join()
            loader.add_xpath('eventName', './/*[@class="productionsEvent"]/text()')
            loader.add_xpath('eventLocation', './/*[@class = "productionsVenue"]/span[@itemprop = "name"]/text()')
            loader.add_xpath('ticketsLink', './/*/td[3]/a/@href')
            loader.add_xpath('eventDate', './/*[@class = "productionsDate"]/text()')
            loader.add_xpath('eventCity', './/*[@class = "productionsVenue"]/span[@itemprop = "address"]/span[@itemprop = "addressLocality"]/text()')
            loader.add_xpath('eventState', './/*[@class = "productionsVenue"]/span[@itemprop = "address"]/span[@itemprop = "addressRegion"]/text()')
            loader.add_xpath('eventTime', './/*[@class = "productionsTime"]/text()')
            # A loader does not support item-style indexing
            # (loader["ticketsLink"] raises TypeError: no __getitem__);
            # get_output_value() is the way to read a collected field.
            ticketsURL = "concerts/" + bandname + "-tickets/" + bandname + "-" + loader.get_output_value("ticketsLink")
            # The link is relative; resolve it against the current page URL.
            ticketsURL = urljoin(response.url, ticketsURL)
            # Hand the loader to parse_price() via meta and yield the
            # request -- the item itself is yielded there, once complete.
            yield scrapy.Request(ticketsURL, meta={'loader': loader}, callback=self.parse_price)
要解決的關鍵問題:
要從項目加載器獲取已加載的值,請使用get_output_value()
,即把:
loader["ticketsLink"]
替換為:
loader.get_output_value("ticketsLink")
你需要在請求的meta
中傳遞loader
並在那里產生/返回加載的項目
在構造URL以獲取價格時,使用urljoin()
將相對部分與當前URL連接起來
這是固定版本:
from urlparse import urljoin
# other imports
class MySpider(CrawlSpider):
    """Crawl the event listing, then visit each ticket page for its price.

    parse() fills one loader per event and forwards it through request meta
    to parse_price(), which adds the price and yields the finished item.
    """

    handle_httpstatus_list = [416]
    name = 'comparator'
    allowed_domains = ["www.vividseats.com"]
    start_urls = [vs_url]
    tickets_list_xpath = './/*[@itemtype="http://schema.org/Event"]'

    # Field name -> XPath, evaluated relative to one event node, in the
    # order they are added to the loader.
    _listing_fields = (
        ('eventName', './/*[@class="productionsEvent"]/text()'),
        ('eventLocation', './/*[@class = "productionsVenue"]/span[@itemprop = "name"]/text()'),
        ('ticketsLink', './/*/td[3]/a/@href'),
        ('eventDate', './/*[@class = "productionsDate"]/text()'),
        ('eventCity', './/*[@class = "productionsVenue"]/span[@itemprop = "address"]/span[@itemprop = "addressLocality"]/text()'),
        ('eventState', './/*[@class = "productionsVenue"]/span[@itemprop = "address"]/span[@itemprop = "addressRegion"]/text()'),
        ('eventTime', './/*[@class = "productionsTime"]/text()'),
    )

    def parse_price(self, response):
        """Complete the item begun in parse(): add the price and emit it."""
        pending = response.meta['loader']
        pending.add_xpath('ticketPrice', '//*[@class="eventTickets lastChild"]/div/div/@data-origin-price')
        return pending.load_item()

    def parse(self, response):
        """Yield one price-page request per event on the listing page."""
        page = HtmlXPathSelector(response)
        for event_node in page.select(self.tickets_list_xpath):
            pending = XPathItemLoader(ComparatorItem(), selector=event_node)
            pending.default_input_processor = MapCompose(unicode.strip)
            pending.default_output_processor = Join()
            for field_name, xpath in self._listing_fields:
                pending.add_xpath(field_name, xpath)
            # Build the relative detail-page URL, then anchor it to the
            # current page so the request is absolute.
            relative = "concerts/" + bandname + "-tickets/" + bandname + "-" + pending.get_output_value("ticketsLink")
            yield scrapy.Request(
                urljoin(response.url, relative),
                meta={'loader': pending},
                callback=self.parse_price,
            )
我遇到過完全相同的問題,並在另一篇帖子中解決了它。 我把我的代碼放在這裡分享:(我原來的帖子在這裡 )
from scrapy.spider import Spider
from scrapy.selector import Selector
from scrapy import Request
import re
from datetime import datetime, timedelta
from CAPjobs.items import CAPjobsItem
from CAPjobs.items import CAPjobsItemLoader
class CAPjobSpider(Spider):
    """Two-step scrape of a naturejobs search-results page.

    parse() starts one item per job listed on the results page and requests
    the job's own page; parse_subpage() adds the location found there and
    emits the finished item.
    """

    name = "naturejob3"
    download_delay = 2
    #allowed_domains = ["nature.com/naturejobs/"]
    start_urls = [
        "http://www.nature.com/naturejobs/science/jobs?utf8=%E2%9C%93&q=pathologist&where=&commit=Find+Jobs"]

    def parse_subpage(self, response):
        """Finish the loader carried in request meta with the job location."""
        loader = response.meta['il']
        addresses = response.xpath('//div[@id="extranav"]//ul[@class="job-addresses"]/li/text()').extract()
        loader.add_value('loc_pj', addresses)
        yield loader.load_item()

    def parse(self, response):
        """Start an item per job and request its detail page for the location."""
        for job in Selector(response).xpath('//div[@class="job-details"]'):
            loader = CAPjobsItemLoader(CAPjobsItem(), selector=job)
            loader.add_xpath('title', 'h3/a/text()')
            loader.add_xpath('post_date', 'normalize-space(ul/li[@class="when"]/text())')
            loader.add_xpath('web_url', 'concat("http://www.nature.com", h3/a/@href)')
            # The loader rides along in meta so parse_subpage can finish it.
            yield Request(loader.get_output_value('web_url'),
                          meta={'il': loader},
                          callback=self.parse_subpage)
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.