簡體   English   中英

Scrapy:每項一行

[英]Scrapy: one row per item

我正在使用 Scrapy 抓取 architonic.com 產品的類別頁面。但是,我想在 CSV 中每行只顯示一個產品。在目前的情況下,給定類別頁面上的所有品牌名稱都被列在同一個 'brand' 欄位下,而我希望得到這樣的輸出:

    {'brand': [u'Elisabeth Ellefsen'],
     'title': [u'Up chair I 907'],
     'img_url': [u'http://image.architonic.com/img_pro1-1/117/4373/t-up-06f-sq.jpg'],
     'link': [u'http://www.architonic.com/pmsht/up-chair-tonon/1174373']
    }

我嘗試過使用 Item Loader(添加了 default_output_processor = TakeFirst()),也嘗試過加入「yield item」(請參閱被注釋掉的代碼),並且搜索了兩天,仍然沒有找到解決方案。希望有人願意幫助我。非常感謝任何幫助。

我的輸出看起來像這樣:

2013-01-14 11:53:23+0100 [archi] DEBUG: Scraped from <200 http://www.architonic.com/pmpro/home-furnishings/3210002/2/2/3>
{'brand':   [u'Softline',
             u'Elisabeth Ellefsen',
             u'Sellex',
             u'Lievore Altherr Molina',
             u'Poliform',
             .....
             u'Hans Thyge & Co.'],
 'img_url': [u'http://image.architonic.com/img_pro1-1/117/3661/terra-h-sq.jpg',
             u'http://image.architonic.com/img_pro1-1/117/0852/fly-01-sq.jpg',
             u'http://image.architonic.com/img_pro1-1/116/9870/ley-0004-sq.jpg',
             u'http://image.architonic.com/img_pro1-1/117/1023/arflex-hollywood-03-sq.jpg',
             ...
             u'http://image.architonic.com/img_pro1-1/118/5357/reef-002-sq.jpg'],
 'link':    [u'http://www.architonic.com/pmsht/terra-softline/1173661',
             u'http://www.architonic.com/pmsht/fly-sellex/1170852',
             u'http://www.architonic.com/pmsht/ley-poliform/1169870',
             .....
             u'http://www.architonic.com/pmsht/reef-collection-labofa/1185357'],
 'title':   [u'Terra',
             u'Fly',
             u'Ley chair',
                 .....
             u'Hollywood Sofa',
             u'Pouff Round']}

我在 spider/archi_spider.py 中使用以下代碼:

import string
import re

from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector                   
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.utils.markup import remove_entities 
from archiscraper.items import ArchiItemFields, ArchiLoader

class ArchiScraper(BaseSpider):
    """Spider that requests three architonic.com category pages and loads
    brand / designer / title / image URL / link values from them.

    NOTE(review): this is the code as posted in the question. It produces a
    single item per page whose fields hold the values of every product; see
    the review notes inside ``parse`` for the two causes.
    """
    name = "archi"
    allowed_domains = ["architonic.com"]
    # One start URL per page number 1..3 (xrange(1, 4) excludes 4).
    start_urls = ['http://www.architonic.com/pmpro/home-furnishings/3210002/2/2/%s' % page for page in xrange(1, 4)]    
    # Disabled CrawlSpider rule (kept for reference): would follow the link
    # inside the element with id "right_arrow" and call "parse_items".
    # rules = (Rule(SgmlLinkExtractor(allow=('.', ),restrict_xpaths=('//*[@id="right_arrow"]',))
    #       , callback="parse_items", follow= True),
    #       )
    #            
    def parse(self, response):
        """Load item fields from the ``li`` nodes whose class contains
        "nav_pro_item" and return the collected items.

        Returns a list of loaded items, or None implicitly when no node
        matches (the only ``return`` is inside the loop).
        """
        hxs = HtmlXPathSelector(response)
        # Presumably one <li class="...nav_pro_item..."> per product -- confirm
        # against the page markup.
        sites = hxs.select('//li[contains(@class, "nav_pro_item")]')
        items = []
        for site in sites:
            # The loader is bound to ``site``, but see the note below about
            # how the XPaths are evaluated.
            item = ArchiLoader(ArchiItemFields(), site)
            # NOTE(review): every XPath below starts with "//", which in XPath
            # selects from the document root, not from ``site``. Each field
            # therefore collects the values of all products on the page. A
            # leading "." (e.g. './/*[contains(...)]') would make them
            # relative to ``site``.
            item.add_xpath('brand',       '//*[contains(@class, "nav_pro_text")]/a/br/following-sibling::node()[1][self::text()]')
            item.add_xpath('designer',       '//*[contains(@class, "nav_pro_text")]/a/br/following-sibling::node()[3][self::text()]')
            item.add_xpath('title',       '//*[contains(@class, "nav_pro_text")]/a/strong/text()')                   
            item.add_xpath('img_url',   '//li[contains(@class, "nav_pro_item")]/div/a/img/@src[1]')                     
            item.add_xpath('link',    '//*[contains(@class, "nav_pro_text")]/a/@href')
            items.append(item.load_item())      
            # NOTE(review): this return is inside the for-loop, so parse()
            # exits after the first ``site`` and ``items`` never holds more
            # than one element. It should be dedented to the loop's level.
            return items
            # Unreachable alternative the author tried (yielding each item):
            # for item in items:
                # yield item  

items.py

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/topics/items.html
import string
from scrapy.item import Item, Field   
from scrapy.contrib.loader.processor import MapCompose, Join, TakeFirst
from scrapy.utils.markup import remove_entities 
from scrapy.contrib.loader import XPathItemLoader  

class ArchiItem():
    """Empty placeholder class; it declares no attributes or methods."""

class ArchiItemFields(Item):
    """Scrapy item declaring the fields collected for an architonic.com
    product listing."""

    # Each attribute is a plain Field() with no per-field metadata.
    # NOTE(review): ``img`` is declared but the spider shown alongside this
    # item never populates it -- presumably reserved for an images pipeline.
    brand = Field()
    title = Field()
    designer = Field()
    img_url = Field()
    img = Field()
    link = Field()

class ArchiLoader(XPathItemLoader):
    """XPath-based item loader used by the spider to fill ArchiItemFields.

    Only the ``brand`` field gets a custom output processor; the other
    fields use the loader's defaults, so they are emitted as lists.
    """
    # Disabled experiments, kept for reference: strip every input value, and
    # collapse each field to its first extracted value.
    # default_input_processor = MapCompose(unicode.strip)
    # default_output_processor= TakeFirst()  

    # Output processor for ``brand``: applies unicode.strip to each collected
    # value (Python 2 ``unicode`` type).
    brand_out = MapCompose(unicode.strip)
    # Disabled experiment: join all collected title values into one string.
    # title_out = Join()    

只需在 for 循環結束之後再返回 items 列表,即:

for site in sites:
            item = ArchiLoader(ArchiItemFields(), site)
            item.add_xpath('brand',       '//*[contains(@class, "nav_pro_text")]/a/br/following-sibling::node()[1][self::text()]')
            item.add_xpath('designer',       '//*[contains(@class, "nav_pro_text")]/a/br/following-sibling::node()[3][self::text()]')
            item.add_xpath('title',       '//*[contains(@class, "nav_pro_text")]/a/strong/text()')                   
            item.add_xpath('img_url',   '//li[contains(@class, "nav_pro_item")]/div/a/img/@src[1]')                     
            item.add_xpath('link',    '//*[contains(@class, "nav_pro_text")]/a/@href')
            items.append(item.load_item())      
return items

希望它會有所幫助:)

暫無
暫無

聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.

 
粵ICP備18138465號  © 2020-2024 STACKOOM.COM