
How to use Scrapy with the CrawlSpider template and scrapy-splash to parse JavaScript

I am trying to scrape Amazon products with Scrapy using the CrawlSpider template, but I found that Amazon uses JavaScript to load some of the product-detail blocks, so I decided to use Splash to render the JavaScript. It works fine from the shell, but I don't know how to implement it in my code.

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class AmazonCrawlerSpider(CrawlSpider):
    name = 'amazon_Crawler'
    allowed_domains = ['amazon.com']
    start_urls = ['https://www.amazon.com/s?i=specialty-aps&bbn=16225009011&rh=n%3A%2116225009011%2Cn%3A502394&ref=nav_em__nav_desktop_sa_intl_camera_and_photo_0_2_5_3']

    # Rule 1: follow each product link from the results page and parse the detail page.
    len_product_details = LinkExtractor(restrict_css='h2 > a')
    product_details = Rule(len_product_details,
                           callback='parse_item', follow=False)

    # Rule 2: follow the pagination link to the next results page.
    len_products_pagination = LinkExtractor(
        restrict_xpaths='//*[@id="search"]/div[1]/div[1]/div/span[3]/div[2]/div[37]/div/div/span/a[3]')
    products_pagination = Rule(len_products_pagination, follow=True)
    rules = (
        product_details, products_pagination
    )

    def parse_item(self, response):

        data = {

            "categorie_0": response.xpath('//*[@id="wayfinding-breadcrumbs_feature_div"]/ul/li[1]/span/a/text()').get(),
            "categorie_1": response.xpath('//*[@id="wayfinding-breadcrumbs_feature_div"]/ul/li[3]/span/a/text()').get(),
            "title": response.css('h1 > span ::text').get(),
            "price": response.xpath('//div[@id="corePrice_feature_div"]/div/span/span[1]//text()').get(),
            "amazon_globale_shiping": response.xpath('//*[@id="a-popover-content-2"]/table/tbody/tr[2]/td[3]/span/text()').get(),
            "estimated_import_fees_deposit": response.xpath('//*[@id="a-popover-content-2"]/table/tbody/tr[3]/td[3]/span/text()').get(),
            "total": response.xpath('//*[@id="a-popover-content-2"]/table/tbody/tr[5]/td[3]/span/text()').get(),
            "delevery_period": response.xpath('//*[@id="mir-layout-DELIVERY_BLOCK-slot-PRIMARY_DELIVERY_MESSAGE_LARGE"]/span/span/text()').get(),
            "delevery_destination": response.xpath('//*[@id="contextualIngressPtLabel_deliveryShortLine"]/span[2]/text()').get(),
            "in_stock": response.xpath('//*[@id="availability"]/span/text()').get(),
            "quantity": "not_exist",
            "ship_from": response.xpath('//*[@id="tabular-buybox"]/div[1]/div[2]/div/span/text()').get(),
            "sold_by": {
                "name": response.xpath('//*[@id="sellerProfileTriggerId"]/text()').get(),
                'store_url': response.xpath('//*[@id="sellerProfileTriggerId"]/@href').get(),
                'packaging': response.xpath('//*[@id="tabular-buybox"]/div[1]/div[6]/div/span/text()').get()
            },
            "description": response.xpath('//*[@id="productDescription"]/p/text()').get(),
            # "brand": response.xpath('//*[@id="productOverview_feature_div"]/div/table/tbody/tr[1]/td[2]/span/text()').get(),
            "is_returned": response.xpath('//*[@id="productSupportAndReturnPolicy-return-policy-popover-celWidget"]/div/div[1]/text()').get(),
            "extra_info": [],
            "details": [],
            "about_this_item": [],
            "note": response.xpath('//*[@id="universal-product-alert"]/div/span[2]/text()').get(),
            "Q_AW": [],
            "Customer_reviews": {
                "customer_rate": response.xpath('//*[@id="reviewsMedley"]/div/div[1]/div[2]/div[1]/div/div[2]/div/span/span/text()').get(),
                "total_rate": response.xpath('//*[@id="reviewsMedley"]/div/div[1]/div[2]/div[2]/span/text()').get(),
                "global_rate": {
                    "1_star": response.xpath('//*[@id="histogramTable"]/tbody/tr[5]/td[3]/span[2]/a/text()').get(),
                    "2_star": response.xpath('//*[@id="histogramTable"]/tbody/tr[4]/td[3]/span[2]/a/text()').get(),
                    "3_star": response.xpath('//*[@id="histogramTable"]/tbody/tr[3]/td[3]/span[2]/a/text()').get(),
                    "4_star": response.xpath('//*[@id="histogramTable"]/tbody/tr[2]/td[3]/span[2]/a/text()').get(),
                    "5_star": response.xpath('//*[@id="histogramTable"]/tbody/tr[1]/td[3]/span[2]/a/text()').get(),
                },
                "rate_by_feature": [],
                "product_reviews": []

            },
            "url": response.url

        }
        # Individual customer reviews
        for review in response.xpath('//*[@id="cm-cr-dp-review-list"]/div'):
            data["Customer_reviews"]["product_reviews"].append(
                {
                    # relative path (no leading '/'), so it is evaluated against this review node
                    "rate": review.xpath('div/div/div[2]/a/i/span/text()').get(),
                    "feature": review.xpath('div/div/div[2]/a[2]/span/text()').get(),
                    "date_from": review.xpath('div/div/span/text()').get(),
                    "verified": review.xpath('div/div/div[3]/span[2]/text()').get(),
                    "review": review.xpath('div/div/div[4]/span/div/div[1]/span/text()').get(),
                    'view_reaction': review.xpath('div/div/div[5]/span[1]/div[1]/span/text()').get()
                }
            )

        for cr_rf in response.xpath('//*[@id="cr-summarization-attributes-list"]/div'):
            data["Customer_reviews"]["rate_by_feature"].append(
                {
                    "key": cr_rf.xpath('div/div/div/div/span/text()').get(),
                    "value": response.xpath('div/div/div[2]/span[2]/text()').get()
                }
            )

        for Q_AW in response.xpath('//*[@id="ask-btf-container"]/div/div/div[2]/span/div/div'):
            data["Q_AW"].append(
                {
                    "Question": Q_AW.xpath('div/div[2]/div/div/div[2]/a/span/text()').get(),
                    "Answer":  Q_AW.xpath('div/div[2]/div[2]/div/div[2]/span/span[2]/text()').get(),
                    "vote": Q_AW.xpath('div/div/ul/li[2]/span[1]/text()').get(),
                    "date_answer": Q_AW.xpath('div/div[2]/div[2]/div/div[2]/span[3]/text()').get()
                }
            )

        for extra_info in response.xpath('//*[@id="productDetails_detailBullets_sections1"]/tbody/tr'):
            data["extra_info"].append(
                {
                    "1": extra_info.css('th::text').get(),
                    "2": extra_info.css('td::text').get()
                }
            )
        for index, about_this_item in enumerate(response.xpath('//*[@id="feature-bullets"]/ul/li')):
            data["about_this_item"].append(
                {
                    index+1: about_this_item.xpath('span/text()').get(),

                }
            )
        for extra in response.xpath('//*[@id="productOverview_feature_div"]/div/table/tbody/tr'):
            data['details'].append(
                {
                    extra.xpath('td[1]/span/text()').get(): extra.xpath('td[2]/span/text()').get()
                }
            )

        yield data

I think you have a problem at line 20: you forgot to define the correct function, so what it references is undefined.
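For the Splash part of the question, here is a minimal sketch (not tested against Amazon's current markup; the pagination selector a.s-pagination-next is only a placeholder). It assumes scrapy-splash is installed, a Splash instance is running, and settings.py already has the standard scrapy-splash configuration (SPLASH_URL, the splash downloader and spider middlewares, and DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'). The idea is to give the product-detail Rule a process_request hook (Scrapy >= 2.0 signature) that swaps each scheduled request for a SplashRequest, so the JavaScript is rendered before parse_item receives the response; the pagination rule stays a plain request so CrawlSpider keeps following links normally.

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy_splash import SplashRequest


class AmazonCrawlerSpider(CrawlSpider):
    name = 'amazon_Crawler'
    allowed_domains = ['amazon.com']
    start_urls = ['https://www.amazon.com/s?i=specialty-aps&bbn=16225009011&rh=n%3A%2116225009011%2Cn%3A502394&ref=nav_em__nav_desktop_sa_intl_camera_and_photo_0_2_5_3']

    rules = (
        # Product-detail links are rendered by Splash before parse_item runs.
        Rule(LinkExtractor(restrict_css='h2 > a'),
             callback='parse_item',
             process_request='use_splash',
             follow=False),
        # Pagination stays a plain request; the selector below is only an example.
        Rule(LinkExtractor(restrict_css='a.s-pagination-next'), follow=True),
    )

    def use_splash(self, request, response):
        # Called for every request produced by the first rule (Scrapy >= 2.0).
        # Re-issue it through Splash, keeping the original callback and meta
        # so CrawlSpider's rule bookkeeping still works.
        return SplashRequest(
            request.url,
            callback=request.callback,
            meta=request.meta,
            args={'wait': 2},  # give the page ~2 seconds to render its JavaScript
        )

    def parse_item(self, response):
        # response now contains the Splash-rendered HTML; the extraction code
        # from the question can be reused here unchanged.
        yield {
            'title': response.css('h1 > span ::text').get(),
            'url': response.url,
        }

If the JavaScript-only blocks still come back empty, the wait value can be raised, or a small Lua script can be sent to the execute endpoint instead, but that is beyond this sketch.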
