Scrapy 没有抓取任何内容

Question

我正在尝试从一个网站上抓取新闻，但我创建的爬虫（spider）在运行时没有抓取到任何内容，并且日志中出现了以下信息：INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)。

下面是我的代码：

import scrapy
from ..items import AoscraperItem

items = AoscraperItem()


class AoSpider(scrapy.Spider):
    """Spider that requests the mothership.sg front page and follows article links.

    Each followed page is handed to ``parse_article``, which reads a title,
    an author and a publish date from it.
    """

    name = "ao_spider"

    def start_requests(self):
        """Yield the single seed request for the site's front page."""
        yield scrapy.Request(url="https://mothership.sg/", callback=self.parse)

    def parse(self, response, **kwargs):
        """Extract article hrefs from the front page and follow each one."""
        # The predicate compares the whole class attribute to 'ind-article', so
        # a div that carries any additional class name is not selected.
        article_links = response.xpath("//div[@class='ind-article']/a/@href")
        article_links_ext = article_links.extract()

        for url in article_links_ext:
            yield response.follow(url=url, callback=self.parse_article)

    def parse_article(self, response):
        """Read title, author and date from an article page and yield them."""
        # .get() returns the first matching text node, or None when nothing matches.
        title = response.xpath("//h1/text()").get()
        # author_date = response.xpath("//div[@class='article-info ao-link-news']/span")
        author = response.xpath("//span[@class='author-name']/text()").get()
        date = response.xpath("//span[@class='publish-date']/text()").get()

        # NOTE(review): `items` is the single module-level AoscraperItem
        # instance, so every response writes into and yields the same object.
        items["title"] = title
        items["author"] = author
        items["date"] = date

        yield items

我无法弄清楚为什么它不会在网站上抓取任何内容。

如果有人可以提供帮助，真的很感激。

Answer 1

在 parse 函数中提取链接时，您的 XPath 不正确。它应该是 article_links = response.xpath("//div[contains(@class,'ind-article')]/a/@href")，或者您也可以直接使用以下修改后的代码。

代码

import scrapy
from ..items import AoscraperItem

items = AoscraperItem()


class AoSpider(scrapy.Spider):
    """Spider that crawls the mothership.sg front page and scrapes each article.

    The front page is searched for article links; every linked page is parsed
    into one ``AoscraperItem`` with ``title``, ``author`` and ``date`` fields.
    """

    name = "ao_spider"

    def start_requests(self):
        """Yield the single seed request for the site's front page."""
        yield scrapy.Request(url="https://mothership.sg/", callback=self.parse)

    def parse(self, response, **kwargs):
        """Follow every article link found on the front page.

        ``contains(@class, ...)`` is used instead of an exact comparison so that
        divs carrying extra class names next to ``ind-article`` still match.
        """
        article_links = response.xpath(
            "//div[contains(@class,'ind-article')]/a/@href"
        ).getall()

        for url in article_links:
            # dont_filter=True bypasses Scrapy's duplicate-request filter, so a
            # link that appears more than once is requested each time.
            yield response.follow(
                url=url, callback=self.parse_article, dont_filter=True
            )

    def parse_article(self, response):
        """Build and yield one item from an article page.

        Each field is the first matching text node, or ``None`` when the
        expression matches nothing.
        """
        # A fresh item per response: reusing one shared instance would make
        # every yielded item the same object, overwritten by later responses.
        item = AoscraperItem()
        item["title"] = response.xpath("//h1/text()").get()
        item["author"] = response.xpath("//span[@class='author-name']/text()").get()
        item["date"] = response.xpath("//span[@class='publish-date']/text()").get()

        yield item

Answer 2

希望下面的代码可以正常工作。

import scrapy
from ..items import AoscraperItem

items = AoscraperItem()


class AoSpider(scrapy.Spider):
    """Spider that crawls the mothership.sg front page and scrapes each article.

    Article links are taken from the element with id ``latest-news``; every
    linked page is parsed into one ``AoscraperItem`` with ``title``,
    ``author`` and ``date`` fields.
    """

    name = "ao_spider"

    def start_requests(self):
        """Yield the single seed request for the site's front page."""
        yield scrapy.Request(url="https://mothership.sg/", callback=self.parse)

    def parse(self, response, **kwargs):
        """Follow every article link found under the ``latest-news`` element."""
        article_links = response.xpath('//*[@id="latest-news"]/div/a/@href').getall()

        for url in article_links:
            # dont_filter=True bypasses Scrapy's duplicate-request filter, so a
            # link that appears more than once is requested each time.
            yield response.follow(
                url=url, callback=self.parse_article, dont_filter=True
            )

    def parse_article(self, response):
        """Build and yield one item from an article page.

        Each field is the selected text node, or ``None`` when the expression
        matches nothing.
        """
        # A fresh item per response: reusing one shared instance would make
        # every yielded item the same object, overwritten by later responses.
        item = AoscraperItem()
        item["title"] = response.xpath("//h1/text()").get()
        item["author"] = response.xpath("//span[@class='author-name']/text()").get()
        # The date is the second text node matched across all publish-date
        # elements (XPath positions are 1-based).
        item["date"] = response.xpath(
            '(//*[@class="publish-date"]/text())[2]'
        ).get()

        yield item

Scrapy 没有抓取任何内容

问题描述

2 个解决方案

解决方案1
0 2021-07-09 08:34:16

解决方案2
0 2021-07-09 09:26:22

Scrapy 没有抓取任何内容

问题描述

2 个解决方案

解决方案1 0 2021-07-09 08:34:16

解决方案2 0 2021-07-09 09:26:22

解决方案1
0 2021-07-09 08:34:16

解决方案2
0 2021-07-09 09:26:22