[英]Scrapy not scraping
我正在尝试从一个网站上抓取新闻,但我编写的爬虫(spider)运行时没有抓取到任何内容,日志中只有如下信息:`INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)`。
下面是我的代码:
import scrapy
from ..items import AoscraperItem
# Retained so the module-level name ``items`` stays available; the spider
# below no longer writes to it.
items = AoscraperItem()


class AoSpider(scrapy.Spider):
    """Crawl the mothership.sg front page and scrape each linked article.

    The front page is requested once, every article link matched by the
    XPath in :meth:`parse` is followed, and :meth:`parse_article` yields one
    item per article response.
    """

    name = "ao_spider"

    def start_requests(self):
        """Yield the single seed request for the site's front page."""
        yield scrapy.Request(url="https://mothership.sg/", callback=self.parse)

    def parse(self, response, **kwargs):
        """Yield a follow-up request for every article link on the page.

        Args:
            response: The downloaded front-page response.
            **kwargs: Extra callback arguments; unused here.
        """
        # NOTE: ``@class='ind-article'`` only matches elements whose class
        # attribute is exactly that string; a div carrying additional
        # classes is not selected by this expression.
        article_links = response.xpath("//div[@class='ind-article']/a/@href")
        for url in article_links.getall():
            yield response.follow(url=url, callback=self.parse_article)

    def parse_article(self, response):
        """Yield one item holding the article's title, author and date.

        Args:
            response: The downloaded article page.
        """
        # Build a new item per response. Mutating and re-yielding a single
        # module-level instance makes every yielded item alias the same
        # object, so later articles can overwrite earlier ones.
        item = AoscraperItem()
        item["title"] = response.xpath("//h1/text()").get()
        item["author"] = response.xpath(
            "//span[@class='author-name']/text()"
        ).get()
        item["date"] = response.xpath(
            "//span[@class='publish-date']/text()"
        ).get()
        yield item
我无法弄清楚为什么它不会在网站上抓取任何内容。
如果有人可以提供帮助,真的很感激。
在 `parse` 函数中提取链接时,您使用的 XPath 不正确。它应该是:`article_links = response.xpath("//div[contains(@class,'ind-article')]/a/@href")`
或者您可以使用以下修改后的代码。
代码
import scrapy
from ..items import AoscraperItem
# Retained so the module-level name ``items`` stays available; the spider
# below no longer writes to it.
items = AoscraperItem()


class AoSpider(scrapy.Spider):
    """Crawl the mothership.sg front page and scrape each linked article.

    The front page is requested once, every article link matched by the
    XPath in :meth:`parse` is followed, and :meth:`parse_article` yields one
    item per article response.
    """

    name = "ao_spider"

    def start_requests(self):
        """Yield the single seed request for the site's front page."""
        yield scrapy.Request(url="https://mothership.sg/", callback=self.parse)

    def parse(self, response, **kwargs):
        """Yield a follow-up request for every article link on the page.

        Args:
            response: The downloaded front-page response.
            **kwargs: Extra callback arguments; unused here.
        """
        # ``contains(@class, ...)`` matches divs whose class attribute
        # includes 'ind-article' alongside any other class names.
        article_links = response.xpath(
            "//div[contains(@class,'ind-article')]/a/@href"
        )
        for url in article_links.getall():
            yield response.follow(
                url=url,
                callback=self.parse_article,
                dont_filter=True,
            )

    def parse_article(self, response):
        """Yield one item holding the article's title, author and date.

        Args:
            response: The downloaded article page.
        """
        # Build a new item per response. Mutating and re-yielding a single
        # module-level instance makes every yielded item alias the same
        # object, so later articles can overwrite earlier ones.
        item = AoscraperItem()
        item["title"] = response.xpath("//h1/text()").get()
        item["author"] = response.xpath(
            "//span[@class='author-name']/text()"
        ).get()
        item["date"] = response.xpath(
            "//span[@class='publish-date']/text()"
        ).get()
        yield item
希望这样可以正常工作。
import scrapy
from ..items import AoscraperItem
# Retained so the module-level name ``items`` stays available; the spider
# below no longer writes to it.
items = AoscraperItem()


class AoSpider(scrapy.Spider):
    """Crawl the mothership.sg front page and scrape each linked article.

    The front page is requested once, every article link under the
    ``latest-news`` element is followed, and :meth:`parse_article` yields
    one item per article response.
    """

    name = "ao_spider"

    def start_requests(self):
        """Yield the single seed request for the site's front page."""
        yield scrapy.Request(url="https://mothership.sg/", callback=self.parse)

    def parse(self, response, **kwargs):
        """Yield a follow-up request for every article link on the page.

        Args:
            response: The downloaded front-page response.
            **kwargs: Extra callback arguments; unused here.
        """
        # Links are anchored on the element with id "latest-news" rather
        # than on a class name.
        article_links = response.xpath('//*[@id="latest-news"]/div/a/@href')
        for url in article_links.getall():
            yield response.follow(
                url=url,
                callback=self.parse_article,
                dont_filter=True,
            )

    def parse_article(self, response):
        """Yield one item holding the article's title, author and date.

        Args:
            response: The downloaded article page.
        """
        # Build a new item per response. Mutating and re-yielding a single
        # module-level instance makes every yielded item alias the same
        # object, so later articles can overwrite earlier ones.
        item = AoscraperItem()
        item["title"] = response.xpath("//h1/text()").get()
        item["author"] = response.xpath(
            "//span[@class='author-name']/text()"
        ).get()
        # The parenthesised expression collects every text node under
        # elements with class "publish-date"; [2] picks the second of them.
        item["date"] = response.xpath(
            '(//*[@class="publish-date"]/text())[2]'
        ).get()
        yield item
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.