I made a web spider that scrapes all links in a website using Scrapy, and I would like to collect every scraped link into a single list. However, every page scraped prints its own separate list instead of adding to one combined list. This is my code:
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.crawler import CrawlerProcess
import sys
process = CrawlerProcess()
class Crawler(CrawlSpider):
name = "LinkCrawler"
start_urls = ['https://books.toscrape.com/']
allowed_domains = ['books.toscrape.com']
rules = [Rule(LinkExtractor('/catalogue/'), callback='parse_links', follow=True)]
def parse_links(self, response):
list = []
href = response.xpath('//a/@href').getall()
for link in href:
list.append(link)
print(list)
print('')
def file():
original_stdout = sys.stdout
with open('file.txt', 'w') as f:
sys.stdout = f
process.crawl(Crawler)
process.start()
sys.stdout = original_stdout
f.close()
file()
This is a sample outcome:
['../../../index.html', '../../../index.html', 'index.html', '../books/travel_2/index.html', '../books/mystery_3/index.html']
['../../../1000-places-to-see-before-you-die_1/index.html', '../../../1000-places-to-see-before-you-die_1/index.html']
['../../../old-records-never-die-one-mans-quest-for-his-vinyl-and-his-past_39/index.html', '../../../forever-rockers-the-rocker-12_19/index.html', '../../../forever-rockers-the-rocker-12_19/index.html']
However, I would like it to be:
['../../../index.html', '../../../index.html', 'index.html', '../books/travel_2/index.html', '../books/mystery_3/index.html', '../../../1000-places-to-see-before-you-die_1/index.html', '../../../1000-places-to-see-before-you-die_1/index.html', '../../../old-records-never-die-one-mans-quest-for-his-vinyl-and-his-past_39/index.html', '../../../forever-rockers-the-rocker-12_19/index.html', '../../../forever-rockers-the-rocker-12_19/index.html']
To fix this, you can create a single module-level list and append to it from the callback. Because `append` mutates the list in place (rather than rebinding the name), no `global` statement is needed inside the method.
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.crawler import CrawlerProcess
process = CrawlerProcess()
list = []
class Crawler(CrawlSpider):
name = "LinkCrawler"
start_urls = ['https://books.toscrape.com/']
allowed_domains = ['books.toscrape.com']
rules = [Rule(LinkExtractor('/catalogue/'), callback='parse_links', follow=True)]
def parse_links(self, response):
href = response.xpath('//a/@href').getall()
for link in href:
list.append(link)
print(list)
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.