I'm trying to get data from a website that requires me to follow 2 URLs before scraping the data.
The goal is to get an exported file that looks like this:
My code is as follows:
import scrapy
from scrapy.item import Item, Field
from scrapy import Request
class myItems(Item):
info1 = Field()
info2 = Field()
info3 = Field()
info4 = Field()
class mySpider(scrapy.Spider):
name = 'techbot'
start_urls = ['']
def parse(self, response):
#Extracts first link
items = []
list1 = response.css("").extract() #extract all info from here
for i in list1:
link1 = 'https:...' + str(i)
request = Request(link1, self.parseInfo1, dont_filter =True)
request.meta['item'] = items
yield request
yield items
def parseInfo1(self, response):
#Extracts second link
item = myItems()
items = response.meta['item']
list1 = response.css("").extract()
for i in list1:
link1 = '' + str(i)
request = Request(link1, self.parseInfo2, dont_filter =True)
request.meta['item'] = items
items.append(item)
return request
def parseInfo2(self, response):
#Extracts all data
item = myItems()
items = response.meta['item']
item['info1'] = response.css("").extract()
item['info2'] = response.css("").extract()
item['info3'] = response.css("").extract()
item['info4'] = response.css("").extract()
items.append(item)
return items
I've executed the spider in the terminal with the command:
scrapy crawl techbot
The data I get is out of order, and with gaps like this:
For example it scrapes the first set of data multiple times and the rest is out of order.
If anyone could point me in the direction to get the results in a cleaner format as shown in the beginning that would be greatly appreciated.
Thanks
Solved it by consolidating the following of both links into one function instead of two. My spider is working now as follows:
class mySpider(scrapy.Spider):
name = 'techbot'
start_urls = ['']
def parse(self, response):
#Extracts links
items = []
list1 = response.css("").extract()
for i in list1:
link1 = 'https:...' + str(i)
request = Request(link2, self.parse, dont_filter =True)
request.meta['item'] = items
yield request
list2 = response.css("").extract()
for i in list2:
link2 = '' + str(i)
request = Request(link1, self.parseInfo2, dont_filter =True)
request.meta['item'] = items
yield request
yield items
def parseInfo2(self, response):
#Extracts all data
item = myItems()
items = response.meta['item']
item['info1'] = response.css("").extract()
item['info2'] = response.css("").extract()
item['info3'] = response.css("").extract()
item['info4'] = response.css("").extract()
items.append(item)
return items
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.