I am trying to write a crawler using Scrapy/Python that reads some values from a page.
I then want this crawler to store the highest and lowest values in separate fields.
So far, I am able to read the values from the page (please see my code below), but I am not sure how to calculate the lowest and highest values and store them in separate fields.
For example, say the crawler reads the page and returns these values
So I want to populate ....
How do I do that? Do I need to use an array, i.e. put all the values in an array and then pick the highest and lowest?
Any help is much appreciated.
Here is my code so far. I am storing -1 in case of missing values.
class MySpider(BaseSpider):
    """Spider that reads course scores from a page's ``<meta>`` tags.

    Each parsed page produces one item containing the five individual
    scores (``courset1score`` .. ``courset5score``), the highest and lowest
    of the scores that were actually present, and the page title and URL.
    """

    name = "courses"
    start_urls = ['http://www.example.com/courses-listing']
    allowed_domains = ["example.com"]

    def parse(self, response):
        """Yield a single item dict built from the page's ``<meta>`` tags.

        A score whose ``<meta>`` tag is absent or empty is stored as -1.
        ``highestscore`` and ``lowestscore`` are computed only from the
        scores that were present, so the -1 placeholder never shows up as
        the lowest score; when no score is present at all, both are -1.
        """
        # The XPaths below are absolute (they start with "//"), so they
        # always search the whole document. Looping over every <meta> node
        # would therefore just rebuild the same dict once per tag; a single
        # pass over the response is enough.
        d = {}
        present = []
        for n in range(1, 6):
            raw = response.xpath(
                '//meta[@name="t%d-score"]/@content' % n
            ).extract_first('').strip()
            if raw:
                score = float(raw)
                present.append(score)
            else:
                # Placeholder for a missing value.
                score = -1.0
            d['courset%dscore' % n] = score

        # Compute the extremes before adding the text fields: max()/min()
        # cannot compare strings (or None) with floats.
        d['highestscore'] = max(present) if present else -1.0
        d['lowestscore'] = min(present) if present else -1.0

        d['pagetitle'] = response.xpath(
            '//meta[@name="pagetitle"]/@content'
        ).extract_first()
        d['pageurl'] = response.xpath(
            '//meta[@name="pageurl"]/@content'
        ).extract_first()

        # Link following was disabled in the original code; re-enable it
        # by uncommenting the loop below.
        # for url in response.xpath('//ul[@class="scrapy"]/li/a/@href').extract():
        #     yield Request(response.urljoin(url), callback=self.parse)

        yield d
Build the dictionary before the yield statement. This will let you reference the values already in the dictionary.
for courses in response.xpath("//meta"):
d = {'courset1score': float(courses.xpath('//meta[@name="t1-score"]/@content').extract_first('').strip() or -1),
'courset2score': float(courses.xpath('//meta[@name="t2-score"]/@content').extract_first('').strip() or -1),
'courset3score': float(courses.xpath('//meta[@name="t3-score"]/@content').extract_first('').strip() or -1),
'courset4score': float(courses.xpath('//meta[@name="t4-score"]/@content').extract_first('').strip() or -1),
'courset5score': float(courses.xpath('//meta[@name="t5-score"]/@content').extract_first('').strip() or -1),
}
d['highestscore'] = max(d.values())
d['lowestscore'] = min(d.values())
yield d
Assume we have this example HTML document:
# Sample HTML used by the snippets that follow: four <meta> tags named
# t1-score .. t4-score whose content attributes hold the scores as strings.
body = """
<meta name="t1-score" content="10"></meta>
<meta name="t2-score" content="20"></meta>
<meta name="t3-score" content="5"></meta>
<meta name="t4-score" content="8"></meta>
"""
# Build a selector from the raw HTML text instead of a downloaded response.
# NOTE(review): Selector is presumably scrapy's (or parsel's) Selector - the
# import is not shown here.
sel = Selector(text=body)
We can extract the scores, convert them to numbers, and use the built-in `min` and `max` functions.
# This XPath selects the content of every <meta> whose name matches
# "t<digit>-score", so it picks up any score tag.
# The pattern is written as a raw string so that the backslash in \d is
# passed through to the regex engine unchanged; in a normal string literal
# "\d" is an invalid escape sequence and triggers a warning.
scores = sel.xpath(r"//meta[re:test(@name, 't\d-score')]/@content").extract()
# ['10', '20', '5', '8']
# The extracted values are strings; convert them before comparing.
scores = [float(score) for score in scores]
# [10.0, 20.0, 5.0, 8.0]
min(scores)
# 5.0
max(scores)
# 20.0
Combining output:
# Collect the aggregates and the individual scores into one item.
# max()/min() raise ValueError when ``scores`` is empty.
item = {
    'max_score': max(scores),
    'min_score': min(scores),
}
# Individual scores are keyed by their position in the extracted list,
# starting at 0: score0, score1, ...
for i, score in enumerate(scores):
    item['score{}'.format(i)] = score
The technical posts on this site are licensed under CC BY-SA 4.0. If you reprint them, please indicate the site URL or the original address. For any questions, please contact: yoyou2525@163.com.