import scrapy
from ex.items import ExItem
class reddit(scrapy.Spider):
    """Spider that collects post data from the reddit front page.

    One ``ExItem`` is produced per response; every field of that item holds
    the list of all matches found for the field's XPath expression.
    """

    # Identifier Scrapy uses to refer to this spider.
    name = "dmoz"
    allowed_domains = ["reddit.com"]
    start_urls = [
        "http://www.reddit.com/"]

    def parse(self, response):
        """Extract the post fields from ``response`` and return them as one item.

        Args:
            response: the downloaded page; only its ``xpath`` method is used.

        Returns:
            An ``ExItem`` whose fields are lists of extracted strings.
        """
        # NOTE: every expression starts with ``//`` on the response itself, so
        # the whole document is searched, not only the main post listing.
        item = ExItem()
        item["title"] = response.xpath('//p[contains(@class,"title")]/a/text()').extract()
        item["rank"] = response.xpath('//span[contains(@class,"rank")]/text()').extract()
        item["votes_dislike"] = response.xpath('//div[contains(@class,"score dislikes")]/text()').extract()
        item["votes_unvoted"] = response.xpath('//div[contains(@class,"score unvoted")]/text()').extract()
        item["votes_likes"] = response.xpath('//div[contains(@class,"score likes")]/text()').extract()
        item["video_reference"] = response.xpath('//a[contains(@class,"thumbnail may-blank")]/@href').extract()
        item["image"] = response.xpath('//a[contains(@class,"thumbnail may-blank")]/img/@src').extract()
        # Without returning the item the callback yields nothing to Scrapy.
        return item
I am able to export this to JSON, but the output contains a bullet character. How can I remove it and still keep the JSON format?
There are hidden elements that you don't see in the browser, but Scrapy sees them.
You just need to search for the data inside the relevant part of the page (the div with id="siteTable"):
def parse(self, response):
    """Build one ``ExItem`` from the ``siteTable`` section of the page.

    Every field is filled with the list of strings extracted by a relative
    XPath evaluated inside ``//div[@id="siteTable"]``.
    """
    # Restrict all lookups to the listing container.
    listing = response.xpath('//div[@id="siteTable"]')

    # (item field, XPath relative to the listing), in assignment order.
    field_queries = (
        ("title", './/p[contains(@class,"title")]/a/text()'),
        ("rank", './/span[contains(@class,"rank")]/text()'),
        ("votes_dislike", './/div[contains(@class,"score dislikes")]/text()'),
        ("votes_unvoted", './/div[contains(@class,"score unvoted")]/text()'),
        ("votes_likes", './/div[contains(@class,"score likes")]/text()'),
        ("video_reference", './/a[contains(@class,"thumbnail may-blank")]/@href'),
        ("image", './/a[contains(@class,"thumbnail may-blank")]/img/@src'),
    )

    item = ExItem()
    for field, query in field_queries:
        item[field] = listing.xpath(query).extract()
    return item
I tested this; here is what I get for votes_likes, for example:
'votes_likes': [u'5340',
u'4041',
u'4080',
u'5055',
u'4385',
u'4784',
u'3842',
u'3734',
u'4081',
u'3731',
u'4580',
u'5279',
u'2540',
u'4345',
u'2068',
u'3715',
u'3249',
u'4232',
u'4025',
u'522',
u'2993',
u'2789',
u'3529',
u'3450',
u'3533'],
The technical posts on this site are licensed under CC BY-SA 4.0. If you reprint them, please cite the site URL or the original address. For any questions, please contact: yoyou2525@163.com.