I am very new to web scraping , and I am trying to scrape this online forum: https://community.whattoexpect.com/forums/postpartum-depression.html
It is a two-level site where the main page is a list of discussion posts, and you can click on each post to get the full content and see the reply comments. The main site also has pagination.
I want my final CSV to look something like this:
The idea is to have the main post in one row, and then the replies in the next rows. I will be using the same ID for main post and replies, so that they can be linked.
Here is my Scrapy spider so far:
import scrapy
import datetime
class PeripartumSpider(scrapy.Spider):
name = 'peripartum'
start_urls = ['http://www.community.whattoexpect.com/forums/postpartum-depression.html']
def parse(self, response):
for post_link in response.xpath('//*[@id="group-discussions"]/div[3]/div/div/a/@href').extract():
link = response.urljoin(post_link)
yield scrapy.Request(link, callback=self.parse_thread)
# Checks if the main page has a link to next page if True keep parsing.
next_page = response.xpath('(//a[@class="page-link"])[1]/@href').extract_first()
if next_page:
yield scrapy.Request(next_page, callback=self.parse)
# Going into each post and extracting information.
def parse_thread(self, response):
original_post = response.xpath("//*[@class='__messageContent fr-element fr-view']/p/text()").extract()
title = response.xpath("//*[@class='discussion-original-post__title']/text()").extract_first()
author_name = response.xpath("//*[@class='discussion-original-post__author__name']/text()").extract_first()
unixtime = response.xpath("//*[@class='discussion-original-post__author__updated']/@data-date").extract_first()
unixtime = int(unixtime) / 1000 # Removing milliseconds
timestamp = datetime.datetime.utcfromtimestamp(unixtime).strftime("%m/%d/%Y %H:%M")
replies_list = response.xpath("//*[@class='discussion-replies__list']").getall()
# Getting the comments and their information for each post
reply_post = response.xpath(".//*[@class='wte-reply__content__message __messageContent fr-element fr-view']/p/text()").extract()
reply_author = response.xpath("//*[@class='wte-reply__author__name']/text()").extract()
reply_time = response.xpath("//*[@class='wte-reply__author__updated']/@data-date").extract()
for reply in reply_time:
reply_date = int(reply_time) / 1000 # Removing milliseconds
reply_timestamp = datetime.datetime.utcfromtimestamp(reply_date).strftime("%m/%d/%Y %H:%M")
yield {
"title": title,
"author_name": author_name,
"time": timestamp,
"post": original_post,
"reply_author": reply_author,
"reply_timestamp": reply_timestamp,
"replies": reply_post
}
When I try to run my spider, I am getting 0 crawls. I am not sure if I am correctly following the links to each post. And, should I use something like Python's CSV library to get the comments to load into the next row but with the original post ID?
You have to take care about
May there is a better coding than the following, just identifying n
comments and after that looping over the comments. In this case you don't need to zip
the lists together. But you could use it as a starting point
import scrapy
import datetime
class PeripartumSpider(scrapy.Spider):
name = 'peripartum'
start_urls = ['https://community.whattoexpect.com/forums/postpartum-depression.html']
def parse(self, response):
for post_link in response.xpath('//*[@id="group-discussions"]/div[3]/div/div/a/@href').extract():
link = response.urljoin(post_link)
yield scrapy.Request(link, callback=self.parse_thread)
# Checks if the main page has a link to next page if True keep parsing.
next_page = response.xpath('(//a[@class="page-link"])[1]/@href').extract_first()
if next_page:
yield scrapy.Request(next_page, callback=self.parse)
# Going into each post and extracting information.
def parse_thread(self, response):
original_post = response.xpath("//*[@class='__messageContent fr-element fr-view']/p/text()").extract()
title = response.xpath("//*[@class='discussion-original-post__title']/text()").extract_first()
author_name = response.xpath("//*[@class='discussion-original-post__author__name']/text()").extract_first()
unixtime = response.xpath("//*[@class='discussion-original-post__author__updated']/@data-date").extract_first()
unixtime = int(unixtime) / 1000 # Removing milliseconds
timestamp = datetime.datetime.utcfromtimestamp(unixtime).strftime("%m/%d/%Y %H:%M")
replies_list = response.xpath("//*[@class='discussion-replies__list']").getall()
# Getting the comments and their information for each post
replies_post = response.xpath(".//*[@class='wte-reply__content__message __messageContent fr-element fr-view']/p/text()").extract()
replies_author = response.xpath("//*[@class='wte-reply__author__name']/text()").extract()
replies_time = response.xpath("//*[@class='wte-reply__author__updated']/@data-date").extract()
replies = zip(replies_post, replies_author, replies_time)
for reply_post, reply_author, reply_time in replies:
reply_date = int(reply_time) / 1000 # Removing milliseconds
reply_timestamp = datetime.datetime.utcfromtimestamp(reply_date).strftime("%m/%d/%Y %H:%M")
yield {
"title": title,
"author_name": author_name,
"time": timestamp,
"post": original_post,
"reply_author": reply_author,
"reply_timestamp": reply_timestamp,
"replies": reply_post
}
You may also have to take care about pagination in comments.
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.