I am trying to parse a public forum that contains multiple threads. I need to store metadata of that thread. These metadata appear before getting inside the thread ie in the page which displays the list of discussion threads.
In my scrapy code below, I need to access values from parse()
method in parse_contents()
method. I am storing those values in class variables but the parse_contents()
picks up the first value that was assigned the very first time although the new value has been assigned before calling parse_contents()
.
Here is my spider class
import scrapy
import re
import pandas as pd
import time
from functools import reduce
from ..items import PostsItem
class SpiderSpider(scrapy.Spider):
name = 'posts'
page_count = 1
forum_count = 0
#Create an item container to store all this data
post_item = PostsItem()
# I want these variables to parse_contents() method
post_subject_last_message_date = ""
total_posts = 0
start_urls = [
# 'https://www.dcurbanmom.com/jforum/posts/list/150/946237.page'
'https://www.dcurbanmom.com/jforum/forums/show/32.page'
]
# Grabs the list of threads in the DCPS forum
def parse(self, response):
for next_forum in response.xpath('//span[@class="topictitle"]'):
next_forum_link = next_forum.xpath('.//a/@href')
next_forum_url = response.urljoin(next_forum_link.extract_first())
last_message = next_forum.xpath('.//ancestor::td[1]/following-sibling::td[4]/span/text()')
self.post_subject_last_message_date = last_message.get() #This needs to be picked up by parse_contents
yield scrapy.Request(url = next_forum_url, callback=self.parse_contents)
#Get next page of duscussion threads list
#Some code here
#Parses individual discussion thread
def parse_contents(self, response):
all_posts = response.xpath('//table[@class="forumline"]//tr')
post_text = ""
for post in all_posts:
post_text_response = post.xpath(".//div[@class='postbody']/br/following-sibling::text()[1] | .//div[@class='postbody']/br/following-sibling::a[1]/text() | .//div[@class='postbody']/text() | .//div[@class='postbody']/a/text()")
if(len(post_text_response.getall())>0):
post_text = "".join(re.sub('\r','',x) for x in post_text_response.getall()).strip()
#Populate the item container
if(bool(re.search(r'^\s*$', post_text))==False):
self.post_item['post_message'] = post_text
# !!! This is not picking up the value updated in the parse method !!!
self.post_item['post_subject_last_message_date'] = self.post_subject_last_message_date
post_text = ""
yield(self.post_item)
# Go to next page in this discussion thread
# Some code here
How can I fix this?
Edit: removed some lines of code to make it easier to read
replacing yield scrapy.Request(url = next_forum_url, callback=self.parse_contents)
with the following fixed it for me
yield scrapy.Request(url = next_forum_url, callback=self.parse_contents, cb_kwargs = {
'post_subject_answers': post_subject_answer,
'post_subject_first_post_date':post_subject_first_post_date,
'post_subject_views':post_subject_views,
'post_subject_last_message_date':post_subject_last_message_date
})
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.