簡體   English   中英

遞歸運行所有yield請求的Scrapy輸出文件-如何

[英]Scrapy output file that recursively runs all the yield requests - how to

因此,我有一個 Scrapy 蜘蛛(爬蟲),如下所示:

class CoursesSpider(scrapy.Spider):
name = "courses"
start_urls = [
    'http://example.com'
]

def parse(self, response):
    # Iterate every <subject> element on the start page.
    for subject in response.css('subject'):

        subject_name = subject.css('subject::text').extract_first().strip()
        subject_link = subject.css('subject::attr(href)').extract_first().strip()
        subject_id = subject.css('subject::attr(id)').extract_first().strip()

        # NOTE(review): if subject_link is None, subject_data is never bound
        # and the yield below raises NameError on the first such subject.
        if subject_link is not None:
            subject_data = scrapy.Request(subject_link, callback=self.parse_course)

        # NOTE(review): this is the defect the question describes —
        # subject_data is a Request OBJECT placed inside an item dict.
        # The exporter serializes it as its repr ("<Request GET ...>")
        # and the request is never scheduled because it is not yielded
        # on its own. Requests must be yielded directly to be fetched.
        yield {
            'subject_name': subject_name,
            'subject_link': subject_link,
            'subject_id': subject_id,
            'subject_data': subject_data,
        }


def parse_course(self, response):

    # id attribute of the subject page currently being parsed
    subject_id = response.css('::attr(id)').extract_first().strip()

    for course in response.css('course'):

        course_name = course.css('course::text').extract_first().strip()
        course_link = course.css('course::attr(href)').extract_first().strip()
        course_id = course.css('course::attr(id)').extract_first().strip()

        # NOTE(review): same defects as parse() — course_data is unbound
        # when course_link is None, and the Request object below is
        # serialized as a repr string instead of being scheduled.
        if course_link is not None:
            course_data = scrapy.Request(course_link, callback=self.parse_class)

        yield {
            'course_name': course_name,
            'course_link': course_link,
            'course_id': subject_id + " " + course_id,
            'course_data': course_data,
        }

def parse_class(self, response):

    # id attribute of the course page currently being parsed
    course_id = response.css('::attr(id)').extract_first().strip()

    # Leaf level: sections carry no further Requests, so these items
    # export correctly as plain dicts.
    for section in response.css('section'):
        section_name = section.css('section::text').extract_first().strip()
        section_link = section.css('section::attr(href)').extract_first().strip()

        yield {
            'section_name': section_name,
            'section_link': section_link,
            'course_id': course_id,
        }

我想獲得一個輸出json文件,它具有如下的樹形結構:

{"subject_id": "...", "subject_name": "...", "subject_link": "...", "subject_data": 
  {"course_id": "...", "course_link": "...", "course_name": "...", "course_data": 
    {"course_id": "...", "section_link": "...", "section_name": "..."}
  }
}

但是我只得到這個:

{"subject_id": "...", "subject_data": "<Request GET http://example.com/something>", "subject_name": "...", "subject_link": "..."}

據我了解,這是因為 yield 的請求尚未被執行。 我要如何調用類似「scrapy crawl courses -o courses.json」的命令,使其完整執行所有請求? 如果這不可能,我可以稍後將該 json 導入 Python 檔案中,並以某種方式執行其中的 <Request GET http://example.com/something> 等請求嗎?

我知道有很多代碼,但是應該澄清一下。 謝謝你的幫助!

我看到兩種方法:

  1. 要么增量構建數據,然后使用Request.meta dict將數據傳遞給每個回調。 請參閱將其他數據傳遞給回調函數

要么

  2. 使用諸如scrapy-inline-requests之類的東西(待測試)

方法1

class CoursesSpider(scrapy.Spider):
    """Crawl subjects -> courses -> sections, carrying previously scraped
    data down the callback chain via each Request's ``meta`` dict.

    Only the leaf callback (``parse_class``) yields items; each item
    embeds the course/subject info collected on the way down.
    """

    name = "courses"
    start_urls = [
        'http://example.com'
    ]

    def parse(self, response):
        """Extract each subject and request its course page."""
        for subject in response.css('subject'):

            subject_name = subject.css('subject::text').extract_first().strip()
            subject_link = subject.css('subject::attr(href)').extract_first().strip()
            subject_id = subject.css('subject::attr(id)').extract_first().strip()

            # No link means there is nothing to follow for this subject.
            # (The original bound subject_data only inside the `if`, so a
            # missing link raised NameError further down.)
            if subject_link is None:
                continue

            subject_data = scrapy.Request(subject_link, callback=self.parse_course)

            # build a dict with the info we have so far
            subject_info = {
                'subject_name': subject_name,
                'subject_link': subject_link,
                'subject_id': subject_id,
            }
            # add this to the Request's meta dict so the callback can read it
            subject_data.meta['subject_info'] = subject_info

            # ask Scrapy to fetch additional data
            yield subject_data

    def parse_course(self, response):
        """Extract each course on a subject page and request its class page."""
        # get back the data that was passed previously
        subject_info = response.request.meta['subject_info']

        subject_id = response.css('::attr(id)').extract_first().strip()

        for course in response.css('course'):

            course_name = course.css('course::text').extract_first().strip()
            course_link = course.css('course::attr(href)').extract_first().strip()
            course_id = course.css('course::attr(id)').extract_first().strip()

            # Skip courses without a followable link (see note in parse()).
            if course_link is None:
                continue

            course_data = scrapy.Request(course_link, callback=self.parse_class)

            # build a dict with the data in this page
            # + the data scraped previously
            course_info = {
                'course_name': course_name,
                'course_link': course_link,
                'course_id': subject_id + " " + course_id,
                'subject_info': subject_info,
            }

            # BUGFIX: the original assigned subject_info here, which threw
            # away all course-level data before parse_class ever ran.
            course_data.meta['course_info'] = course_info

            # fetch the class page
            yield course_data

    def parse_class(self, response):
        """Yield one item per section, embedding the accumulated course info."""
        # get course data from previous callbacks
        course_info = response.request.meta['course_info']

        course_id = response.css('::attr(id)').extract_first().strip()

        for section in response.css('section'):
            section_name = section.css('section::text').extract_first().strip()
            section_link = section.css('section::attr(href)').extract_first().strip()

            yield {
                'section_name': section_name,
                'section_link': section_link,
                'course_id': course_id,
                'course_info': course_info
            }

因此,您將不會獲得包含課程的主題,課程本身包含部分,而是包含部分,每個部分都包含有關其所屬課程的信息,而自身也包含與課程相關的信息。

方法2。(警告:我尚未在實踐中對此進行測試,但可能會起作用)

from inline_requests import inline_requests

class CoursesSpider(scrapy.Spider):
    """Build the complete nested subject/course/section tree inside a single
    callback, using scrapy-inline-requests to fetch sub-pages inline.

    Each yielded item is one subject whose ``subject_data`` list nests the
    courses, which in turn nest their sections.
    """

    name = "courses"
    start_urls = [
        'http://example.com'
    ]

    # this decorator is important: it makes `yield scrapy.Request(...)`
    # hand back the fetched Response right here, without a callback
    @inline_requests
    def parse(self, response):

        for subject in response.css('subject'):

            subject_name = subject.css('subject::text').extract_first().strip()
            subject_link = subject.css('subject::attr(href)').extract_first().strip()
            subject_id = subject.css('subject::attr(id)').extract_first().strip()

            # this list will collect information on courses for this subject
            subject_data = []

            # (The original wrapped the bodies below in `try: ... except:
            # raise` — a bare except that immediately re-raises is a no-op,
            # so the wrappers were removed.)
            if subject_link is not None:
                # you ask scrapy to fetch the page but set no callback,
                # and you get the Response to work on when it's fetched
                subject_response = yield scrapy.Request(subject_link)

                subject_id = subject_response.css('::attr(id)').extract_first().strip()

                for course in subject_response.css('course'):

                    course_name = course.css('course::text').extract_first().strip()
                    course_link = course.css('course::attr(href)').extract_first().strip()
                    course_id = course.css('course::attr(id)').extract_first().strip()

                    # this list will collect information on sections for this course
                    course_data = []
                    if course_link is not None:
                        # same thing here: fetch the course page inline
                        course_response = yield scrapy.Request(course_link)

                        course_id = course_response.css('::attr(id)').extract_first().strip()

                        for section in course_response.css('section'):
                            section_name = section.css('section::text').extract_first().strip()
                            section_link = section.css('section::attr(href)').extract_first().strip()

                            # add each section item
                            course_data.append(
                                {
                                    'section_name': section_name,
                                    'section_link': section_link,
                                    'course_id': course_id,
                                }
                            )

                    # add each course item
                    subject_data.append(
                        {
                            'course_name': course_name,
                            'course_link': course_link,
                            'course_id': subject_id + " " + course_id,
                            'course_data': course_data,
                        }
                    )

            yield {
                'subject_name': subject_name,
                'subject_link': subject_link,
                'subject_id': subject_id,
                'subject_data': subject_data,
            }

暫無
暫無

聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.

 
粵ICP備18138465號  © 2020-2024 STACKOOM.COM