繁体   English   中英

为什么 Scrapy 不跳转到下一页?

[英]Why won't Scrapy go to the next page?

我很困惑为什么 Scrapy 不会在以下代码中提取下一页的链接。 我相信这可能与每个链接都有一个index.php的 URL 的事实有关。 它是否不起作用,因为我必须在每个后续请求中重新提交原始Request正文和标头?

import scrapy
import re
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

# HTTP headers for the class-search POST request; mimics Safari on macOS so
# the server treats us like a browser.
all_class_headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    # Declared once — the original dict repeated this key, and the duplicate
    # was silently collapsed by dict-literal semantics.
    'Content-Type': 'application/x-www-form-urlencoded',
    'Origin': 'https://pisa.ucsc.edu',
    'Accept-Language': 'en-us',
    'Host': 'pisa.ucsc.edu',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15',
    # Fixed typo: was 'pisa.usc.edu' (missing "c"), inconsistent with
    # Host/Origin above and with allowed_domains below.
    'Referer': 'https://pisa.ucsc.edu/class_search/',
    'Accept-Encoding': ['gzip', 'deflate', 'br'],  # Scrapy accepts a list of header values
    'Connection': 'keep-alive',
}

# POSTed form fields for the class-search "results" action.  The key names
# (including the "binds[:...]" bracket syntax) are dictated by the server's
# form and must not be renamed.
data = {
    'action': 'results',
    # Term code for the quarter being searched — presumably 2228 is a UCSC
    # term identifier; TODO confirm against the live form.
    'binds[:term]': '2228',
    'binds[:reg_status]': 'all',
    # Empty strings mean "no filter" for the corresponding search field.
    'binds[:subject]': '',
    'binds[:catalog_nbr_op]': '=',
    'binds[:catalog_nbr]': '',
    'binds[:title]': '',
    'binds[:instr_name_op]': '=',
    'binds[:instructor]': '',
    'binds[:ge]': '',
    'binds[:crse_units_op]': '=',
    'binds[:crse_units_from]': '',
    'binds[:crse_units_to]': '',
    'binds[:crse_units_exact]': '',
    'binds[:days]': '',
    'binds[:times]': '',
    'binds[:acad_career]': '',
    # Instruction-mode flags: include asynchronous, hybrid, synchronous,
    # and in-person sections in the results.
    'binds[:asynch]': 'A',
    'binds[:hybrid]': 'H',
    'binds[:synch]': 'S',
    'binds[:person]': 'P',
}

# Extra paging fields: rec_start = index of first record, rec_dur = page size.
# NOTE(review): defined but never used by the spider below.
page_2_form_data_additions = {'rec_start' : '0', 'rec_dur' : '25'}

def professor_filter(item):
    """Return truthy when *item* looks like an instructor name.

    Accepts either an abbreviated name (a word character followed by a
    period, e.g. "J. Doe") or the literal placeholder "Staff".  Returns
    the re.Match object itself when the pattern hits, otherwise a bool.
    """
    initials = re.search(r'\w\.', item)
    return initials or "Staff" in item

# NOTE(review): never read or written by the spider below — candidate for removal.
last_class_number = 0

# Module-level accumulator: class_number -> details dict, filled by parse_item.
# NOTE(review): shared mutable global; persists across pages and runs.
classDict = {}

class ClassSpider(CrawlSpider):
    """Attempted CrawlSpider for the UCSC class-search results.

    NOTE(review): pagination never happens with this setup — see the
    review comments on ``rules`` and ``start_requests`` below.
    """

    name = "classes"

    allowed_domains = ['pisa.ucsc.edu']

    start_urls = ['https://pisa.ucsc.edu/class_search/index.php']

    # NOTE(review): two problems here.
    # 1. CrawlSpider rules are only applied to responses routed through the
    #    built-in `parse` callback; `start_requests` below sends every
    #    response straight to `parse_item`, so this rule never runs — which
    #    is presumably why the "next" link is never followed.
    # 2. `cb_kwargs=data` would pass every form field as a keyword argument
    #    to `parse_item`, which accepts none of them -> TypeError if the
    #    rule ever did fire.
    rules = (
        Rule(LinkExtractor(restrict_xpaths='//div[@class="row hide-print"]//a', restrict_text='next'), callback='parse_item', follow=True, cb_kwargs=data),
    )
    
    def print_link(self, response):
        # Debug helper: dump the parsed class names for every result row.
        # NOTE(review): not referenced anywhere in this file.
        all_rows = response.xpath('//div[contains(@id, "rowpanel_")]')
        for row in all_rows:
            class_name = row.xpath('.//h2//a/text()').re(r'(?i)(\w+\s\w+)+\s-\s\w+\xa0+([\w\s]+\b)')
            print(class_name)
            print("This activated")

    
    def start_requests(self):
        # Submit the search form as a POST (the endpoint requires the form
        # body, not a plain GET).  See the `rules` note: routing directly to
        # `parse_item` bypasses CrawlSpider's rule machinery.
        urls = ['https://pisa.ucsc.edu/class_search/index.php']

        for url in urls:
            
            yield scrapy.FormRequest(url,
                                 headers=all_class_headers,
                                 formdata=data,
                                 callback=self.parse_item)
    
    
    def parse_item(self, response):
        """Extract every result row on the page into the module-level classDict.

        Returns the (shared, mutable) module-level dict mapping
        class_number -> details.
        """

        #page = response.url.split("/")[-2]
        
        all_rows = response.xpath('//div[contains(@id, "rowpanel_")]')
        
        for row in all_rows:
            # class_name is the list of regex groups matched in the heading link.
            class_name = row.xpath('.//h2//a/text()').re(r'(?i)(\w+\s\w+)+\s-\s\w+\xa0+([\w\s]+\b)')
            # NOTE(review): each `.get()` below returns None when the xpath
            # has no match, so `.strip()` would raise AttributeError and
            # abort the whole page.
            professor = row.xpath('(.//div[@class="panel-body"]//div)[3]/text()').get().strip()
            class_number = row.xpath('(.//div[@class="panel-body"]//div)[2]/a/text()').get().strip()
            time = row.xpath('(.//div[@class="panel-body"]//div[@class="col-xs-6 col-sm-6"])[2]/text()').get().strip()
            location = row.xpath('(.//div[@class="panel-body"]//div[@class="col-xs-6 col-sm-6"])[1]/text()').get().strip()
            online_or_in_person = row.xpath('(.//div[@class="panel-body"]//div[@class="col-xs-6 col-sm-3 hide-print"])[3]/b/text()').get().strip()
            classDict[class_number] = {'professor': professor, 'class_name':class_name, 'time': time, 'location': location , 'online_or_in_person': online_or_in_person}
        
        return classDict

它是否不起作用,因为我必须在每个后续请求中重新提交原始请求正文和标头?

答:是的,您可以在 devtools 中看到。

我认为scrapy.Spider更适合您想要实现的目标。

import scrapy


class ClassSpider(scrapy.Spider):
    """Scrape UCSC's class-search results, paging by re-POSTing the form.

    The search endpoint is a single URL; the site pages by POSTing
    ``action=next`` with a growing ``rec_start`` offset, so a plain
    ``scrapy.Spider`` with explicit FormRequests is used rather than a
    CrawlSpider/LinkExtractor.
    """

    name = "classes"
    allowed_domains = ['pisa.ucsc.edu']
    start_urls = ['https://pisa.ucsc.edu/class_search/index.php']
    # Be polite: throttle requests so the site doesn't reject us.
    custom_settings = {'DOWNLOAD_DELAY': 0.4}

    recNumber = 0   # offset of the first record requested on the next page
    duration = 25   # page size (records per request)

    # Browser-like headers; the endpoint expects a form-encoded POST body.
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Origin': 'https://pisa.ucsc.edu',
        'Accept-Language': 'en-us',
        'Host': 'pisa.ucsc.edu',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15',
        # Fixed typo: was 'pisa.usc.edu' (missing "c"), inconsistent with
        # Host/Origin and allowed_domains.
        'Referer': 'https://pisa.ucsc.edu/class_search/',
        'Accept-Encoding': ['gzip', 'deflate', 'br'],
        'Connection': 'keep-alive',
        'Content-Type': 'application/x-www-form-urlencoded',
    }

    # Base search form; key names (including "binds[:...]") are dictated by
    # the server's form.  Treated as read-only — paging builds copies.
    payload = {
        'action': 'results',
        'binds[:term]': '2228',
        'binds[:reg_status]': 'all',
        'binds[:subject]': '',
        'binds[:catalog_nbr_op]': '=',
        'binds[:catalog_nbr]': '',
        'binds[:title]': '',
        'binds[:instr_name_op]': '=',
        'binds[:instructor]': '',
        'binds[:ge]': '',
        'binds[:crse_units_op]': '=',
        'binds[:crse_units_from]': '',
        'binds[:crse_units_to]': '',
        'binds[:crse_units_exact]': '',
        'binds[:days]': '',
        'binds[:times]': '',
        'binds[:acad_career]': '',
        'binds[:asynch]': 'A',
        'binds[:hybrid]': 'H',
        'binds[:synch]': 'S',
        'binds[:person]': 'P',
    }

    def start_requests(self):
        """Kick off the search with a POST of the full filter form."""
        for url in self.start_urls:
            yield scrapy.FormRequest(url=url,
                                     headers=self.headers,
                                     formdata=self.payload,
                                     callback=self.parse_item)

    def parse_item(self, response):
        """Extract one results page, then request the next page.

        Yields one dict per page mapping class_number -> details; stops
        when a page comes back with no result rows.
        """
        all_rows = response.xpath('//div[contains(@id, "rowpanel_")]')
        classDict = {}

        for row in all_rows:
            class_name = row.xpath('.//h2//a/text()').re(r'(?i)(\w+\s\w+)+\s-\s\w+\xa0+([\w\s]+\b)')
            # default='' keeps a row with a missing field from raising
            # AttributeError on .strip() and aborting the whole page.
            professor = row.xpath('(.//div[@class="panel-body"]//div)[3]/text()').get(default='').strip()
            class_number = row.xpath('(.//div[@class="panel-body"]//div)[2]/a/text()').get(default='').strip()
            time = row.xpath('(.//div[@class="panel-body"]//div[@class="col-xs-6 col-sm-6"])[2]/text()').get(default='').strip()
            location = row.xpath('(.//div[@class="panel-body"]//div[@class="col-xs-6 col-sm-6"])[1]/text()').get(default='').strip()
            online_or_in_person = row.xpath('(.//div[@class="panel-body"]//div[@class="col-xs-6 col-sm-3 hide-print"])[3]/b/text()').get(default='').strip()
            classDict[class_number] = {'professor': professor, 'class_name': class_name, 'time': time, 'location': location, 'online_or_in_person': online_or_in_person}

        # An empty page means we've paged past the last result: stop.
        if not classDict:
            return

        yield classDict

        # Request the next page.  Build a per-request copy of the form data
        # instead of mutating the shared class-level `payload` — in-place
        # mutation leaks paging state into any other in-flight request.
        self.recNumber += self.duration
        next_payload = dict(self.payload,
                            action='next',
                            rec_start=str(self.recNumber),
                            rec_dur=str(self.duration))
        yield scrapy.FormRequest(url=response.url,
                                 headers=self.headers,
                                 formdata=next_payload,
                                 callback=self.parse_item)

暂无
暂无

声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM