[英]scrapy a weird bug code that can't call pipeline
我在运行时编写了一个小蜘蛛,它无法调用管道。
调试一段时间后,我发现了错误代码区域。
Spider的逻辑是,我爬网第一个URL以获取cookie,然后爬网第二个URL以下载带有cookie的代码图片,然后将准备的一些数据发布到第三个URL。 如果我从图片中得到的文本错误,那么我将再次下载以重复发布第三个网址,直到获得正确的文本为止。
让我向您展示代码:
# -*- coding: gbk -*-
import scrapy
from scrapy.http import FormRequest
import json
import os
from datetime import datetime
from scrapy.selector import Selector
from teacherCourse.handlePic import handle
from teacherCourse.items import DetailProfItem
from teacherCourse.items import DetailProfCourseItem
from teacherCourse.items import containItem
class GetTeacherCourseSpider(scrapy.Spider):
name = 'TeacherCourse'
# custom_settings = {
# 'ITEM_PIPELINES': {
# 'teacherCourse.pipelines.TeacherCoursePipeline': 300,
# }
# }
def __init__(self, selXNXQ='', titleCode=''):
self.getUrl = 'http://jwxt.dgut.edu.cn/jwweb/ZNPK/TeacherKBFB.aspx' # first
self.vcodeUrl = 'http://jwxt.dgut.edu.cn/jwweb/sys/ValidateCode.aspx' # second
self.postUrl = 'http://jwxt.dgut.edu.cn/jwweb/ZNPK/TeacherKBFB_rpt.aspx' # third
self.findSessionId = None # to save the cookies
self.XNXQ = selXNXQ
self.titleCode = titleCode
def start_requests(self):
request = scrapy.Request(self.getUrl,
callback = self.downloadPic)
yield request
def downloadPic(self, response):
# download the picture
# find the session id
self.findSessionId = response.headers.getlist('Set-Cookie')[0].decode().split(";")[0].split("=")
request = scrapy.Request(self.vcodeUrl,
cookies= {self.findSessionId[0]: self.findSessionId[1]},
callback = self.getAndHandleYzm)
yield request
def getAndHandleYzm(self, response):
yzm = handle(response.body)
yield FormRequest(self.postUrl,
formdata={'Sel_XNXQ': '20151',
'sel_zc': '011',
'txt_yzm': yzm,
'type': '2'},
headers={
'Referer': 'http://jwxt.dgut.edu.cn/jwweb/ZNPK/TeacherKBFB.aspx',
'Cookie': self.findSessionId[0] + '=' + self.findSessionId[1],
},
callback=self.parse)
def parse(self, response):
body = response.body.decode('gbk')
num = body.find('alert')
if num != -1:
# means CAPTCHA validation fails, need to re-request the CAPTCHA
yield scrapy.Request(self.vcodeUrl+'?t='+'%.f' % (datetime.now().microsecond / 1000),
headers={
'Referer': 'http://jwxt.dgut.edu.cn/jwweb/ZNPK/TeacherKBFB.aspx',
'Cookie': self.findSessionId[0]+'='+self.findSessionId[1]
},
callback=self.getAndHandleYzm)
else:
# parse data
self.parseData(body)
# item = containItem()
# item['first'] = len(body)
# return item
# the parse data part is a little bit long, but it doesn't matter.
# At the last line, I did yield a item
def parseData(self, body):
# parse body data
sel = Selector(text=body)
# get all the note text data
noteTables = sel.xpath('//table[@style="border:0px;"]').extract()
noteList = [] # to store all the note text
for noteTable in noteTables:
if '<b>' in noteTable:
sele = Selector(text = noteTable)
note = (sele.xpath('//table/tr/td/b/text()').extract())
noteText = (sele.xpath('//table/tr/td/text()').extract())
# combine note and noteText
if not noteText:
noteText.append('')
noteText.append('')
else:
if len(noteText) == 1:
noteText.append('')
noteList.append(noteText)
# get all the course data
courseTables = sel.xpath('//table[@class="page_table"]/tbody').extract()
AllDetailCourse = [] # all the teachers' course
for table in courseTables:
everyTeacherC = [] # every teacher's course
s = Selector(text = table)
trs = s.xpath('//tr').extract()
for tr in trs:
sel = Selector(text = tr)
snum = (sel.xpath('//td[1]/text()').extract())
course = (sel.xpath('//td[2]/text()').extract())
credit = (sel.xpath('//td[3]/text()').extract())
teachWay = (sel.xpath('//td[4]/text()').extract())
courseType = (sel.xpath('//td[5]/text()').extract())
classNum = (sel.xpath('//td[6]/text()').extract())
className = (sel.xpath('//td[7]/text()').extract())
stuNum = (sel.xpath('//td[8]/text()').extract())
week = (sel.xpath('//td[9]/text()').extract())
section = (sel.xpath('//td[10]/text()').extract())
location = (sel.xpath('//td[11]/text()').extract())
tmpList = []
tmpList.append(snum)
tmpList.append(course)
tmpList.append(credit)
tmpList.append(teachWay)
tmpList.append(courseType)
tmpList.append(classNum)
tmpList.append(className)
tmpList.append(stuNum)
tmpList.append(week)
tmpList.append(section)
tmpList.append(location)
# to know whether every variable is empty
detailCourse = []
for each in tmpList:
if not each:
each = ''
else:
each = each[0]
detailCourse.append(each)
everyTeacherC.append(detailCourse)
AllDetailCourse.append(everyTeacherC)
# get department, teacher, gender and title
sel = Selector(text = body)
temp1 = sel.xpath('//*[@group="group"]/table/tr/td/text()').extract()
# fill two tables, which will store in the database
i = 0
# every professor
for each in temp1:
tables = containItem() # all the data in every for loop to send to the pipeline
each = each.replace(u'\xa0', u' ')
each = each.split(' ')
depart = each[0].split('£º')
teacher = each[1].split('£º')
gender = each[2].split('£º')
title = each[3].split('£º')
# first table
profItem = DetailProfItem()
profItem['XNXQ'] = self.XNXQ
profItem['department'] = depart[1] # department
profItem['teacher'] = teacher[1] # teacher
profItem['gender'] = gender[1]
profItem['title'] = title[1]
profItem['note1'] = noteList[i][0]
profItem['note2'] = noteList[i][1]
tables['first'] = profItem # add the first table
# second table
# every professor's courses
profCourses = []
for j in range(len(AllDetailCourse[i])): # how many course for every professor
profCourseItem = DetailProfCourseItem() # every course for every professor
profCourseItem['snum'] = AllDetailCourse[i][j][0] # i means i-th professor, j means j-th course, third num means what position of the course
profCourseItem['course'] = AllDetailCourse[i][j][1]
profCourseItem['credit'] = AllDetailCourse[i][j][2]
profCourseItem['teachWay'] = AllDetailCourse[i][j][3]
profCourseItem['courseType'] = AllDetailCourse[i][j][4]
profCourseItem['classNum'] = AllDetailCourse[i][j][5]
profCourseItem['className'] = AllDetailCourse[i][j][6]
profCourseItem['stuNum'] = AllDetailCourse[i][j][7]
profCourseItem['week'] = AllDetailCourse[i][j][8]
profCourseItem['section'] = AllDetailCourse[i][j][9]
profCourseItem['location'] = AllDetailCourse[i][j][10]
profCourses.append(profCourseItem) # every professor's courses
tables['second'] = profCourseItem # add the second table
i += 1
yield tables
任何建议,将不胜感激!
settings.py :(管道部分)
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'teacherCourse.pipelines.TeacherCoursePipeline': 300,
}
items.py :(我认为不重要)
# detail professor course message
class DetailProfCourseItem(scrapy.Item):
snum = scrapy.Field() # serial number
course = scrapy.Field()
credit = scrapy.Field()
teachWay = scrapy.Field()
courseType = scrapy.Field()
classNum = scrapy.Field()
className = scrapy.Field()
stuNum = scrapy.Field()
week = scrapy.Field()
section = scrapy.Field()
location = scrapy.Field()
# the third item which contain first and second item
class containItem(scrapy.Item):
first = scrapy.Field() # for fist table
second = scrapy.Field() # for second table
管道代码:
class TeacherCoursePipeline(object):
def process_item(self, item, spider):
print('I am called!!!!!')
print(item)
return item
当我运行蜘蛛scrapy crawl TeacherCourse
它输出:
2016-07-19 17:39:18 [scrapy] INFO: Scrapy 1.1.0rc1 started (bot: teacherCourse)
2016-07-19 17:39:18 [scrapy] INFO: Overridden settings: {'BOT_NAME': 'teacherCourse', 'NEWSPIDER_MODULE': 'teacherCourse.spiders', 'ROBOTSTXT_OBEY': True, 'SPIDER_MODULES': ['teacherCourse.spiders']}
2016-07-19 17:39:18 [scrapy] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats', 'scrapy.extensions.logstats.LogStats']
2016-07-19 17:39:18 [scrapy] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware',
'scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
'scrapy.downloadermiddlewares.retry.RetryMiddleware',
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
'scrapy.downloadermiddlewares.chunked.ChunkedTransferMiddleware',
'scrapy.downloadermiddlewares.stats.DownloaderStats']
2016-07-19 17:39:18 [scrapy] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
'scrapy.spidermiddlewares.referer.RefererMiddleware',
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
'scrapy.spidermiddlewares.depth.DepthMiddleware']
2016-07-19 17:39:18 [scrapy] INFO: Enabled item pipelines:
['teacherCourse.pipelines.TeacherCoursePipeline']
2016-07-19 17:39:18 [scrapy] INFO: Spider opened
2016-07-19 17:39:18 [scrapy] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2016-07-19 17:39:18 [scrapy] DEBUG: Crawled (404) <GET http://jwxt.dgut.edu.cn/robots.txt> (referer: None)
2016-07-19 17:39:18 [scrapy] DEBUG: Crawled (200) <GET http://jwxt.dgut.edu.cn/jwweb/ZNPK/TeacherKBFB.aspx> (referer: None)
2016-07-19 17:39:19 [scrapy] DEBUG: Crawled (200) <GET http://jwxt.dgut.edu.cn/jwweb/sys/ValidateCode.aspx> (referer: http://jwxt.dgut.edu.cn/jwweb/ZNPK/TeacherKBFB.aspx)
2016-07-19 17:39:19 [scrapy] DEBUG: Crawled (200) <POST http://jwxt.dgut.edu.cn/jwweb/ZNPK/TeacherKBFB_rpt.aspx> (referer: http://jwxt.dgut.edu.cn/jwweb/ZNPK/TeacherKBFB.aspx)
2016-07-19 17:39:19 [scrapy] INFO: Closing spider (finished)
2016-07-19 17:39:19 [scrapy] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 1330,
'downloader/request_count': 4,
'downloader/request_method_count/GET': 3,
'downloader/request_method_count/POST': 1,
'downloader/response_bytes': 230886,
'downloader/response_count': 4,
'downloader/response_status_count/200': 3,
'downloader/response_status_count/404': 1,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2016, 7, 19, 9, 39, 19, 861620),
'log_count/DEBUG': 4,
'log_count/INFO': 7,
'request_depth_max': 2,
'response_received_count': 4,
'scheduler/dequeued': 3,
'scheduler/dequeued/memory': 3,
'scheduler/enqueued': 3,
'scheduler/enqueued/memory': 3,
'start_time': datetime.datetime(2016, 7, 19, 9, 39, 18, 774293)}
2016-07-19 17:39:19 [scrapy] INFO: Spider closed (finished)
问题似乎在于parse
方法仅产生scrapy.Request
对象, scrapy.Item
产生scrapy.Item
实例。
else:
分支调用生成器parseData(body)
但不使用它可以生成的数据(即containItem
对象)。
解决此问题的一种方法是循环生成器结果,并一一生成它们:
def parse(self, response):
body = response.body.decode('gbk')
num = body.find('alert')
if num != -1:
# means CAPTCHA validation fails, need to re-request the CAPTCHA
yield scrapy.Request(self.vcodeUrl+'?t='+'%.f' % (datetime.now().microsecond / 1000),
headers={
'Referer': 'http://jwxt.dgut.edu.cn/jwweb/ZNPK/TeacherKBFB.aspx',
'Cookie': self.findSessionId[0]+'='+self.findSessionId[1]
},
callback=self.getAndHandleYzm)
else:
# parse data
for i in self.parseData(body):
yield i
# item = containItem()
# item['first'] = len(body)
# return item
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.