I've been using scrapy for over a year now with a script that someone else wrote for me. It was working great for over a year, until 6-8 weeks ago when it started giving me the following error whenever I try to download. Does anyone have any ideas?
I am running this on Ubuntu 14.04 LTS.
Command: scrapy crawl googleplay
2015-08-30 13:10:31-0400 [googleplay] ERROR: Spider error processing <GET https://accounts.google.com/ServiceLogin?continue=https%3A%2F%2Fplay.google.com%2Fstore%2Fapps%2Fcategory%2FGAME&followup=https%3A%2F%2Fplay.google.com%2Fstore%2Fapps%2Fcategory%2FGAME&passive=1209600&service=googleplay>
Traceback (most recent call last):
File "/usr/lib/python2.7/dist-packages/twisted/internet/base.py", line 800, in runUntilCurrent
call.func(*call.args, **call.kw)
File "/usr/lib/python2.7/dist-packages/twisted/internet/task.py", line 595, in _tick
taskObj._oneWorkUnit()
File "/usr/lib/python2.7/dist-packages/twisted/internet/task.py", line 472, in _oneWorkUnit
result = self._iterator.next()
File "/usr/lib/pymodules/python2.7/scrapy/utils/defer.py", line 57, in <genexpr>
work = (callable(elem, *args, **named) for elem in iterable)
--- <exception caught here> ---
File "/usr/lib/pymodules/python2.7/scrapy/utils/defer.py", line 96, in iter_errback
yield next(it)
File "/usr/lib/pymodules/python2.7/scrapy/contrib/spidermiddleware/offsite.py", line 23, in process_spider_output
for x in result:
File "/usr/lib/pymodules/python2.7/scrapy/contrib/spidermiddleware/referer.py", line 22, in <genexpr>
return (_set_referer(r) for r in result or ())
File "/usr/lib/pymodules/python2.7/scrapy/contrib/spidermiddleware/urllength.py", line 33, in <genexpr>
return (r for r in result or () if _filter(r))
File "/usr/lib/pymodules/python2.7/scrapy/contrib/spidermiddleware/depth.py", line 50, in <genexpr>
return (r for r in result or () if _filter(r))
File "/usr/lib/pymodules/python2.7/scrapy/contrib/spiders/crawl.py", line 73, in _parse_response
for request_or_item in self._requests_to_follow(response):
File "/usr/lib/pymodules/python2.7/scrapy/contrib/spiders/crawl.py", line 52, in _requests_to_follow
links = [l for l in rule.link_extractor.extract_links(response) if l not in seen]
File "/usr/lib/pymodules/python2.7/scrapy/contrib/linkextractors/sgml.py", line 129, in extract_links
links = self._extract_links(body, response.url, response.encoding, base_url)
File "/usr/lib/pymodules/python2.7/scrapy/contrib/linkextractors/sgml.py", line 29, in _extract_links
self.feed(response_text)
File "/usr/lib/python2.7/sgmllib.py", line 104, in feed
self.goahead(0)
File "/usr/lib/python2.7/sgmllib.py", line 174, in goahead
k = self.parse_declaration(i)
File "/usr/lib/python2.7/markupbase.py", line 98, in parse_declaration
decltype, j = self._scan_name(j, i)
File "/usr/lib/python2.7/markupbase.py", line 392, in _scan_name
% rawdata[declstartpos:declstartpos+20])
File "/usr/lib/python2.7/sgmllib.py", line 111, in error
raise SGMLParseError(message)
sgmllib.SGMLParseError: expected name token at '<!\\\\])/g,"\\\\$1").rep'
here is my GooglePlay spider (after update) along with the error message I am now receiving
import itertools
import re
import string

import requests

from scrapy import log
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.htmlparser import HtmlParserLinkExtractor
from scrapy.selector import Selector
from scrapy.http import Request

from scraper.items import ApkItem
from play import parse_app
class GooglePlaySpider(CrawlSpider):
    """Crawl the Google Play store.

    Follows category, category-group and search pages, yields one request
    per app detail page (handled by ``play.parse_app``), and paginates
    search/collection listings 24 results at a time.
    """

    name = 'googleplay'
    start_urls = [
        'https://play.google.com/store/apps'
    ]
    # Use HtmlParserLinkExtractor (imported above) instead of
    # SgmlLinkExtractor: the latter was never imported here (NameError in
    # the reported traceback) and its sgmllib backend chokes on the
    # comment-like "<!" tokens in Google Play's inline JavaScript.
    rules = (
        Rule(HtmlParserLinkExtractor(allow=('/store/apps$',)),
             callback='parse_category_group', follow=True),
        Rule(HtmlParserLinkExtractor(allow=('/store/apps/category/.*',)),
             callback='parse_category', follow=True),
        Rule(HtmlParserLinkExtractor(allow=(r'/store/search\?.*',)),
             callback='parse_search', follow=True),
    )

    def parse_category_group(self, response):
        """Walk the category navigation, then seed search requests for every
        1-, 2- and 3-character query over A-Z and 0-9.

        Yields ``Request`` objects dispatched to :meth:`parse_search`.
        """
        sel = Selector(response)
        category_groups = sel.xpath('//div[@class="padded-content3 app-home-nav"]')
        for category_group in category_groups:
            # NOTE(review): the extracted names/URLs are currently unused;
            # kept to preserve the original behavior (extract() side effects
            # are nil, but an IndexError on a href-less <a> would still raise).
            category_group_name = category_group.xpath('h2/a/text()').extract()
            for category in category_group.xpath('ul/li'):
                category_name = category.xpath('a/text()').extract()
                category_url = category.xpath('a/@href').extract()[0]

        chars = string.ascii_uppercase + string.digits
        # Same enumeration order as three hand-written nested loops:
        # all 1-char queries, then all 2-char, then all 3-char.
        for length in (1, 2, 3):
            for combo in itertools.product(chars, repeat=length):
                query = ''.join(combo)
                yield Request('https://play.google.com/store/search?q=' + query + '&c=apps',
                              callback=self.parse_search)
        return

    def parse_category(self, response):
        """Parse a category page; for collection pages, yield one request per
        app plus a request for the next page of 24 results (only while the
        current page still contains apps)."""
        base_path = response.url.split('?')[0]
        if '/collection/' in response.url:
            sel = Selector(response)
            apps = sel.xpath('//a[@class="title"]')
            has_app = False
            for app in apps:
                has_app = True
                app_name = app.xpath('text()').extract()
                app_url = app.xpath('@href').extract()
                yield Request('https://play.google.com' + app_url[0],
                              meta={'come_from': self.name}, callback=parse_app)
            if has_app:
                # Requires a module-level ``import re`` (added at top of file).
                m = re.match(r'(.*)\?start=(\d+)&num=24', response.url)
                if m is None:
                    start_number = 24
                else:
                    start_number = int(m.group(2)) + 24
                yield Request(base_path + '?start=' + str(start_number) + '&num=24',
                              callback=self.parse_category)
        return

    def parse_search(self, response):
        """Parse a search-results page: yield one request per app detail link
        and, while results keep appearing, a request for the next page."""
        m = re.match(r'(.*)&start=(\d+)&num=24', response.url)
        if m is None:
            base_path = response.url
            start_number = 24
        else:
            start_number = int(m.group(2)) + 24
            base_path = m.group(1)
        sel = Selector(response)
        apps = sel.xpath('//a[contains(@href,"/store/apps/details")]')
        has_app = False
        for app in apps:
            has_app = True
            app_url = app.xpath('@href').extract()
            yield Request('https://play.google.com' + app_url[0],
                          meta={'come_from': self.name}, callback=parse_app)
        if has_app:
            yield Request(base_path + '&start=' + str(start_number) + '&num=24',
                          callback=self.parse_search)
        return
**** Error ****Traceback (most recent call last):
File "/usr/bin/scrapy", line 4, in <module>
execute()
File "/usr/lib/pymodules/python2.7/scrapy/cmdline.py", line 143, in execute
_run_print_help(parser, _run_command, cmd, args, opts)
File "/usr/lib/pymodules/python2.7/scrapy/cmdline.py", line 89, in _run_print_help
func(*a, **kw)
File "/usr/lib/pymodules/python2.7/scrapy/cmdline.py", line 150, in _run_command
cmd.run(args, opts)
File "/usr/lib/pymodules/python2.7/scrapy/commands/crawl.py", line 47, in run
crawler = self.crawler_process.create_crawler()
File "/usr/lib/pymodules/python2.7/scrapy/crawler.py", line 87, in create_crawler
self.crawlers[name] = Crawler(self.settings)
File "/usr/lib/pymodules/python2.7/scrapy/crawler.py", line 25, in __init__
self.spiders = spman_cls.from_crawler(self)
File "/usr/lib/pymodules/python2.7/scrapy/spidermanager.py", line 35, in from_crawler
sm = cls.from_settings(crawler.settings)
File "/usr/lib/pymodules/python2.7/scrapy/spidermanager.py", line 31, in from_settings
return cls(settings.getlist('SPIDER_MODULES'))
File "/usr/lib/pymodules/python2.7/scrapy/spidermanager.py", line 22, in __init__
for module in walk_modules(name):
File "/usr/lib/pymodules/python2.7/scrapy/utils/misc.py", line 68, in walk_modules
submod = import_module(fullpath)
File "/usr/lib/python2.7/importlib/__init__.py", line 37, in import_module
__import__(name)
File "/home/darwin/ProjectKrutz/scraper/scraper/spiders/googlePlaySpider.py", line 12, in <module>
class GooglePlaySpider(CrawlSpider):
File "/home/darwin/ProjectKrutz/scraper/scraper/spiders/googlePlaySpider.py", line 18, in GooglePlaySpider
Rule(SgmlLinkExtractor(allow=('/store/apps$', )), callback='parse_category_group', follow=True),
NameError: name 'SgmlLinkExtractor' is not defined
The problem is that SgmlLinkExtractor (built on Python's sgmllib) has trouble parsing comment-like markup, and the error message shows exactly that: the parser tripped over a token beginning with <!
.
So the solution would be to change the spider you are using and replace the SgmlLinkExtractor with either
from scrapy.contrib.linkextractors.htmlparser import HtmlParserLinkExtractor
or
from scrapy.contrib.linkextractors.lxmlhtml import LxmlLinkExtractor
Naturally, these are only the import statements: you also have to change each Rule in your spider so that it constructs one of these extractors instead of SgmlLinkExtractor — otherwise you will get a NameError, as in your second traceback.
Without the code I cannot give you more advice where to change the parts.
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.