
Scrapy Spider error processing

I've been using Scrapy for over a year now with a script that someone else wrote for me. It worked great until 6-8 weeks ago, when it started giving me the following error whenever I try to download. Does anyone have any ideas?

I am running this on Ubuntu 14.04 LTS.

Command: scrapy crawl googleplay

2015-08-30 13:10:31-0400 [googleplay] ERROR: Spider error processing <GET https://accounts.google.com/ServiceLogin?continue=https%3A%2F%2Fplay.google.com%2Fstore%2Fapps%2Fcategory%2FGAME&followup=https%3A%2F%2Fplay.google.com%2Fstore%2Fapps%2Fcategory%2FGAME&passive=1209600&service=googleplay>
    Traceback (most recent call last):
      File "/usr/lib/python2.7/dist-packages/twisted/internet/base.py", line 800, in runUntilCurrent
        call.func(*call.args, **call.kw)
      File "/usr/lib/python2.7/dist-packages/twisted/internet/task.py", line 595, in _tick
        taskObj._oneWorkUnit()
      File "/usr/lib/python2.7/dist-packages/twisted/internet/task.py", line 472, in _oneWorkUnit
        result = self._iterator.next()
      File "/usr/lib/pymodules/python2.7/scrapy/utils/defer.py", line 57, in <genexpr>
        work = (callable(elem, *args, **named) for elem in iterable)
    --- <exception caught here> ---
      File "/usr/lib/pymodules/python2.7/scrapy/utils/defer.py", line 96, in iter_errback
        yield next(it)
      File "/usr/lib/pymodules/python2.7/scrapy/contrib/spidermiddleware/offsite.py", line 23, in process_spider_output
        for x in result:
      File "/usr/lib/pymodules/python2.7/scrapy/contrib/spidermiddleware/referer.py", line 22, in <genexpr>
        return (_set_referer(r) for r in result or ())
      File "/usr/lib/pymodules/python2.7/scrapy/contrib/spidermiddleware/urllength.py", line 33, in <genexpr>
        return (r for r in result or () if _filter(r))
      File "/usr/lib/pymodules/python2.7/scrapy/contrib/spidermiddleware/depth.py", line 50, in <genexpr>
        return (r for r in result or () if _filter(r))
      File "/usr/lib/pymodules/python2.7/scrapy/contrib/spiders/crawl.py", line 73, in _parse_response
        for request_or_item in self._requests_to_follow(response):
      File "/usr/lib/pymodules/python2.7/scrapy/contrib/spiders/crawl.py", line 52, in _requests_to_follow
        links = [l for l in rule.link_extractor.extract_links(response) if l not in seen]
      File "/usr/lib/pymodules/python2.7/scrapy/contrib/linkextractors/sgml.py", line 129, in extract_links
        links = self._extract_links(body, response.url, response.encoding, base_url)
      File "/usr/lib/pymodules/python2.7/scrapy/contrib/linkextractors/sgml.py", line 29, in _extract_links
        self.feed(response_text)
      File "/usr/lib/python2.7/sgmllib.py", line 104, in feed
        self.goahead(0)
      File "/usr/lib/python2.7/sgmllib.py", line 174, in goahead
        k = self.parse_declaration(i)
      File "/usr/lib/python2.7/markupbase.py", line 98, in parse_declaration
        decltype, j = self._scan_name(j, i)
      File "/usr/lib/python2.7/markupbase.py", line 392, in _scan_name
        % rawdata[declstartpos:declstartpos+20])
      File "/usr/lib/python2.7/sgmllib.py", line 111, in error
        raise SGMLParseError(message)
    sgmllib.SGMLParseError: expected name token at '<!\\\\])/g,"\\\\$1").rep'

Here is my GooglePlay spider (after updating it), along with the error message I am now receiving:

import re
import string
import requests
from scrapy import log
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.htmlparser import HtmlParserLinkExtractor
from scrapy.selector import Selector
from scrapy.http import Request
from scraper.items import ApkItem
from play import parse_app

class GooglePlaySpider(CrawlSpider):
        name = 'googleplay'
        start_urls = [
                'https://play.google.com/store/apps'
        ]
        rules = (
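                # NOTE: these rules still reference SgmlLinkExtractor, but only
                # HtmlParserLinkExtractor is imported above -- that mismatch is
                # what raises the NameError in the traceback below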
                Rule(SgmlLinkExtractor(allow=('/store/apps$', )), callback='parse_category_group', follow=True),
                Rule(SgmlLinkExtractor(allow=('/store/apps/category/.*', )), callback='parse_category', follow=True),
                Rule(SgmlLinkExtractor(allow=('/store/search\?.*', )), callback='parse_search', follow=True),
        )

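        # Walks the category groups on the apps home page, then fans out
        # brute-force search requests for every 1-3 character query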
        def parse_category_group(self, response):
                sel = Selector(response)
                category_groups = sel.xpath('//div[@class="padded-content3 app-home-nav"]')

                for category_group in category_groups:

                        category_group_name = category_group.xpath('h2/a/text()').extract()

                        categories = category_group.xpath('ul/li')
                        for category in categories:
                                category_name = category.xpath('a/text()').extract()
                                category_url = category.xpath('a/@href').extract()[0]

                chars = string.ascii_uppercase + string.digits
                for x in chars:
                        yield Request('https://play.google.com/store/search?q=' + x + '&c=apps', callback=self.parse_search)

                for x in chars:
                        for y in chars:
                                yield Request('https://play.google.com/store/search?q=' + x + y + '&c=apps', callback=self.parse_search)

                for x in chars:
                        for y in chars:
                                for z in chars:
                                        yield Request('https://play.google.com/store/search?q=' + x + y + z + '&c=apps', callback=self.parse_search)        

                return

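        # Follows collection pages: one request per app found, plus the next
        # page of 24 results for as long as results keep coming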
        def parse_category(self, response):
                base_path = response.url.split('?')[0]  

                if '/collection/' in response.url:
                        sel = Selector(response)
                        apps = sel.xpath('//a[@class="title"]')
                        has_app = False

                        for app in apps:
                                has_app = True
                                app_name = app.xpath('text()').extract()
                                app_url = app.xpath('@href').extract()
                                yield Request('https://play.google.com' + app_url[0], meta={'come_from': self.name}, callback=parse_app)

                        if has_app:
                                m = re.match(r'(.*)\?start=(\d+)&num=24', response.url)
                                if m is None:
                                        start_number = 24                  
                                else:
                                        start_number = int(m.group(2)) + 24
                                yield Request(base_path + '?start=' + str(start_number) + '&num=24', callback=self.parse_category)

                return

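        # Pages through search results 24 at a time, yielding one request per
        # app link found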
        def parse_search(self, response):
                m = re.match(r'(.*)&start=(\d+)&num=24', response.url)
                if m is None:
                        base_path = response.url
                        start_number = 24                  
                else:
                        start_number = int(m.group(2)) + 24
                        base_path = m.group(1)

                sel = Selector(response)
                apps = sel.xpath('//a[contains(@href,"/store/apps/details")]')
                has_app = False

                for app in apps:
                        has_app = True
                        app_url = app.xpath('@href').extract()
                        yield Request('https://play.google.com' + app_url[0], meta={'come_from': self.name}, callback=parse_app)

                if has_app:
                        yield Request(base_path + '&start=' + str(start_number) + '&num=24', callback=self.parse_search)

                return

**** Error ****

Traceback (most recent call last):
  File "/usr/bin/scrapy", line 4, in <module>
    execute()
  File "/usr/lib/pymodules/python2.7/scrapy/cmdline.py", line 143, in execute
    _run_print_help(parser, _run_command, cmd, args, opts)
  File "/usr/lib/pymodules/python2.7/scrapy/cmdline.py", line 89, in _run_print_help
    func(*a, **kw)
  File "/usr/lib/pymodules/python2.7/scrapy/cmdline.py", line 150, in _run_command
    cmd.run(args, opts)
  File "/usr/lib/pymodules/python2.7/scrapy/commands/crawl.py", line 47, in run
    crawler = self.crawler_process.create_crawler()
  File "/usr/lib/pymodules/python2.7/scrapy/crawler.py", line 87, in create_crawler
    self.crawlers[name] = Crawler(self.settings)
  File "/usr/lib/pymodules/python2.7/scrapy/crawler.py", line 25, in __init__
    self.spiders = spman_cls.from_crawler(self)
  File "/usr/lib/pymodules/python2.7/scrapy/spidermanager.py", line 35, in from_crawler
    sm = cls.from_settings(crawler.settings)
  File "/usr/lib/pymodules/python2.7/scrapy/spidermanager.py", line 31, in from_settings
    return cls(settings.getlist('SPIDER_MODULES'))
  File "/usr/lib/pymodules/python2.7/scrapy/spidermanager.py", line 22, in __init__
    for module in walk_modules(name):
  File "/usr/lib/pymodules/python2.7/scrapy/utils/misc.py", line 68, in walk_modules
    submod = import_module(fullpath)
  File "/usr/lib/python2.7/importlib/__init__.py", line 37, in import_module
    __import__(name)
  File "/home/darwin/ProjectKrutz/scraper/scraper/spiders/googlePlaySpider.py", line 12, in <module>
    class GooglePlaySpider(CrawlSpider):
  File "/home/darwin/ProjectKrutz/scraper/scraper/spiders/googlePlaySpider.py", line 18, in GooglePlaySpider
    Rule(SgmlLinkExtractor(allow=('/store/apps$', )), callback='parse_category_group', follow=True),
NameError: name 'SgmlLinkExtractor' is not defined

The problem is that SgmlLinkExtractor has problems with comments and declarations. The error message tells you where it choked: the parser hit a <! sequence inside the page's inline JavaScript, took it for the start of a declaration, and then failed because no valid declaration name followed.
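You can reproduce the failure outside Scrapy with a minimal sketch (Python 2 only, since sgmllib was removed in Python 3); the sample markup below is made up for illustration, but it mimics the inline JavaScript shown in the error message:

import sgmllib

parser = sgmllib.SGMLParser()
try:
    # '<!' followed by a non-name character: sgmllib takes it for the start
    # of a declaration and errors out while scanning for the declaration name
    parser.feed('<html><script>s.replace(/([<!\\\\])/g, "\\\\$1");</script></html>')
except sgmllib.SGMLParseError as e:
    print 'parse failed:', e  # expected name token at '<!\\\\])/g, ...'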

So the solution would be to change your spider and replace the SgmlLinkExtractor with either

from scrapy.contrib.linkextractors.htmlparser import HtmlParserLinkExtractor

or

from scrapy.contrib.linkextractors.lxmlhtml import LxmlLinkExtractor

Naturally these are only the import statements; you also have to change the Rules where the link extractor is used so that they use one of these extractors, as sketched below.

Without seeing the code I cannot give more specific advice on which parts to change.
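For the spider posted above, a minimal sketch of the change, assuming the lxml-based extractor: only the extractor class in the rules changes, which also clears up the NameError, since the imported name and the name used in the rules now match.

from scrapy.contrib.linkextractors.lxmlhtml import LxmlLinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule

class GooglePlaySpider(CrawlSpider):
        name = 'googleplay'
        start_urls = ['https://play.google.com/store/apps']
        # Same allow patterns as before; only the extractor class changes
        rules = (
                Rule(LxmlLinkExtractor(allow=('/store/apps$', )), callback='parse_category_group', follow=True),
                Rule(LxmlLinkExtractor(allow=('/store/apps/category/.*', )), callback='parse_category', follow=True),
                Rule(LxmlLinkExtractor(allow=('/store/search\?.*', )), callback='parse_search', follow=True),
        )

The HtmlParserLinkExtractor variant works the same way; lxml is generally the faster and more forgiving of the two parsers.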
