I'm trying to run a spider from my website while a ScrapyRT server is listening on my desktop. When the spider runs, ScrapyRT reports that my module 'webscrape' cannot be found, and it also raises "'int' object has no attribute 'splitlines'".
https://github.com/scrapy/scrapyd/issues/311 provides a solution for scrapyd. https://github.com/scrapinghub/scrapyrt/pull/84 appears to still be an issue.
So, I'm really at a loss here.
error code:
2019-08-12 16:37:47-0700 [scrapyrt] Unhandled Error
Traceback (most recent call last):
File "c:\users\user\microblog\job-visualizer\venv\lib\site-packages\twisted\web\http.py", line 2196, in allContentReceived
req.requestReceived(command, path, version)
File "c:\users\user\microblog\job-visualizer\venv\lib\site-packages\twisted\web\http.py", line 920, in requestReceived
self.process()
File "c:\users\user\microblog\job-visualizer\venv\lib\site-packages\twisted\web\server.py", line 199, in process
self.render(resrc)
File "c:\users\user\microblog\job-visualizer\venv\lib\site-packages\twisted\web\server.py", line 259, in render
body = resrc.render(self)
--- <exception caught here> ---
File "c:\users\user\microblog\job-visualizer\venv\lib\site-packages\scrapyrt\resources.py", line 26, in render
result = resource.Resource.render(self, request)
File "c:\users\user\microblog\job-visualizer\venv\lib\site-packages\twisted\web\resource.py", line 250, in render
return m(request)
File "c:\users\user\microblog\job-visualizer\venv\lib\site-packages\scrapyrt\resources.py", line 127, in render_GET
return self.prepare_crawl(api_params, scrapy_request_args, **kwargs)
File "c:\users\user\microblog\job-visualizer\venv\lib\site-packages\scrapyrt\resources.py", line 217, in prepare_crawl
start_requests=start_requests, *args, **kwargs)
File "c:\users\user\microblog\job-visualizer\venv\lib\site-packages\scrapyrt\resources.py", line 226, in run_crawl
dfd = manager.crawl(*args, **kwargs)
File "c:\users\user\microblog\job-visualizer\venv\lib\site-packages\scrapyrt\core.py", line 157, in crawl
self.get_project_settings(), self)
File "c:\users\user\microblog\job-visualizer\venv\lib\site-packages\scrapyrt\core.py", line 178, in get_project_settings
return get_project_settings(custom_settings=custom_settings)
File "c:\users\user\microblog\job-visualizer\venv\lib\site-packages\scrapyrt\conf\spider_settings.py", line 27, in get_project_settings
crawler_settings.setmodule(module, priority='project')
File "c:\users\user\microblog\job-visualizer\venv\lib\site-packages\scrapy\settings\__init__.py", line 288, in setmodule
module = import_module(module)
File "C:\Users\user\AppData\Local\Programs\Python\Python37-32\lib\importlib\__init__.py", line 127, in import_module
return _bootstrap._gcd_import(name[level:], package, level)
File "<frozen importlib._bootstrap>", line 1006, in _gcd_import
File "<frozen importlib._bootstrap>", line 983, in _find_and_load
File "<frozen importlib._bootstrap>", line 953, in _find_and_load_unlocked
File "<frozen importlib._bootstrap>", line 219, in _call_with_frames_removed
File "<frozen importlib._bootstrap>", line 1006, in _gcd_import
File "<frozen importlib._bootstrap>", line 983, in _find_and_load
File "<frozen importlib._bootstrap>", line 965, in _find_and_load_unlocked
builtins.ModuleNotFoundError: No module named 'webscrape'
Unhandled Error
Traceback (most recent call last):
File "c:\users\user\microblog\job-visualizer\venv\lib\site-packages\twisted\web\http.py", line 2196, in allContentReceived
req.requestReceived(command, path, version)
File "c:\users\user\microblog\job-visualizer\venv\lib\site-packages\twisted\web\http.py", line 920, in requestReceived
self.process()
File "c:\users\user\microblog\job-visualizer\venv\lib\site-packages\twisted\web\server.py", line 199, in process
self.render(resrc)
File "c:\users\user\microblog\job-visualizer\venv\lib\site-packages\twisted\web\server.py", line 259, in render
body = resrc.render(self)
--- <exception caught here> ---
File "c:\users\user\microblog\job-visualizer\venv\lib\site-packages\scrapyrt\resources.py", line 26, in render
result = resource.Resource.render(self, request)
File "c:\users\user\microblog\job-visualizer\venv\lib\site-packages\twisted\web\resource.py", line 250, in render
return m(request)
File "c:\users\user\microblog\job-visualizer\venv\lib\site-packages\scrapyrt\resources.py", line 127, in render_GET
return self.prepare_crawl(api_params, scrapy_request_args, **kwargs)
File "c:\users\user\microblog\job-visualizer\venv\lib\site-packages\scrapyrt\resources.py", line 217, in prepare_crawl
start_requests=start_requests, *args, **kwargs)
File "c:\users\user\microblog\job-visualizer\venv\lib\site-packages\scrapyrt\resources.py", line 226, in run_crawl
dfd = manager.crawl(*args, **kwargs)
File "c:\users\user\microblog\job-visualizer\venv\lib\site-packages\scrapyrt\core.py", line 157, in crawl
self.get_project_settings(), self)
File "c:\users\user\microblog\job-visualizer\venv\lib\site-packages\scrapyrt\core.py", line 178, in get_project_settings
return get_project_settings(custom_settings=custom_settings)
File "c:\users\user\microblog\job-visualizer\venv\lib\site-packages\scrapyrt\conf\spider_settings.py", line 27, in get_project_settings
crawler_settings.setmodule(module, priority='project')
File "c:\users\user\microblog\job-visualizer\venv\lib\site-packages\scrapy\settings\__init__.py", line 288, in setmodule
module = import_module(module)
File "C:\Users\user\AppData\Local\Programs\Python\Python37-32\lib\importlib\__init__.py", line 127, in import_module
return _bootstrap._gcd_import(name[level:], package, level)
File "<frozen importlib._bootstrap>", line 1006, in _gcd_import
File "<frozen importlib._bootstrap>", line 983, in _find_and_load
File "<frozen importlib._bootstrap>", line 953, in _find_and_load_unlocked
File "<frozen importlib._bootstrap>", line 219, in _call_with_frames_removed
File "<frozen importlib._bootstrap>", line 1006, in _gcd_import
File "<frozen importlib._bootstrap>", line 983, in _find_and_load
File "<frozen importlib._bootstrap>", line 965, in _find_and_load_unlocked
builtins.ModuleNotFoundError: No module named 'webscrape'
2019-08-12 16:37:47-0700 [-] Unhandled Error
Traceback (most recent call last):
File "c:\users\user\microblog\job-visualizer\venv\lib\site-packages\twisted\protocols\basic.py", line 572, in dataReceived
why = self.lineReceived(line)
File "c:\users\user\microblog\job-visualizer\venv\lib\site-packages\twisted\web\http.py", line 2105, in lineReceived
self.allContentReceived()
File "c:\users\user\microblog\job-visualizer\venv\lib\site-packages\twisted\web\http.py", line 2196, in allContentReceived
req.requestReceived(command, path, version)
File "c:\users\user\microblog\job-visualizer\venv\lib\site-packages\twisted\web\http.py", line 920, in requestReceived
self.process()
--- <exception caught here> ---
File "c:\users\user\microblog\job-visualizer\venv\lib\site-packages\twisted\web\server.py", line 199, in process
self.render(resrc)
File "c:\users\user\microblog\job-visualizer\venv\lib\site-packages\twisted\web\server.py", line 259, in render
body = resrc.render(self)
File "c:\users\user\microblog\job-visualizer\venv\lib\site-packages\scrapyrt\resources.py", line 31, in render
return self.render_object(result, request)
File "c:\users\user\microblog\job-visualizer\venv\lib\site-packages\scrapyrt\resources.py", line 95, in render_object
request.setHeader('Content-Length', len(r))
File "c:\users\user\microblog\job-visualizer\venv\lib\site-packages\twisted\web\http.py", line 1271, in setHeader
self.responseHeaders.setRawHeaders(name, [value])
File "c:\users\user\microblog\job-visualizer\venv\lib\site-packages\twisted\web\http_headers.py", line 220, in setRawHeaders
for v in self._encodeValues(values)]
File "c:\users\user\microblog\job-visualizer\venv\lib\site-packages\twisted\web\http_headers.py", line 220, in <listcomp>
for v in self._encodeValues(values)]
File "c:\users\user\microblog\job-visualizer\venv\lib\site-packages\twisted\web\http_headers.py", line 40, in _sanitizeLinearWhitespace
return b' '.join(headerComponent.splitlines())
builtins.AttributeError: 'int' object has no attribute 'splitlines'
Traceback (most recent call last):
File "c:\users\user\microblog\job-visualizer\venv\lib\site-packages\twisted\web\server.py", line 199, in process
self.render(resrc)
File "c:\users\user\microblog\job-visualizer\venv\lib\site-packages\twisted\web\server.py", line 259, in render
body = resrc.render(self)
File "c:\users\user\microblog\job-visualizer\venv\lib\site-packages\scrapyrt\resources.py", line 31, in render
return self.render_object(result, request)
File "c:\users\user\microblog\job-visualizer\venv\lib\site-packages\scrapyrt\resources.py", line 95, in render_object
request.setHeader('Content-Length', len(r))
File "c:\users\user\microblog\job-visualizer\venv\lib\site-packages\twisted\web\http.py", line 1271, in setHeader
self.responseHeaders.setRawHeaders(name, [value])
File "c:\users\user\microblog\job-visualizer\venv\lib\site-packages\twisted\web\http_headers.py", line 220, in setRawHeaders
for v in self._encodeValues(values)]
File "c:\users\user\microblog\job-visualizer\venv\lib\site-packages\twisted\web\http_headers.py", line 220, in <listcomp>
for v in self._encodeValues(values)]
File "c:\users\user\microblog\job-visualizer\venv\lib\site-packages\twisted\web\http_headers.py", line 40, in _sanitizeLinearWhitespace
return b' '.join(headerComponent.splitlines())
AttributeError: 'int' object has no attribute 'splitlines'
Project Layout:
-Job-Visualizer
  -app
    -webscrape (scrapyrt was run from here, inside the venv)
      -spiders
Expected behavior: running the spider should return the scraped results.
Edit: Spider Code:
import scrapy
from scrapy_splash import SplashRequest
class IndeedSpider(scrapy.Spider):
    """Spider that fetches an Indeed job-search page through Splash.

    The start URL is requested via ``SplashRequest`` so the page is rendered
    before ``parse`` runs its selectors against the response.
    """

    name = 'indeedspider'
    # allowed_domains expects bare domain names; a full URL (with scheme) is
    # not a valid entry.
    allowed_domains = ['www.indeed.com']

    # Splash rendering arguments shared by every request this spider builds.
    _SPLASH_ARGS = {
        'html': 1,
        'png': 1,
        'width': 800,
        'render_all': 1,
    }

    def __init__(self, *args, **kwargs):
        """Set up the search URL; extra arguments are forwarded to Spider."""
        # Forward arguments so spider arguments passed by the crawler do not
        # raise TypeError.
        super().__init__(*args, **kwargs)
        print('Spider being ran...')
        self.start_url = 'https://www.indeed.com/jobs?q=financial+aid+advisor&l=Highland%2C+CA'
        self.links = []

    def modify_realtime_request(self, request):
        """Return a Splash-rendered replacement for ``request``.

        Uses the incoming request's URL; the previous version referenced
        undefined names (``url``, ``splash_args``) and raised NameError.
        """
        return SplashRequest(
            request.url,
            self.parse,
            args=dict(self._SPLASH_ARGS),
            endpoint='render.html',
        )

    def start_requests(self):
        """Yield one Splash request for each start URL."""
        print(self.start_url)
        urls = [
            self.start_url,
        ]
        for url in urls:
            yield SplashRequest(
                url,
                self.parse,
                endpoint='render.json',
                args=dict(self._SPLASH_ARGS),
            )

    def parse(self, response):
        """Extract job titles, locations, companies and summaries.

        NOTE(review): the extracted values are never yielded or returned, so
        a crawl reports no items. Yield them once the item shape is decided.
        """
        html = response.body
        title = response.css('title').extract()
        titles = response.xpath("//div[@class= 'title']/a/text()").getall()
        locations = response.xpath("//div[@class= 'sjcl']/span/text()").getall()
        # NOTE(review): "div.sjcl.span.company" treats "span" as a class name;
        # possibly "div.sjcl span.company a::text" was intended -- confirm
        # against the page markup.
        companies = response.css("div.sjcl.span.company a::text").getall()
        summarys = response.xpath("//div[@class= 'summary']/text()").getall()
Route Portion Code:
# Ask the ScrapyRT server listening on localhost:9080 to run the spider and
# return the crawl result as JSON.
params = {
    # Must match the spider's ``name`` attribute ('indeedspider'), otherwise
    # ScrapyRT cannot find the spider.
    'spider_name': 'indeedspider',
    # Use the spider's own start_requests() instead of supplying a URL.
    'start_requests': True,
}
response = requests.get('http://localhost:9080/crawl.json', params=params)
data = response.json()
print(data)
Solution: When creating a Scrapy project, make sure that scrapy.cfg sits in the directory that contains the project's Python package (here `webscrape`), not inside that package.
Incorrect:
-app
  - webscrape
    - scrapy.cfg
    - __init__.py
    - items.py
    - middleware.py
    - spiders
      - spider.py
Correct:
-app
  - scrapy.cfg
  - webscrape
    - __init__.py
    - items.py
    - middleware.py
    - spiders
      - spider.py
Correct Result:
{"status": "ok", "items": [], "spider_name": "indeedspider"}
Have you imported the module webscrape? Also, you are calling splitlines on the wrong type of object, which is why there is no splitlines attribute. If you print the object's type, does it show as an int? The splitlines method only works on strings, so you need to make sure the object you call it on is a string and not an int.
The technical post webpages of this site follow the CC BY-SA 4.0 license. If you need to reprint, please indicate the site URL or the original address. For any questions, please contact: yoyou2525@163.com.