I have a list in my spider class that I need to initialize. This is what the code looks like:
class Myspider(SitemapSpider):
    """Sitemap spider that records listing links so they are handled only once.

    ``crawled`` holds the links seen so far and is periodically pickled to the
    file ``URLs``; ``new_links`` is the counter that triggers that checkpoint.
    """

    name = 'spidername'
    sitemap_urls = [
        'https://www.arabam.com/sitemap/otomobil_13.xml',
    ]
    sitemap_rules = [
        ('/otomobil/', 'parse'),
    ]
    # Export items as CSV to a file named after today's date (ddmmyy).
    custom_settings = {
        'FEED_FORMAT': 'csv',
        'FEED_URI': "arabam_" + datetime.today().strftime('%d%m%y') + '.csv',
    }
    # Class-level defaults; a run can replace ``crawled`` by passing
    # ``crawled=...`` as a spider argument.
    crawled = []
    new_links = 0

    def parse(self, response):
        """Checkpoint the crawled list if due, then record unseen links."""
        # Persist the list once more than three new links have accumulated.
        # NOTE(review): nothing visible here increments ``new_links`` --
        # presumably the elided code below does; confirm.
        if self.new_links > 3:
            with open("URLs", "wb") as f:
                pickle.dump(self.crawled, f)
            self.new_links = 0
        for td in response.xpath("/html/body/div[3]/div[6]/div[4]/div/div[2]/table/tbody/tr/td[4]/div/a"):
            # The original snippet used ``link`` without defining it
            # (NameError). Assumes the URL is the anchor's href -- confirm.
            link = td.xpath("@href").extract()
            # Skip anchors without an href instead of failing on link[0].
            if link and link[0] not in self.crawled:
                self.crawled.append(link[0])
                #################################some code
Traceback (most recent call last):
File "C:\Users\fatima.arshad\AppData\Local\Continuum\anaconda2\envs\web_scraping\lib\site-packages\twisted\internet\defer.py", line 151, in maybeDeferred
result = f(*args, **kw)
File "C:\Users\fatima.arshad\AppData\Local\Continuum\anaconda2\envs\web_scraping\lib\site-packages\pydispatch\robustapply.py", line 55, in robustApply
return receiver(*arguments, **named)
File "C:\Users\fatima.arshad\AppData\Local\Continuum\anaconda2\envs\web_scraping\lib\site-packages\scrapy\extensions\feedexport.py", line 262, in item_scraped
slot = self.slot
AttributeError: 'FeedExporter' object has no attribute 'slot'
It keeps throwing the following exception:
Traceback (most recent call last):
File "C:\Users\fatima.arshad\AppData\Local\Continuum\anaconda2\envs\web_scraping\lib\site-packages\twisted\internet\defer.py", line 151, in maybeDeferred
result = f(*args, **kw)
File "C:\Users\fatima.arshad\AppData\Local\Continuum\anaconda2\envs\web_scraping\lib\site-packages\pydispatch\robustapply.py", line 55, in robustApply
return receiver(*arguments, **named)
File "C:\Users\fatima.arshad\AppData\Local\Continuum\anaconda2\envs\web_scraping\lib\site-packages\scrapy\extensions\feedexport.py", line 232, in open_spider
uri = self.urifmt % self._get_uri_params(spider)
File "C:\Users\fatima.arshad\AppData\Local\Continuum\anaconda2\envs\web_scraping\lib\site-packages\scrapy\extensions\feedexport.py", line 313, in _get_uri_params
params[k] = getattr(spider, k)
File "C:\Users\fatima.arshad\AppData\Local\Continuum\anaconda2\envs\web_scraping\lib\site-packages\scrapy\spiders\__init__.py", line 36, in logger
logger = logging.getLogger(self.name)
File "C:\Users\fatima.arshad\AppData\Local\Continuum\anaconda2\envs\web_scraping\lib\logging\__init__.py", line 1845, in getLogger
return Logger.manager.getLogger(name)
File "C:\Users\fatima.arshad\AppData\Local\Continuum\anaconda2\envs\web_scraping\lib\logging\__init__.py", line 1174, in getLogger
raise TypeError('A logger name must be a string')
TypeError: A logger name must be a string
According to some resource it is because of this:
Traceback (most recent call last): File "C:\Users\fatima.arshad\AppData\Local\Continuum\anaconda2\envs\web_scraping\lib\site-packages\twisted\internet\defer.py", line 151, in maybeDeferred result = f(*args, **kw) File "C:\Users\fatima.arshad\AppData\Local\Continuum\anaconda2\envs\web_scraping\lib\site-packages\pydispatch\robustapply.py", line 55, in robustApply return receiver(*arguments, **named) File "C:\Users\fatima.arshad\AppData\Local\Continuum\anaconda2\envs\web_scraping\lib\site-packages\scrapy\extensions\feedexport.py", line 232, in open_spider uri = self.urifmt % self._get_uri_params(spider) File "C:\Users\fatima.arshad\AppData\Local\Continuum\anaconda2\envs\web_scraping\lib\site-packages\scrapy\extensions\feedexport.py", line 313, in _get_uri_params params[k] = getattr(spider, k) File "C:\Users\fatima.arshad\AppData\Local\Continuum\anaconda2\envs\web_scraping\lib\site-packages\scrapy\spiders\__init__.py", line 36, in logger logger = logging.getLogger(self.name) File "C:\Users\fatima.arshad\AppData\Local\Continuum\anaconda2\envs\web_scraping\lib\logging\__init__.py", line 1845, in getLogger return Logger.manager.getLogger(name) File "C:\Users\fatima.arshad\AppData\Local\Continuum\anaconda2\envs\web_scraping\lib\logging\__init__.py", line 1174, in getLogger raise TypeError('A logger name must be a string') TypeError: A logger name must be a string
How do I pass it the list, or is there any way that this list can be initialized only once with a Scrapy spider? The list contains all the URLs that have been crawled, and it is pickled. When the code starts, it initializes this list and crawls further only if the link is not present in it.
You need to pass the list of URLs using the spider attribute name (which is `crawled` in your case).
According to the docs, if you don't override the `__init__` method of the spider, all the arguments passed to the spider class are mapped to spider attributes. So in order to override the `crawled` attribute, you need to send the exact argument name.
Something like this:
# Load the previously crawled URLs from the "URLs" checkpoint file (creating
# it when missing) and hand them to the spider as its ``crawled`` attribute.
process = CrawlerProcess()
crawled_urls = []
try:
    with open("URLs", "rb") as openfile:
        # Read every pickled object in the file; the last one read wins.
        while True:
            try:
                crawled_urls = pickle.load(openfile)
            except EOFError:
                break
except (IOError, OSError, pickle.UnpicklingError):
    # Missing or unreadable checkpoint: (re)create it. Store a list, not "",
    # so the next run gets something the spider can ``append`` to.
    if not isinstance(crawled_urls, list):
        crawled_urls = []
    with open("URLs", "wb") as f:
        pickle.dump(crawled_urls, f)
# Files written by the old fallback contain "" -- normalise to a list.
if not isinstance(crawled_urls, list):
    crawled_urls = []
print(crawled_urls)
# The keyword must match the spider attribute name so that it overrides it.
process.crawl(Myspider, crawled=crawled_urls)
process.start()  # the script will block here until the crawling is finished
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.