This is my scrapy code.
import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import Selector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from urlparse import urljoin
import pymongo
import time
class CompItem(scrapy.Item):
    """Container for a single scraped customer review.

    Every attribute is a plain ``scrapy.Field()`` with no metadata.  The
    fields are grouped below by what they describe; ``user_info`` and
    ``url_1`` are declared here but are not populated by the spider in
    this file.
    """

    # --- review content ---
    title = scrapy.Field()
    text = scrapy.Field()
    rating = scrapy.Field()
    date = scrapy.Field()

    # --- reviewer details ---
    name = scrapy.Field()
    user_info = scrapy.Field()
    email = scrapy.Field()
    mobile_no = scrapy.Field()

    # --- product and provenance ---
    model_name = scrapy.Field()
    category = scrapy.Field()
    source = scrapy.Field()
    url = scrapy.Field()
    url_1 = scrapy.Field()
class criticspider(CrawlSpider):
    """Spider that scrapes customer reviews from a Flipkart review page.

    This class declares no ``rules``, so the only pages it parses are the
    ones listed in ``start_urls``; each response is handed to
    ``parse_start_url``.
    """

    name = "flipkart_reviews"
    allowed_domains = ["flipkart.com"]

    # NOTE(review): ``urls`` and ``d`` are not used anywhere in this class;
    # they are kept so that external code reading them keeps working.
    urls = []

    # The MongoDB handles are created when the class body is executed (i.e.
    # at import time).  Nothing in this class reads them -- presumably they
    # were meant to supply ``start_urls``; confirm before removing.
    connection = pymongo.MongoClient("mongodb://localhost")
    db = connection.electronics
    db_coll = db.flipkart_url
    d = []

    start_urls = ['http://www.flipkart.com/samsung-galaxy-note-4/product-reviews/ITMEYFHGFDB75R73?pid=MOBEYAW2RFHQG83F&type=top']  # urls

    @staticmethod
    def _first(values, default=None):
        """Return the first element of ``values``, or ``default`` if empty."""
        return values[0] if values else default

    @staticmethod
    def _parse_rating(title):
        """Convert a star-rating tooltip (text before "stars") to a float.

        Returns ``None`` when ``title`` is missing or is not numeric.
        """
        if title is None:
            return None
        try:
            return float(title.split("stars")[0])
        except ValueError:
            return None

    def parse_start_url(self, response):
        """Yield one ``CompItem`` for every review block in ``response``.

        A field whose node is absent from the page is set to ``None``
        instead of raising ``IndexError``, so a single malformed review no
        longer aborts extraction of the remaining reviews on the page.

        :param response: the downloaded review page.
        :returns: a generator of populated ``CompItem`` objects.
        """
        # The product name is page-level, so extract it once outside the loop.
        model_name = self._first(
            response.xpath('//h1[@class="title"]/text()').re(r'Reviews of (.*?)$')
        )
        if model_name is not None:
            model_name = model_name.strip().encode('ascii', 'ignore')

        for review in response.css('div.review-list div[review-id]'):
            title = self._first(
                review.xpath('.//div[contains(@class,"line fk-font-normal bmargin5 dark-gray")]/strong/text()').extract()
            )
            if title is not None:
                title = title.encode('ascii', 'ignore')

            date = self._first(
                review.xpath('.//div[contains(@class, "date")]/text()').extract()
            )
            if date is not None:
                date = date.strip()

            stars = self._first(
                review.xpath('.//div[contains(@class,"fk-stars")]/@title').extract()
            )

            item = CompItem()
            # Contact details are not scraped here; placeholders are stored.
            item['email'] = None
            item['mobile_no'] = 0
            item['category'] = None
            item['title'] = title
            item['date'] = date
            item['model_name'] = model_name
            item['text'] = self._first(
                review.xpath('.//span[contains(@class,"review-text")]/text()').extract()
            )
            item['rating'] = self._parse_rating(stars)
            # The reviewer name is the element immediately preceding the date.
            item['name'] = ''.join(
                review.xpath('.//div[contains(@class, "date")]/preceding-sibling::*[1]//text()').extract()
            ).strip()
            item["url"] = response.url
            # NOTE(review): 3 is presumably the numeric id for this source
            # site -- confirm against whatever consumes the items.
            item['source'] = 3
            yield item
This works fine on my local machine without any errors, but the moment I run it on AWS it starts throwing the following errors:
2015-10-05 12:08:26 [scrapy] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2015-10-05 12:09:26 [scrapy] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2015-10-05 12:09:34 [scrapy] DEBUG: Retrying <GET http://www.flipkart.com/samsung-galaxy-note-4/product-reviews/ITMEYFHGFDB75R73?pid=MOBEYAW2RFHQG83F&type=top> (failed 1 times): TCP connection timed out: 110: Connection timed out.
2015-10-05 12:10:26 [scrapy] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2015-10-05 12:11:26 [scrapy] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2015-10-05 12:11:41 [scrapy] DEBUG: Retrying <GET http://www.flipkart.com/samsung-galaxy-note-4/product-reviews/ITMEYFHGFDB75R73?pid=MOBEYAW2RFHQG83F&type=top> (failed 2 times): TCP connection timed out: 110: Connection timed out.
2015-10-05 12:12:26 [scrapy] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2015-10-05 12:13:26 [scrapy] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2015-10-05 12:13:48 [scrapy] DEBUG: Gave up retrying <GET http://www.flipkart.com/samsung-galaxy-note-4/product-reviews/ITMEYFHGFDB75R73?pid=MOBEYAW2RFHQG83F&type=top> (failed 3 times): TCP connection timed out: 110: Connection timed out.
2015-10-05 12:13:48 [scrapy] ERROR: Error downloading <GET http://www.flipkart.com/samsung-galaxy-note-4/product-reviews/ITMEYFHGFDB75R73?pid=MOBEYAW2RFHQG83F&type=top>: TCP connection timed out: 110: Connection timed out.
2015-10-05 12:13:48 [scrapy] INFO: Closing spider (finished)
2015-10-05 12:13:48 [scrapy] INFO: Dumping Scrapy stats:
{'downloader/exception_count': 3,
'downloader/exception_type_count/twisted.internet.error.TCPTimedOutError': 3,
'downloader/request_bytes': 1119,
'downloader/request_count': 3,
'downloader/request_method_count/GET': 3,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2015, 10, 5, 6, 43, 48, 727700),
'log_count/DEBUG': 3,
'log_count/ERROR': 1,
'log_count/INFO': 13,
'scheduler/dequeued': 3,
'scheduler/dequeued/memory': 3,
'scheduler/enqueued': 3,
'scheduler/enqueued/memory': 3,
'start_time': datetime.datetime(2015, 10, 5, 6, 37, 26, 877249)}
2015-10-05 12:13:48 [scrapy] INFO: Spider closed (finished)
Earlier, the script was working very well. I have tried various approaches:
1. Setting a DOWNLOAD_DELAY.
2. Googling the same problem and reading about web scraping etiquette.
But everything was in vain.
There are a few checks that you can do.
If the above steps yield a result, it means we are getting a response and the problem lies in the way our spider makes its requests. There are a few things we can now do with respect to the spider.
也许您的IP被该网站阻止了
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.