This is my test project tree:
├── test11
│ ├── __init__.py
│ ├── items.py
│ ├── pipelines.py
│ ├── settings.py
│ └── spiders
│ ├── __init__.py
│ ├── basic.py
│ ├── easy.py
├── scrapy.cfg
In the items.py
file I have:
from scrapy.item import Item, Field
class test11Item(Item):
name = Field()
price = Field()
In the easy.py
file I have:
import scrapy
import urlparse
from scrapy.loader import ItemLoader
from scrapy.loader.processors import MapCompose, Join
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from test11.items import Test11Item
class EasySpider(CrawlSpider):
name = 'easy'
allowed_domains = ['web']
start_urls = ['https://www.amazon.cn/b?ie=UTF8&node=2127529051']
rules = (
Rule(SgmlLinkExtractor(restrict_xpaths='//*[@id="pagnNextLink"]')),
Rule(SgmlLinkExtractor(restrict_xpaths='//*[contains(@class,"s-access-detail-page")]'),
callback='parse_item')
)
def parse_item(self, response):
l = ItemLoader(item = Test11Item(), response = response)
l.add_xpath('name', '//*[@id="productTitle"]/text()', MapCompose(unicode.strip))
l.add_xpath('//*[@id="priceblock_ourprice"]/text()', MapCompose(lambda i: i.replace(',', ''), float), re='[,.0-9]+')
return l.load_item()
In the basic.py
file I have:
import scrapy
import urlparse
from scrapy.loader import ItemLoader
from scrapy.loader.processors import MapCompose, Join
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from test11.items import Test11Item
class BasicSpider(scrapy.Spider):
name = 'basic'
allowed_domains = ['web']
start_urls = ['https://www.amazon.cn/b?ie=UTF8&node=2127529051']
def parse(self, response):
l = ItemLoader(item = Test11Item(), response = response)
l.add_xpath('name', '//*[@id="productTitle"]/text()', MapCompose(unicode.strip))
l.add_xpath('//*[@id="priceblock_ourprice"]/text()', MapCompose(lambda i: i.replace(',', ''), float), re='[,.0-9]+')
return l.load_item()
When I run the basic
spider ( scrapy crawl basic
), I get the results I want. But when I run easy
spider, scrapy crawl easy
, I got no results at all!
What am I missing here?
您只需要适当地设置allowed_domains
:
allowed_domains = ['amazon.cn']
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.