Here is the basic structure of my Scrapy web spider.
import scrapy,urllib.request
class TestSpider(scrapy.Spider):
    """Spider that requests the Yahoo Finance financials page per NASDAQ symbol.

    The NASDAQ symbol directory is downloaded while the class body executes
    (that is, when the module is imported), so ``start_urls`` is an ordinary
    class attribute that Scrapy picks up as the initial set of URLs.
    """

    name = "quotes"
    allowed_domains = ["finance.yahoo.com"]

    url_nasdaq = "ftp://ftp.nasdaqtrader.com/SymbolDirectory/nasdaqlisted.txt"
    # Use a context manager so the FTP response is closed once it is read.
    with urllib.request.urlopen(url_nasdaq) as listing:
        s = listing.read().decode('ascii')
    # Drop the first piece and the last two pieces of the CRLF-split listing
    # (presumably the header row and the trailing footer -- confirm against
    # the file format).
    s1 = s.split('\r\n')[1:-2]
    namelist = [item for item in s1 if "NASDAQ TEST STOCK" not in item]
    # The symbol is the first '|'-separated field of each remaining row.
    s2 = [row.split('|')[0] for row in namelist]
    # Keep only symbols without a dot.
    s3 = [symbol for symbol in s2 if "." not in symbol]
    # Build the URLs from the filtered list (s3); the unfiltered s2 was used
    # before, which made the dot filter above a no-op.
    start_urls = [
        "https://finance.yahoo.com/quote/" + symbol + "/financials?p=" + symbol
        for symbol in s3
    ]

    def __init__(self, *args, **kw):
        """Initialise the spider and set the ``timeout`` attribute to 10.

        Positional and keyword arguments are forwarded to
        ``scrapy.Spider.__init__`` so the base class can finish its own setup.
        """
        super().__init__(*args, **kw)
        self.timeout = 10

    def parse(self, response):
        """Handle one downloaded page; the real processing is omitted."""
        content = response.body
        target = response.url
        # doing something, omitted code
Save it as test.py and run it with `scrapy runspider test.py`.
Now I want to wrap all the code that creates start_urls inside a method.
Here is my attempt:
class TestSpider(scrapy.Spider):
    """Variant that builds ``start_urls`` inside the constructor.

    NOTE(review): ``scrapy.Spider.__init__`` is never invoked by this
    constructor; that is presumably related to the failure the author
    reports -- confirm before relying on this version.
    """

    def __init__(self, *args, **kw):
        """Set ``timeout`` and fill ``start_urls`` from the NASDAQ listing."""
        self.timeout = 10
        source = "ftp://ftp.nasdaqtrader.com/SymbolDirectory/nasdaqlisted.txt"
        raw = urllib.request.urlopen(source).read()
        text = raw.decode('ascii')
        # Drop the first piece and the last two pieces of the listing.
        rows = text.split('\r\n')[1:-2]
        kept = [row for row in rows if "NASDAQ TEST STOCK" not in row]
        # The symbol is the first '|'-separated field of each row.
        tickers = [row.split('|')[0] for row in kept]
        plain = [ticker for ticker in tickers if "." not in ticker]
        self.start_urls = [
            "https://finance.yahoo.com/quote/" + ticker + "/financials?p=" + ticker
            for ticker in plain
        ]
It doesn't work.
This is what the spider's start_requests method is for.
It serves the purpose of creating the initial set of requests. Building on your example, it would read as:
class TestSpider(scrapy.Spider):
    """Spider whose initial requests are generated in ``start_requests``.

    The NASDAQ symbol directory is downloaded only when ``start_requests``
    runs, not when the class is defined.
    """

    def __init__(self, *args, **kw):
        """Initialise the spider and set the ``timeout`` attribute to 10.

        Positional and keyword arguments are forwarded to
        ``scrapy.Spider.__init__`` so the base class can finish its own setup.
        """
        super().__init__(*args, **kw)
        self.timeout = 10

    def start_requests(self):
        """Yield one financials-page request per symbol in the listing.

        Each request uses ``self.parse`` as its callback.
        """
        url_nasdaq = "ftp://ftp.nasdaqtrader.com/SymbolDirectory/nasdaqlisted.txt"
        # The response is closed before the first request is yielded.
        with urllib.request.urlopen(url_nasdaq) as listing:
            text = listing.read().decode('ascii')
        for symbol in self._extract_symbols(text):
            yield scrapy.Request(
                "https://finance.yahoo.com/quote/" + symbol + "/financials?p=" + symbol,
                callback=self.parse,
            )

    @staticmethod
    def _extract_symbols(text):
        """Return the symbols found in the CRLF-separated listing *text*.

        The first piece and the last two pieces of the split text are
        dropped, rows containing "NASDAQ TEST STOCK" are skipped, and the
        first '|'-separated field of each remaining row is kept unless it
        contains a dot.
        """
        rows = text.split('\r\n')[1:-2]
        symbols = [
            row.split('|')[0] for row in rows if "NASDAQ TEST STOCK" not in row
        ]
        return [symbol for symbol in symbols if "." not in symbol]
The technical posts on this site are licensed under CC BY-SA 4.0. If you reprint them, please indicate the site URL or the original address. For any questions, please contact: yoyou2525@163.com.