I implemented a smo algorithm in python. As I just do it as a practice so I didn't use some scientific computing library such as numpy and scipy . I just want it work correct。 But when I test my code on diabetes , it keeps running for a week! I have checked my code many times and I also found some errors. but after I correct these errors, the code still runs too slow. I don't know if there are some errors that I didn't checked out or the smo is just inherently so slow.
So is there some common errors that could make code runs slow? I wrote my program referring to the pseudo-code of the smo paper Thanks very much.
Below is my code.
#encoding=utf8
import math
import random
class SVM(object):
def __init__(self, dataset, target, C=0.001, tolerance=0.001):
self.dataset=dataset
self.target=target
self.C=C
self.tolerance=tolerance
self.alpha=[0.0 for i in range(len(dataset))]
self.E={}
self.b=0.0
self.w=[0.0 for i in range(len(dataset[0]))]
def train(self):
numChanged=0
exampleAll=1
trainset_size=len(self.dataset)
iter=0
while numChanged > 0 or exampleAll:
numChanged=0
if exampleAll:
for i in range(trainset_size):
numChanged+=self.examineExample(i)
iter+=1
else:
for i in range(trainset_size):
if self.alpha[i] > 0 and self.alpha[i] < self.C:
numChanged+=self.examineExample(i)
iter+=1
if exampleAll:
exampleAll=0
elif numChanged == 0:
exampleAll=1
print "iter", iter
print "alpha", "\t".join([str(i) for i in self.alpha])
print "target", "\t".join(self.target)
for j in range(len(self.trainset[0])):
for i in range(trainset_size):
self.w[j] +=self.alpha[i]*int(self.target[i])*float(self.dataset[i][j])
def examineExample(self, i2):
print "in examineExample", i2
print "alpha", "\t".join([str(i) for i in self.alpha])
alpha2=self.alpha[i2]
y2=int(self.target[i2])
e2=self.calculateE(i2)
r2=e2*y2
print "r2", r2
if r2 < -self.tolerance and self.alpha[i2] < self.C or r2 > self.tolerance and self.alpha[i2] > 0: #i2违反了kkt条件
i1=self.select_i1(i2,e2)
if self.takeStep(i1, i2):
return 1
else:
all_sample_index=[i for i in range(len(self.dataset)) ]
random.shuffle(all_sample_index)
for k in range(len(all_sample_index)):
i1=all_sample_index[k]
if self.alpha[i1] > 0 and self.alpha[i1] < self.C:
if self.takeStep(i1, i2):
return 1
random.shuffle(all_sample_index)
for k in range(len(all_sample_index)):
i1=all_sample_index[k]
if self.takeStep(i1,i2):
return 1
return 0
def takeStep(self, i1, i2):
print "in takeStep", i1, i2
if i1==i2:
return 0
alpha1=self.alpha[i1]
y1=int(self.target[i1])
e1=self.calculateE(i1)
alpha2=self.alpha[i2]
y2=int(self.target[i2])
e2=self.calculateE(i2)
s=y1*y2
if y1 != y2:
L=max(0, alpha2-alpha1)
H=min(self.C, self.C+alpha2-alpha1)
if y1== y2:
L=max(0, alpha2+alpha1-self.C)
H=min(self.C, alpha2+alpha1)
if L==H:
return 0
k11=self.kernel(i1, i1)
k12=self.kernel(i1, i2)
k22=self.kernel(i2, i2)
eta=k11+k22-2*k12
if eta > 0:
self.alpha[i2]=alpha2+y2*(e1-e2)/eta
if self.alpha[i2] < L:
self.alpha[i2]=L
if self.alpha[i2] >H:
self.alpha[i2]=H
print "abs", abs(self.alpha[i2] - alpha2)
if abs(self.alpha[i2] - alpha2) < 0.00001
return 0
self.alpha[i1]=alpha1+s*(alpha2-self.alpha[i2])
b1=self.b-e1-y1*(self.alpha[i1]-alpha1)*self.kernel(i1,i1)-y2*(self.alpha[i2]-alpha2)*self.kernel(i1,i2)
b2=self.b-e2-y1*(self.alpha[i1]-alpha1)*self.kernel(i1,i2)-y2*(self.alpha[i2]-alpha2)*self.kernel(i2,i2)
print "two old alpha", alpha1, alpha2
print "two alpha", self.alpha[i1] ,self.alpha[i2]
if self.alpha[i1] >0 and self.alpha[i1] < self.C and self.alpha[i2] > 0 and self.alpha[i2] < self.C:
print "two b", b1, b2
if self.alpha[i1] >0 and self.alpha[i1] < self.C:
self.b=b1
elif self.alpha[i2] > 0 and self.alpha[i2] < self.C:
self.b=b2
else:
self.b=(b1+b2)/2
self.E[i2]=self.calculateE(i2)
self.E[i1]=self.calculateE(i1)
return 1
else:
return 0
def select_i1(self, i, Ei ):
maxK=-1;
maxDeltaE=0.0
Ej=0
self.E[i]=Ei
for k in range(len(self.dataset)):
if self.alpha[k] > 0 and self.alpha[k] < self.C:
Ek=self.calculateE(k)
deltaE=Ek-Ei
if abs(deltaE) > maxDeltaE:
maxK=k
maxDeltaE=deltaE
Ej=Ek
if maxK != -1:
return maxK
else:
j=i
while j == i:
j=random.randint(0, len(self.dataset))
return j
def calculateE(self, i):
f_x=0.0
trainset_size=len(self.dataset)
for k in range(trainset_size):
f_x+=(self.alpha[k]*int(self.target[k])*self.kernel(k,i))
f_x+=self.b
e_x=f_x-float(self.target[i])
return e_x
def kernel(self, i, j):
return sum([float(self.dataset[i][k])*float(self.dataset[j][k]) for k in range(len(self.dataset[i]))])
def test(self, testset, testset_target):
precision=0.0
correct=0
for k in range(len(testset)):
sample =testset[k]
pred_value=0.0
for i in range(len(sample)):
pred_value+=self.w[i]*sample[i]
pred_value+=self.b
if pred_value >= 0:
label=1
else:
label=-1
if testset_target[k] == label:
correct+=1
precision=correct/(float(len(testset_target)))
return precision
def read_libsvm_format_file(dataset_filename):
dataset_file=file(dataset_filename,'r')
dataset_label=[]
dataset=[]
for line in dataset_file:
splitted=line.strip().split()
dataset_label.append(splitted[0])
sample=[]
for i in range(1,len(splitted)):
index_value=splitted[i].split(":")
sample.append(index_value[1])
dataset.append(sample)
return dataset, dataset_label
if __name__ == "__main__":
dataset, target =read_libsvm_format_file('diabetes')
trainset_size=500
index=range(len(dataset))
random.shuffle(index)
trainset=[ dataset[index[i]] for i in range(trainset_size) ]
trainset_target=[ target[index[i]] for i in range(trainset_size) ]
testset=[ dataset[index[i]] for i in range(trainset_size, len(index)) ]
testset_target=[ target[index[i]] for i in range(trainset_size, len(index)) ]
svm=SVM(dataset, target)
svm.train()
Unfortunately, the most common error in scientific python programming that causes code to run slow is... to use pure python. Python loops are slow, period. And by slow I mean extremely slow. Even assuming that eveything is perfectly correct, you will end up with extremely slow optimizer unless you do one of the following:
There is no :
after
if abs(self.alpha[i2] - alpha2) < 0.00001
So it will not even run.
Next, after fixing it and runing on diabetes
it crashes
r2 -0.999256460902 in takeStep 658 2 Traceback (most recent call last): File "a.py", line 218, in <module> svm.train() File "a.py", line 26, in train numChanged+=self.examineExample(i) File "a.py", line 55, in examineExample if self.takeStep(i1, i2): File "a.py", line 79, in takeStep e1=self.calculateE(i1) File "a.py", line 160, in calculateE f_x+=(self.alpha[k]*int(self.target[k])*self.kernel(k,i)) File "a.py", line 167, in kernel return sum([float(self.dataset[i][k])*float(self.dataset[j][k]) for k in range(len(self.dataset[i]))]) IndexError: list index out of range
which is caused by your invalid reading function . libsvm (svmlight) dataformat is sparse , so some of the dimensions might be missing - your code assumes that it does not.
You even read your data as strings
index_value=splitted[i].split(":") sample.append(index_value[1])
It should be (after you preallocate your sample lists so they are big enough to fit data, or use defaultdicts sample = defaultdict(lambda: 0)
).
index_value=splitted[i].split(":") sample[int(index_value[0])-1] = float(index_value[1])
The same applies to reading labels. Consequently you have dozens of completely redundant type conversions in your code (all your current float()
and int()
calls are redundant).
There is also error, in final building of w
:
[...] self.trainset [...] // you do not have a "trainset" field in SVM
In testing code you add intercept ( b
) multiple times
for i in range(len(sample)): pred_value+=self.w[i] * sample[i] pred_value+=self.b
while it should be
for i in range(len(sample)): pred_value+=self.w[i] * sample[i] pred_value+=self.b
I would not be suprised if one would find also many errors in SMO itself, which might cause the algorithm to not converge at all, but for now I only managed to find the above.
after fixing all of the above, removing all debug printing messages and running with pypy
I get following model:
[-0.7725132490683443, -2.8232379861128907, 0.5166865781499452, 0.1494369704938019, 0.1533317981122747, -1.9500615428909012, -0.7957828887451327, -0.12523832631571777]
while scikit-learn gives
[ 0.77296251 2.82387247 -0.51692311 -0.14987696 -0.15312237 1.94999242 0.79593224 0.12527931]
So up to sign it is the same model.
When using C=1
both codes end up with training set accuracy 0.776041666667
real 9m11.017s
pypy
real 0m47.033s
pypy
real 0m40.215s
real 0m0.338s
It appears that most of your errors are located in data reading utilities. Furthermore, as stated at the begining - "classic" python interpreter has extremely slow loops, so you either have to use pypy
(and lack of support for many libraries), or cython
(and more complex development) or at least numerical libraries such as numpy
and scipy
.
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.