[英]Pytorch MNIST autoencoder to learn 10-digit classification
I'm trying to build a simple autoencoder for MNIST, where the middle layer is just 10 neurons.我正在尝试为 MNIST 构建一个简单的自动编码器,其中中间层只有 10 个神经元。 My hope is that it will learn to classify the 10 digits, and I assume that would lead to the lowest error in the end (wrt reproducing the original image).
我希望它能学会对 10 位数字进行分类,我认为这最终会导致最低的错误(wrt 重现原始图像)。
I have the following code, which I've already played around with a fair amount.我有以下代码,我已经使用了相当多的代码。 If I run it for up-to 100 epochs, the loss doesn't really go below 1.0, and if I evaluate it, it's obviously not working.
如果我运行它多达 100 个 epochs,损失实际上并没有低于 1.0,如果我评估它,它显然不起作用。 What am I missing?
我错过了什么?
Training:训练:
import torch
import torchvision as tv
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torchvision.utils import save_image
# Training hyper-parameters for the question's script.
num_epochs = 100
batch_size = 64

# Per-sample preprocessing: tensor conversion followed by a fixed
# mean/std normalisation (0.1307 / 0.3081).
transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,)),
    ]
)

# MNIST training split rooted at ./data (download=True), served in
# shuffled mini-batches by four worker processes.
trainset = tv.datasets.MNIST(
    root='./data',
    train=True,
    download=True,
    transform=transform,
)
dataloader = torch.utils.data.DataLoader(
    trainset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=4,
)
class Autoencoder(nn.Module):
    """Convolutional autoencoder for 1 x 28 x 28 inputs with a 10-unit softmax bottleneck.

    ``encoder`` maps an image batch to a (batch, 10) tensor whose rows sum
    to 1; ``decoder`` maps that tensor back to a (batch, 1, 28, 28) image.
    """

    def __init__(self):
        super(Autoencoder, self).__init__()
        self.encoder = nn.Sequential(
            # 1 x 28 x 28
            nn.Conv2d(1, 4, kernel_size=5),
            nn.Dropout2d(p=0.2),
            # 4 x 24 x 24
            nn.ReLU(True),
            nn.Conv2d(4, 8, kernel_size=5),
            nn.Dropout2d(p=0.2),
            # 8 x 20 x 20 = 3200
            nn.ReLU(True),
            nn.Flatten(),
            nn.Linear(3200, 10),
            # NOTE(review): this ReLU clamps negative logits to 0 before the
            # softmax, which limits how peaked the softmax can become.
            nn.ReLU(True),
            # 10
            # dim=1 is the feature axis of the (batch, 10) tensor.  Spelling it
            # out avoids PyTorch's "implicit dimension choice" deprecation
            # warning without changing the result for 2-D input.
            nn.Softmax(dim=1),
            # 10
        )
        self.decoder = nn.Sequential(
            # 10
            nn.Linear(10, 400),
            nn.ReLU(True),
            # 400
            nn.Unflatten(1, (1, 20, 20)),
            # 1 x 20 x 20
            nn.Dropout2d(p=0.2),
            nn.ConvTranspose2d(1, 10, kernel_size=5),
            # 10 x 24 x 24
            nn.ReLU(True),
            nn.Dropout2d(p=0.2),
            nn.ConvTranspose2d(10, 1, kernel_size=5),
            # 1 x 28 x 28
            # NOTE(review): ReLU output is >= 0, so the Sigmoid that follows
            # can only emit values in [0.5, 1).
            nn.ReLU(True),
            nn.Sigmoid(),
        )

    def forward(self, x):
        """Encode ``x`` to the 10-unit bottleneck and decode it back to an image."""
        x = self.encoder(x)
        x = self.decoder(x)
        return x
# Model, reconstruction criterion and optimiser; everything stays on the CPU.
model = Autoencoder().cpu()
distance = nn.MSELoss()
# Alternative optimiser that was tried:
#optimizer = torch.optim.Adam(model.parameters(), weight_decay=1e-5)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.5)

for epoch in range(num_epochs):
    # Labels are ignored: the input image is its own reconstruction target.
    for img, _ in dataloader:
        img = Variable(img).cpu()
        output = model(img)
        loss = distance(output, img)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    # Reports the loss of the last mini-batch of the epoch.
    print('epoch [{}/{}], loss: {:.4f}'.format(epoch+1, num_epochs, loss.item()))
The training loss already indicates that this is not working, and printing out the confusion matrix confirms it (in this case it need not be the identity matrix, since the neurons can be ordered arbitrarily, but if this worked it should approximate the identity after reordering rows and columns):训练损失已经表明它不起作用,打印出混淆矩阵也证实了这一点(在这种情况下不一定是单位矩阵,因为神经元可以任意排序,但如果这可行的话,重新排列行和列之后它应该近似于单位矩阵):
import numpy as np
# Rows: true digit label.  Columns: index of the most active bottleneck unit.
confusion_matrix = np.zeros((10, 10))
batch_size = 20*1000
testset = tv.datasets.MNIST(root='./data', train=False, download=True, transform=transform)
dataloader = torch.utils.data.DataLoader(testset, batch_size=batch_size, shuffle=True, num_workers=4)

# Put the model in eval mode so the Dropout2d layers in the encoder are
# inactive while measuring; no_grad skips building the autograd graph.
model.eval()
with torch.no_grad():
    for imgs, labels in dataloader:
        encs = model.encoder(imgs.cpu()).numpy()
        predicted = encs.argmax(axis=1)
        # np.add.at accumulates correctly when the same (actual, predicted)
        # cell occurs many times within one batch.
        np.add.at(confusion_matrix, (labels.numpy(), predicted), 1)
print(confusion_matrix)
I was able to bring your code to a version where it would at least converge.我能够将您的代码带到至少会收敛的版本。 In summary, I think there might be multiple problems with it: the normalization (why those values?), some unnecessary relus, too high learning rate, MSE loss instead of cross-entropy and mainly I don't think the softmax in the bottleneck layer works that way for vanishing gradient reasons, see here
总之,我认为它可能存在多个问题:归一化(为什么是这些值?),一些不必要的 relus,太高的学习率,MSE 损失而不是交叉熵,主要是我不认为 softmax 在瓶颈由于梯度消失的原因,图层以这种方式工作,请参见此处
https://www.quora.com/Does-anyone-ever-use-a-softmax-layer-mid-neural-network-rather-than-at-the-end https://www.quora.com/Does-anyone-ever-use-a-softmax-layer-mid-neural-network-rather-than-at-the-end
Maybe one could fix this using the Gumbel softmax: https://arxiv.org/abs/1611.01144也许可以使用 Gumbel softmax 解决这个问题: https://arxiv.org/abs/1611.01144
Moreover, there are papers already achieving this, but as a Variational Autoencoder rather than a vanilla autoencoder, see here: https://arxiv.org/abs/1609.02200 .此外,已经有论文实现了这一点,但作为变分自动编码器而不是普通自动编码器,请参见此处: https://arxiv.org/abs/1609.02200 。
For now you can use this modification, which at least converges and then modify step-by-step and see what breaks it.现在你可以使用这个修改,它至少会收敛,然后逐步修改,看看是什么破坏了它。
As for the classification, the standard way would be to use the trained encoder to generate features from images and then use a normal classifier (an SVM or similar) on top of that.至于分类,标准方法是使用经过训练的编码器从图像中生成特征,然后在其上使用普通分类器(例如 SVM)。
batch_size = 16
# ToTensor only, with no Normalize step.
# NOTE(review): presumably this keeps the targets inside the range a Sigmoid
# output layer can produce -- confirm against the model used with it.
transform = transforms.Compose([
    transforms.ToTensor(),
])
# The dataset class is reached through the already-imported ``tv`` alias;
# a bare ``MNIST`` name is never imported in this script.
trainset = tv.datasets.MNIST(root='./data/', train=True, download=True, transform=transform)
dataloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True, num_workers=8)
class Autoencoder(nn.Module):
    """Fully convolutional autoencoder: two conv layers down, two transposed conv layers up.

    For a (batch, 1, 28, 28) input the encoder yields (batch, 4, 20, 20)
    and the decoder returns (batch, 1, 28, 28) squashed by a Sigmoid.
    """

    def __init__(self):
        super().__init__()
        # Layers are created in the same order as they are applied:
        # encoder first, then decoder.
        encoder_layers = [
            nn.Conv2d(1, 2, kernel_size=5),
            nn.ReLU(),
            nn.Conv2d(2, 4, kernel_size=5),
        ]
        decoder_layers = [
            nn.ConvTranspose2d(4, 10, kernel_size=5),
            nn.ReLU(),
            nn.ConvTranspose2d(10, 1, kernel_size=5),
            nn.Sigmoid(),
        ]
        self.encoder = nn.Sequential(*encoder_layers)
        self.decoder = nn.Sequential(*decoder_layers)

    def forward(self, x):
        """Return the reconstruction of ``x``."""
        return self.decoder(self.encoder(x))
model = Autoencoder().cpu()
distance = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)
num_epochs = 20
# One entry per epoch: the reconstruction of that epoch's last mini-batch.
outputs = []
# Plain range(): no progress-bar package is imported in this script.
for epoch in range(num_epochs):
    for img, _ in dataloader:
        img = img.cpu()
        output = model(img)
        loss = distance(output, img)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    # Detach before storing so the saved tensor does not keep the whole
    # autograd graph of the batch alive.
    outputs.append(output.detach())
    print('epoch [{}/{}], loss: {:.4f}'.format(epoch+1, num_epochs, loss.item()))
import matplotlib.pyplot as plt
# plotting epoch outputs
# One figure per stored batch; iterating ``outputs`` directly keeps this in
# step with however many epochs were actually recorded.
for k, batch in enumerate(outputs):
    imgs = batch.detach().numpy()
    plt.figure(figsize=(9, 2))
    for i, item in enumerate(imgs):
        # One subplot per image so later images do not paint over earlier ones.
        plt.subplot(1, len(imgs), i + 1)
        plt.imshow(item[0])
        plt.title(str(i))
plt.show()
Autoencoder is technically not used as a classifier in general.自动编码器在技术上一般不用作分类器。 They learn how to encode a given image into a short vector and reconstruct the same image from the encoded vector.
他们学习如何将给定的图像编码为一个短向量,并从编码的向量中重建相同的图像。 It is a way of compressing image into a short vector:
这是一种将图像压缩成短向量的方法:
Since you want to train autoencoder with classification capabilities, we need to make some changes to model.由于要训练具有分类能力的自动编码器,我们需要对 model 进行一些更改。 First of all, there will be two different losses:
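首先,会有两种不同的损失:
1. MSE loss: the usual autoencoder reconstruction loss, computed between the decoder output and the input image.
2. Classification loss: cross-entropy between the 10-dimensional encoder output and the target labels.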
I've done a couple of changes to your code to get the combined model working.我对您的代码进行了一些更改,以使组合的 model 正常工作。 Firstly, let's see the code:
首先,让我们看一下代码:
import torch
import torchvision as tv
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torchvision.utils import save_image
# Hyper-parameters.
num_epochs = 10
batch_size = 64

# Tensor conversion followed by a fixed mean/std normalisation.
transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,)),
    ]
)

# Train and test splits share the same root directory and transform.
trainset = tv.datasets.MNIST(
    root='./data', train=True, download=True, transform=transform
)
testset = tv.datasets.MNIST(
    root='./data', train=False, download=True, transform=transform
)

dataloader = torch.utils.data.DataLoader(
    trainset, batch_size=batch_size, shuffle=True, num_workers=4
)
testloader = torch.utils.data.DataLoader(
    testset, batch_size=batch_size, shuffle=True, num_workers=4
)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
class Autoencoderv3(nn.Module):
    """Autoencoder whose 10-unit bottleneck doubles as classification logits.

    ``forward`` returns both the reconstruction (decoded from the softmax of
    the bottleneck) and the raw bottleneck values.
    """

    def __init__(self):
        super().__init__()
        # Submodules are registered in the order encoder, softmax, decoder.
        encoder_layers = [
            nn.Conv2d(1, 4, kernel_size=5),
            nn.Dropout2d(p=0.1),
            nn.ReLU(True),
            nn.Conv2d(4, 8, kernel_size=5),
            nn.Dropout2d(p=0.1),
            nn.ReLU(True),
            nn.Flatten(),
            nn.Linear(3200, 10),
        ]
        self.encoder = nn.Sequential(*encoder_layers)

        self.softmax = nn.Softmax(dim=1)

        decoder_layers = [
            nn.Linear(10, 400),
            nn.ReLU(True),
            nn.Unflatten(1, (1, 20, 20)),
            nn.Dropout2d(p=0.1),
            nn.ConvTranspose2d(1, 10, kernel_size=5),
            nn.ReLU(True),
            nn.Dropout2d(p=0.1),
            nn.ConvTranspose2d(10, 1, kernel_size=5),
        ]
        self.decoder = nn.Sequential(*decoder_layers)

    def forward(self, x):
        """Return ``(reconstruction, bottleneck)`` for the batch ``x``."""
        bottleneck = self.encoder(x)
        probabilities = self.softmax(bottleneck)
        reconstruction = self.decoder(probabilities)
        return reconstruction, bottleneck
model = Autoencoderv3().to(device)
distance = nn.MSELoss()
class_loss = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.5)
# Relative weight of the reconstruction and classification objectives.
mse_multp = 0.5
cls_multp = 0.5

model.train()
for epoch in range(num_epochs):
    total_mseloss = 0.0
    total_clsloss = 0.0
    for img, labels in dataloader:
        img, labels = img.to(device), labels.to(device)
        output, output_en = model(img)
        loss_mse = distance(output, img)
        loss_cls = class_loss(output_en, labels)
        loss = (mse_multp * loss_mse) + (cls_multp * loss_cls)  # Combine two losses together
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # Track this epoch's loss
        total_mseloss += loss_mse.item()
        total_clsloss += loss_cls.item()

    # Check accuracy on test set after each epoch:
    model.eval()  # Turn off dropout in evaluation mode
    acc = 0
    total_samples = 0
    # no_grad: the test pass needs no gradients, so skip graph construction.
    with torch.no_grad():
        for img, labels in testloader:
            # We only care about the 10 dimensional encoder output for classification
            img, labels = img.to(device), labels.to(device)
            _, output_en = model(img)
            # Softmax is monotonic, so the argmax of the raw encoder output
            # is the same class the softmax probabilities would select.
            pred = output_en.argmax(dim=1)
            acc += (pred == labels).sum().item()
            total_samples += labels.shape[0]
    model.train()  # Enables dropout back again
    print('epoch [{}/{}], loss_mse: {:.4f} loss_cls: {:.4f} Acc on test: {:.4f}'.format(epoch+1, num_epochs, total_mseloss / len(dataloader), total_clsloss / len(dataloader), acc / total_samples))
This code should now train the model both as a classifier and a generative autoencoder.此代码现在应该将 model 训练为分类器和生成自动编码器。 In general though, this type of approach can be a bit tricky to get the model training.
但总的来说,这种方法可能有点难以获得 model 训练。 In this case, MNIST data is simple enough to get those two complementary losses train together.
在这种情况下,MNIST 数据很简单,可以将这两个互补的损失训练在一起。 In more complex cases like Generative Adversarial Networks (GAN), they apply model training switching, freezing one model etc. to get whole model trained.
在生成对抗网络 (GAN) 等更复杂的情况下,他们应用 model 训练切换、冻结一个 model 等来获得整个 model 训练。 This autoencoder model trains easily on MNIST without doing those types of tricks:
这个自动编码器 model 在 MNIST 上轻松训练,无需使用这些类型的技巧:
epoch [1/10], loss_mse: 0.8928 loss_cls: 0.4627 Acc on test: 0.9463
epoch [2/10], loss_mse: 0.8287 loss_cls: 0.2105 Acc on test: 0.9639
epoch [3/10], loss_mse: 0.7803 loss_cls: 0.1574 Acc on test: 0.9737
epoch [4/10], loss_mse: 0.7513 loss_cls: 0.1290 Acc on test: 0.9764
epoch [5/10], loss_mse: 0.7298 loss_cls: 0.1117 Acc on test: 0.9762
epoch [6/10], loss_mse: 0.7110 loss_cls: 0.1017 Acc on test: 0.9801
epoch [7/10], loss_mse: 0.6962 loss_cls: 0.0920 Acc on test: 0.9794
epoch [8/10], loss_mse: 0.6824 loss_cls: 0.0859 Acc on test: 0.9806
epoch [9/10], loss_mse: 0.6733 loss_cls: 0.0797 Acc on test: 0.9814
epoch [10/10], loss_mse: 0.6671 loss_cls: 0.0764 Acc on test: 0.9813
As you can see, both the MSE loss and the classification loss are decreasing, and accuracy on the test set is increasing.如您所见,mse 损失和分类损失都在减少,并且测试集的准确度在增加。 In the code, MSE loss and classification loss are added together.
在代码中,MSE 损失和分类损失相加。 This means respective gradients calculated from each loss are fighting against each other to force the network into their direction.
这意味着从每个损失计算的各个梯度正在相互对抗以迫使网络进入它们的方向。 I've added loss multiplier to control the contribution from each loss.
我添加了损失乘数来控制每个损失的贡献。 If MSE has a higher multiplier, network will have more gradients from MSE loss, meaning it will better learn reconstruction, if CLS loss has a higher multiplier, network will get better classification accuracies.
如果 MSE 具有更高的乘数,网络将从 MSE 损失中获得更多的梯度,这意味着它将更好地学习重建,如果 CLS 损失具有更高的乘数,网络将获得更好的分类精度。 You can play with those multiplier to see how end result is changing, but MNIST is a very easy dataset so differences might be hard to see maybe.
您可以使用这些乘数来查看最终结果如何变化,但 MNIST 是一个非常简单的数据集,因此可能很难看出差异。 Currently, it doesn't do too bad at reconstructing inputs:
目前,它在重建输入方面做得还不错:
import numpy as np
import matplotlib.pyplot as plt
model.eval()
# Pull a single batch; list(dataloader) would load the entire dataset
# just to read its first element.
img, labels = next(iter(dataloader))
img = img.to(device)
with torch.no_grad():
    output, output_en = model(img)
inp = img[0:10, 0, :, :].squeeze().detach().cpu()
out = output[0:10, 0, :, :].squeeze().detach().cpu()
# Just some trick to concatenate first ten images next to each other:
# (10, 28, 28) -> (28, 10, 28) -> (28, 280)
inp = inp.permute(1,0,2).reshape(28, -1).numpy()
out = out.permute(1,0,2).reshape(28, -1).numpy()
# Inputs on the top row, reconstructions underneath.
combined = np.vstack([inp, out])
plt.imshow(combined)
plt.show()
I am sure with more training and fine tuning loss multipliers, you can get better results.我相信通过更多的训练和微调损失乘数,你可以获得更好的结果。
Lastly, the decoder receives the softmax of the encoder output.最后,解码器接收编码器 output 的 softmax。 This means the decoder tries to create the output image from the 0 - 1 probabilities of the input.
这意味着解码器尝试从输入的 0 - 1 概率创建 output 图像。 So if the softmax probability vector is 0.98 at input location 0 and close to zero elsewhere, the decoder should output an image that looks like the digit 0.
因此,如果 softmax 概率向量在输入位置 0 处为 0.98,而在其他位置接近于零,则解码器应该 output 看起来像数字 0 的图像。 Here I give the network one-hot inputs to create reconstructions of the digits 0 to 9:
在这里,我给出网络输入以创建 0 到 9 个重建:
# Ten one-hot rows: row i has a 1.0 in column i and 0.0 elsewhere.
test_arr = np.eye(10, dtype=np.float32)

model.eval()
# Feed the one-hot rows straight into the decoder, bypassing the encoder.
img = torch.from_numpy(test_arr).to(device)
decoded = model.decoder(img)
tiles = decoded[0:10, 0, :, :].squeeze().detach().cpu()
# (10, 28, 28) -> (28, 10, 28) -> (28, 280): the ten outputs side by side.
out = tiles.permute(1,0,2).reshape(28, -1).numpy()
plt.imshow(out)
plt.show()
I've also done a few small changes in the code, printing epoch average loss etc. which doesn't really change the training logic, so you can see those changes in the code and let me know if anything looks weird.我还对代码进行了一些小的更改,打印 epoch average loss 等,这并没有真正改变训练逻辑,所以你可以在代码中看到这些变化,如果有什么看起来很奇怪,请告诉我。
I played around with your code (from above and Github ) and found the following:我玩弄了你的代码(来自上面和Github )并发现了以下内容:
Sigmoid: when your code loads the MNIST dataset, you apply a Transform to normalize the data, but your Autoencoder model uses nn.Sigmoid()
as its final layer, which forces the data to be in the range of [0, 1] (but the normalized data is more like [-.4242, 2.8215]). Commenting-out the sigmoid layer helps greatly reduce the loss during training. Sigmoid:当你的代码加载 MNIST 数据集时,你应用了一个 Transform 来规范化数据,但是你的自动编码器 model 使用
nn.Sigmoid()
作为它的最后一层,它强制数据在 [0, 1] (但归一化后的数据更像是 [-.4242, 2.8215]。注释掉 sigmoid 层有助于大大减少训练过程中的损失。
Softmax: I understand why you include the nn.Softmax()
layer - to try and force the learned 10 features to be used sparsely for reconstructing each image. Softmax:我理解你为什么包含
nn.Softmax()
层——试图强制稀疏地使用学习到的 10 个特征来重建每个图像。 It does help raise the test accuracy in some cases.在某些情况下,它确实有助于提高测试准确性。 After trying a few ideas (like annealing a softmax temperature), it feels like a single float to reconstruct each class of digit is just insufficient.
在尝试了一些想法(比如对 softmax 温度进行退火)之后,感觉用单个浮点数来重构每个数字 class 是不够的。
Clustering: another way to use the features to predict 1 of 10 positions for each image is by clustering the feature representations (over some set of training/dev samples).聚类:另一种使用特征来预测每个图像 10 个位置中的一个的方法是对特征表示进行聚类(在一些训练/开发样本集上)。 I tried this and found it helped raise the test accuracy.
我试过了,发现它有助于提高测试的准确性。
CNN: I found a different CNN AE model from here that works a little bit better in the experiments I ran. CNN:我从这里发现了一个不同的 CNN AE model,它在我运行的实验中效果更好一些。
Optimizer: I found that the Adam optimizer with LR=.001
works better than the ad-hoc values I tried with SGD, Adam, and Adadelta.优化器:我发现
LR=.001
的 Adam 优化器比我尝试使用 SGD、Adam 和 Adadelta 的临时值效果更好。
Finally, I found that wrapping the img with Variable()
is not needed, so I removed that.最后,我发现不需要用
Variable()
包装 img,所以我删除了它。
Below is the final code I ended up with.下面是我最终得到的最终代码。 After 25 epochs of training:
经过 25 个 epoch 的训练:
Training to 100 epochs doesn't seem to improve things.训练到 100 个 epoch 似乎并没有改善事情。 Here is a sample of the before/after digits at epoch 8 where the loss=.0436:
这是第 8 个时期前/后数字的示例,其中损失 =.0436:
import os
from collections import defaultdict
import numpy as np
import matplotlib.pyplot as plt
# # work around sklearn warning
os.environ["OMP_NUM_THREADS"] = "4"
import sklearn.cluster as cluster
import torch
import torchvision as tv
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
class Autoencoder(nn.Module):
    """Conv encoder / dense-plus-transposed-conv decoder with an optional softmax bottleneck.

    Args:
        d_hidden: width of the bottleneck produced by the encoder.
        use_softmax: when true, ``forward`` applies a temperature-scaled
            softmax to the bottleneck before decoding.
    """

    def __init__(self, d_hidden=10, use_softmax=False):
        super().__init__()
        encoder_layers = [
            # 1 x 28 x 28
            nn.Conv2d(1, 4, kernel_size=5),
            # 4 x 24 x 24
            nn.ReLU(True),
            nn.Conv2d(4, 8, kernel_size=5),
            nn.ReLU(True),
            # 8 x 20 x 20 = 3200
            nn.Flatten(),
            nn.Linear(3200, d_hidden),
            # d_hidden
        ]
        self.encoder = nn.Sequential(*encoder_layers)

        self.use_softmax = use_softmax

        decoder_layers = [
            # d_hidden
            nn.Linear(d_hidden, 400),
            # 400
            nn.ReLU(True),
            nn.Linear(400, 4000),
            # 4000
            nn.ReLU(True),
            nn.Unflatten(1, (10, 20, 20)),
            # 10 x 20 x 20
            nn.ConvTranspose2d(10, 10, kernel_size=5),
            # 10 x 24 x 24
            nn.ConvTranspose2d(10, 1, kernel_size=5),
            # 1 x 28 x 28
        ]
        self.decoder = nn.Sequential(*decoder_layers)

    def forward(self, x, temperature):
        """Reconstruct ``x``; ``temperature`` divides the bottleneck before the optional softmax."""
        code = self.encoder(x)
        if self.use_softmax:
            code = torch.softmax(code / temperature, dim=-1)
        return self.decoder(code)

    def get_features(self, x):
        """Return the raw encoder output (no softmax applied)."""
        return self.encoder(x)
class NewAutoencoder(nn.Module):
    """Strided-convolution autoencoder with an optional softmax bottleneck.

    For (batch, 1, 28, 28) inputs the encoder output is
    (batch, d_hidden, 1, 1): two stride-2 convolutions reduce 28 -> 14 -> 7
    and the final 7x7 convolution collapses the remaining spatial extent.

    Args:
        d_hidden: number of bottleneck channels.
        use_softmax: when true, ``forward`` applies a temperature-scaled
            softmax across the bottleneck channels before decoding.
    """

    def __init__(self, d_hidden=64, use_softmax=False):
        super(NewAutoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Conv2d(1, 16, 3, stride=2, padding=1),
            nn.ReLU(),
            nn.Conv2d(16, 32, 3, stride=2, padding=1),
            nn.ReLU(),
            nn.Conv2d(32, d_hidden, 7)
        )
        # The softmax is applied in forward() rather than appended to the
        # encoder so that the temperature argument can take effect.
        self.use_softmax = use_softmax
        self.decoder = nn.Sequential(
            nn.ConvTranspose2d(d_hidden, 32, 7),
            nn.ReLU(),
            nn.ConvTranspose2d(32, 16, 3, stride=2, padding=1, output_padding=1),
            nn.ReLU(),
            nn.ConvTranspose2d(16, 1, 3, stride=2, padding=1, output_padding=1),
        )

    def forward(self, x, temperature):
        """Reconstruct ``x``.

        ``temperature`` divides the bottleneck before the softmax and is
        ignored when ``use_softmax`` is false.
        """
        x = self.encoder(x)
        if self.use_softmax:
            # dim=1 is the channel axis.  The trailing spatial axes have
            # size 1 for 28x28 inputs, so a softmax over them would be a
            # constant 1.0 and carry no information.
            x = torch.softmax(x / temperature, dim=1)
        x = self.decoder(x)
        return x

    def get_features(self, x):
        """Return the raw encoder output with the two trailing size-1 axes squeezed away."""
        output = self.encoder(x).squeeze(-1).squeeze(-1)
        return output
def show_samples(model, dataloader, device, epoch, temperature):
    """Plot the first ten inputs of one batch above their reconstructions.

    Args:
        model: module called as ``model(img, temperature)``; it is switched
            to eval mode and left in that mode.
        dataloader: iterable yielding ``(images, labels)`` batches.
        device: device the image batch is moved to before the forward pass.
        epoch: value shown in the plot title.
        temperature: forwarded unchanged to the model.
    """
    model.eval()
    # Fetch a single batch; list(dataloader) would iterate the whole
    # dataset just to read its first element.
    img, labels = next(iter(dataloader))
    img = img.to(device)
    with torch.no_grad():  # plotting only, no gradients needed
        output = model(img, temperature)
    inp = img[0:10, 0, :, :].squeeze().detach().cpu()
    out = output[0:10, 0, :, :].squeeze().detach().cpu()
    # Just some trick to concatenate first ten images next to each other:
    # (10, 28, 28) -> (28, 10, 28) -> (28, 280)
    inp = inp.permute(1,0,2).reshape(28, -1).numpy()
    out = out.permute(1,0,2).reshape(28, -1).numpy()
    combined = np.vstack([inp, out])
    plt.title("epoch: {}".format(epoch))
    plt.imshow(combined)
    # draw + short pause refreshes the figure without blocking the caller.
    plt.draw()
    plt.pause(0.1)
def calc_position_to_label_mapping(model, alignloader, device, quick, cluster_map):
    """Assign each of the 10 predicted positions to at most one label.

    A "position" is either the argmax index of ``model.get_features`` or,
    when ``cluster_map`` is given, the value returned by
    ``cluster_map.predict``.

    Args:
        model: object exposing ``eval()`` and ``get_features(images)``.
        alignloader: iterable yielding ``(images, labels)`` batches.
        device: device the images are moved to.
        quick: unused here; kept for signature parity with the sibling helpers.
        cluster_map: fitted clustering object with ``predict``, or ``None``
            to use the argmax of the features.

    Returns:
        dict mapping every position 0..9 to its assigned label, or to
        ``None`` when no label was assigned to that position.
    """
    model.eval()
    # key=label, value=dict(key=position, value=count)
    position_counts_by_label = defaultdict(dict)
    labels_by_position = {key: None for key in range(10)}

    # collect counts of each position, by label
    with torch.no_grad():  # inference only
        for images, labels in alignloader:
            images = images.to(device)
            output = model.get_features(images)
            # Explicit None check: do not depend on the truthiness of the
            # clustering object.
            if cluster_map is not None:
                feature_data = output.cpu().detach().numpy()
                preds = cluster_map.predict(feature_data)
            else:
                preds = torch.argmax(output, dim=-1)
            for lab, pred in zip(labels, preds):
                position = int(pred)
                pc = position_counts_by_label[int(lab)]
                pc[position] = pc.get(position, 0) + 1

    # Note: at this point a particular position could be the best predictor
    # of more than one label.  Since each position can only predict a single
    # label, we want the overall position -> label assignment that maximises
    # accuracy.  The algorithm below estimates that assignment:
    #   - normalise all counts by label
    #   - for the remaining labels/positions:
    #       - find the position -> label assignment with the greatest
    #         difference between the top-scoring and second-best position
    #         within a label, across all labels
    #       - record that assignment and remove the position and the label
    #         from the pool
    #   - repeat until every label has been assigned a (unique) position.

    # normalize counts by label
    for pc in position_counts_by_label.values():
        total = sum(pc.values())
        for key in pc:
            pc[key] /= total

    # repeat until done
    remaining_positions = {key: 1 for key in range(10)}
    while position_counts_by_label:
        # find strongest position/label assignment
        best = None
        for label, pc in position_counts_by_label.items():
            if len(pc) == 0:
                # no remaining positions predicted this label
                position = next(iter(remaining_positions))
                best = (label, position, 1)
                break
            if len(pc) == 1:
                # automatic winner
                best = (label, next(iter(pc)), 1)
                break
            # Margin between the best and second-best position for this label.
            pcx = dict(pc)
            key1 = max(pcx, key=pcx.get)
            del pcx[key1]
            key2 = max(pcx, key=pcx.get)
            diff = pc[key1] - pc[key2]
            if best is None or diff > best[2]:
                best = (label, key1, diff)

        # record chosen position/label
        label, position, _ = best
        labels_by_position[position] = label

        # remove position/label from pool
        del position_counts_by_label[label]
        del remaining_positions[position]
        for pc in position_counts_by_label.values():
            if position in pc:
                del pc[position]

    return labels_by_position
def cluster_features(model, alignloader, device, quick):
    """Fit a 10-cluster KMeans on the features of every batch in ``alignloader``.

    Args:
        model: object exposing ``eval()`` and ``get_features(images)``.
        alignloader: iterable yielding ``(images, labels)`` batches; the
            labels are not used.
        device: device the images are moved to.
        quick: unused here; kept for signature parity with the sibling helpers.

    Returns:
        The fitted ``sklearn.cluster.KMeans`` instance.

    Raises:
        ValueError: if ``alignloader`` yields no batches.
    """
    model.eval()
    # Gather per-batch features in a list and stack once at the end;
    # stacking inside the loop re-copies everything collected so far.
    chunks = []
    with torch.no_grad():  # inference only
        for images, _ in alignloader:
            chunks.append(model.get_features(images.to(device)))
    if not chunks:
        raise ValueError("alignloader yielded no batches; nothing to cluster")

    feature_data = torch.vstack(chunks).cpu().detach().numpy()
    kmeans = cluster.KMeans(n_clusters=10)
    kmeans.fit(feature_data)
    return kmeans
def eval_test(model, testloader, device, quick, labels_by_position, cluster_map):
    """Print and return classification accuracy under a position -> label mapping.

    Args:
        model: object exposing ``get_features(images)``.
        testloader: iterable yielding ``(images, labels)`` batches and
            exposing a ``dataset`` attribute that supports ``len()``.
        device: device the images are moved to.
        quick: selects the "Estimated" (true) or "Total" (false) label in
            the printed summary.
        labels_by_position: dict mapping a predicted position to a label.
        cluster_map: fitted clustering object with ``predict``, or ``None``
            to use the argmax of the features as the position.

    Returns:
        Fraction of samples whose mapped prediction equals the true label;
        0.0 when the loader yields no samples.
    """
    correct = 0
    samples = 0
    with torch.no_grad():  # inference only
        for images, labels in testloader:
            images = images.to(device)
            output = model.get_features(images)
            if cluster_map is not None:
                feature_data = output.cpu().detach().numpy()
                preds = cluster_map.predict(feature_data)
            else:
                preds = torch.argmax(output, dim=-1)
            for lab, pred in zip(labels, preds):
                if int(lab) == labels_by_position[int(pred)]:
                    correct += 1
                samples += 1

    print("labels_by_position: {}".format(list(labels_by_position.values())))
    # Guard against an empty loader instead of dividing by zero.
    test_acc = correct/samples if samples else 0.0
    name = "Estimated" if quick else "Total"
    print("{} test acc: {:.4f} (samples: {:,})".format(name, test_acc, len(testloader.dataset)))
    return test_acc
def evaluate(model, testloader, device, quick, use_clustering):
    """Fit the position -> label mapping on ``testloader`` and score the model on it.

    Args:
        model: model exposing ``get_features``.
        testloader: loader used for clustering, for the mapping and for
            scoring.
        device: device the batches are moved to.
        quick: forwarded to the helpers; ``eval_test`` uses it to label the
            printed result "Estimated" (true) or "Total" (false).
        use_clustering: when true, positions come from a KMeans fitted on
            the features; otherwise from the argmax of the features.

    Returns:
        Whatever ``eval_test`` returns.
    """
    # NOTE(review): the clusters and the mapping are fitted on the same
    # loader that is then scored, so the reported accuracy is not held out.
    if use_clustering:
        cluster_map = cluster_features(model, testloader, device, quick)
    else:
        cluster_map = None
    # Forward the caller's ``quick`` flag so a full run is reported as "Total".
    labels_by_position = calc_position_to_label_mapping(model, testloader, device, quick, cluster_map)
    return eval_test(model, testloader, device, quick, labels_by_position, cluster_map)
def train():
    """Train an autoencoder on MNIST, evaluating after every epoch.

    After each epoch the model is scored on the first ``test_samples`` test
    images; after the last epoch it is scored on the full test set.  The
    function then waits for RETURN on stdin before returning.
    """
    num_epochs = 25
    batch_size = 64
    test_samples = 1000

    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,))
    ])
    trainset = tv.datasets.MNIST(root='d:/.data/mnist', train=True, download=True, transform=transform)
    testset = tv.datasets.MNIST(root='d:/.data/mnist', train=False, download=True, transform=transform)
    # Fixed prefix of the test set used for the per-epoch estimate.
    indexes = list(range(test_samples))
    quick_testset = torch.utils.data.Subset(testset, indexes)
    trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True)
    quick_testloader = torch.utils.data.DataLoader(quick_testset, batch_size=batch_size, shuffle=True)
    testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size, shuffle=True)

    # Fall back to the CPU when no CUDA device is present instead of failing.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Experiment switches.
    use_orig_ae = False
    use_softmax = False
    use_clustering = True
    # The bottleneck is wider when clustering is used to obtain positions.
    d_hidden = 32 if use_clustering else 10
    plot_before_after_digits = True

    if use_orig_ae:
        model = Autoencoder(d_hidden=d_hidden, use_softmax=use_softmax).to(device)
    else:
        model = NewAutoencoder(d_hidden=d_hidden, use_softmax=use_softmax).to(device)

    temperature = 1
    distance = nn.MSELoss()
    # Alternatives that were tried: nn.L1Loss(); SGD(lr=.05); Adadelta(lr=1).
    optimizer = torch.optim.Adam(model.parameters(), lr=.001)

    for epoch in range(num_epochs):
        # evaluate()/show_samples() leave the model in eval mode, so training
        # mode is re-enabled at the start of every epoch.
        model.train()
        for img, _ in trainloader:
            optimizer.zero_grad()
            img = img.to(device)
            output = model(img, temperature)
            loss = distance(output, img)
            loss.backward()
            optimizer.step()

        # Loss of the last mini-batch of the epoch.
        print('epoch [{}/{}], loss: {:.4f}'.format(epoch+1, num_epochs, loss.item()))
        evaluate(model, quick_testloader, device, True, use_clustering)
        print()

        if plot_before_after_digits:
            show_samples(model, trainloader, device, epoch+1, temperature)

        # Anneal the softmax temperature by 10% per epoch.
        temperature = .9*temperature

    # after training, do final (and full) eval
    evaluate(model, testloader, device, False, use_clustering)
    _ = input("hit RETURN to dismiss plot and end program")


if __name__ == "__main__":
    train()
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.