[英]Inference time varies over different GPUs using Torch
运行以下推理代码时出现错误。 在函数recognize()中,完成预测需要0.4s。 将结果preds_str
返回给调用者函数还需要 3s。 我发现如果我在文件配置中设置gpu_id=0
,它会立即返回。 我该如何修复这个错误? 提前致谢。
def recognize(imgs, model, demo_loader):
t = time()
model.eval()
with torch.no_grad():
for image_tensors, image_path_list in demo_loader:
batch_size = image_tensors.size(0)
image = image_tensors.to(config.device)
# For max length prediction
length_for_pred = torch.IntTensor([config.batch_max_length] * batch_size).to(config.device)
text_for_pred = torch.LongTensor(batch_size, config.batch_max_length + 1).fill_(0).to(config.device)
preds = model(image, text_for_pred, is_train=False)
_, preds_index = preds.max(2)
preds_str = converter.decode(preds_index, length_for_pred)
print('time elapsed before return:'time()-t) #0.4s
return preds_str
def main():
model = Model()
self.model.cuda(config.device)
model = torch.nn.DataParallel(model, device_ids=[config.device], output_device=[config.device]).to(config.device)
model.load_state_dict(torch.load(config.saved_model, map_location=config.device))
AlignCollate_demo = AlignCollate(imgH=config.imgH, imgW=config.imgW, keep_ratio_with_pad=config.PAD)
imgs_dataset = ImageDataset(imgs)
demo_loader = torch.utils.data.DataLoader(imgs_dataset, batch_size=config.batch_size,shuffle=False,num_workers=int(config.workers),collate_fn=AlignCollate_demo, pin_memory=True)
start_time = time()
# imgs = [img1, img2, ....]
preds_str = recognize(imgs, model, demo_loader)
print('time elapsed after return', time()-start_time) #3.4s
配置文件:
class ConfigWordRecognizer:
gpu_id = 1 #troublesome line here
device = torch.device('cuda:{}'.format(gpu_id) if torch.cuda.is_available() else 'cpu')
imgH = 32
imgW = 100
batch_size = 80
workers = 8
batch_max_length = 25
我从这篇文章中找到了解决方案。 我设置了CUDA_VISIBLE_DEVICES=1
, gpu_id=0
。 然后,我删除
model = torch.nn.DataParallel(model, device_ids=[config.device], output_device=[config.device]).to(config.device)
和改变
model.load_state_dict(torch.load(config.saved_model, map_location=config.device))
到
model.load_state_dict(self.copyStateDict(torch.load(self.config.saved_model, map_location=self.config.device)))
复制 stateDict 函数:
def copyStateDict(self, state_dict):
if list(state_dict.keys())[0].startswith("module"):
start_idx = 1
else:
start_idx = 0
new_state_dict = OrderedDict()
for k, v in state_dict.items():
name = ".".join(k.split(".")[start_idx:])
new_state_dict[name] = v
return new_state_dict
该模型在gpu1
上运行良好。 但我仍然不明白为什么如果我设置了“gpu_id=0”,它在没有copyStateDict
情况下在gpu0
上gpu0
copyStateDict
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.