![](/img/trans.png)
[英]Combine Pytorch ImageFolder dataset with custom Pytorch dataset
[英]Pytorch custom dataset is super slow
在訓練期間,加載一批數據需要很長時間。 什么會導致這個問題? 我是 Pytorch 的新手,我一直在使用 tensorflow,這是我第一次嘗試創建這樣的東西。 我編寫了一個自定義數據集,它從文件夾中獲取圖像,它存儲在 dataframe 中,它將被拆分為訓練驗證集。
class CustomDataset(torch.utils.data.Dataset):
    """Image dataset built from a directory tree where each immediate
    sub-folder name is the class label.

    The tree is scanned once into a DataFrame of (path, label) rows,
    sorted deterministically by path, and sliced into train/val/test
    partitions according to the given ratios, so all three splits built
    from the same root are disjoint and reproducible.
    """

    def __init__(self, root, split, train_ratio=0.85, val_ratio=0.1, transform=None):
        """
        Args:
            root: directory whose sub-folders hold the images; the
                sub-folder name is used as the label.
            split: one of 'train', 'val' or 'test'.
            train_ratio: fraction of the sorted data used for training.
            val_ratio: fraction used for validation; the remainder
                (1 - train_ratio - val_ratio) is the test set.
            transform: optional callable applied to each loaded image.
        """
        self.root = root
        self.train_ratio = train_ratio
        self.val_ratio = val_ratio
        self.test_ratio = 1 - (self.train_ratio + self.val_ratio)
        df = self.folder2pandas()
        self.split = split
        self.data = self.splitDf(df)
        self.transform = transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Load the idx-th image (BGR ndarray via cv2) and its label."""
        row = self.data.iloc[idx]
        x = cv2.imread(row["x"])
        y = row["y"]
        # BUG FIX: the original tested `self.sourceTransform`, an attribute
        # that is never set anywhere, so every fetch raised AttributeError
        # and the transform passed to __init__ was never applied.
        if self.transform:
            x = self.transform(x)
        return x, y

    def folder2pandas(self):
        """Walk self.root and return a DataFrame with columns
        'x' (absolute file path) and 'y' (immediate parent folder name,
        used as the label)."""
        rows = []
        for folder, _subdirs, files in os.walk(self.root):
            for filename in files:
                path = os.path.abspath(os.path.join(folder, filename))
                # os.path.basename is portable; the original split on '\\'
                # and therefore produced the whole path on non-Windows OSes.
                rows.append((path, os.path.basename(folder)))
        return pd.DataFrame(rows, columns=["x", "y"])

    def splitDf(self, df):
        """Sort df by path for determinism and return the contiguous
        slice selected by self.split.

        Raises:
            ValueError: if self.split is not 'train', 'val' or 'test'
                (the original silently returned None, which only failed
                later inside __len__).
        """
        df = df.sort_values(by=["x"], ascending=True).reset_index(drop=True)
        n_train = int(self.train_ratio * len(df))
        n_val = int(self.val_ratio * len(df))
        if self.split == "train":
            return df.iloc[:n_train]
        if self.split == "val":
            return df.iloc[n_train:n_train + n_val]
        if self.split == "test":
            return df.iloc[n_train + n_val:]
        raise ValueError(f"unknown split: {self.split!r}")
增強:
# Training-time augmentation: resize, then exactly ONE randomly chosen
# augmentation per sample, then a 50% horizontal flip, tensor conversion
# and per-channel normalization.
# NOTE(review): `res` is not defined in this snippet — presumably
# res[0]/res[1] are the per-channel mean/std tensors; confirm upstream.
# NOTE(review): __getitem__ loads images with cv2.imread (numpy ndarray),
# but torchvision transforms like Resize expect a PIL Image or Tensor —
# verify this pipeline actually accepts the ndarray input.
train_transforms = transforms.Compose([
transforms.Resize((224,224)),
transforms.RandomChoice([
transforms.RandomAutocontrast(),
transforms.ColorJitter(brightness=0.3, contrast=0.5, saturation=0.1, hue=0.1),
transforms.GaussianBlur(kernel_size=(5,5), sigma=(0.1, 2.0)),
transforms.Grayscale(num_output_channels=3),
transforms.RandomVerticalFlip(),]),
transforms.RandomHorizontalFlip(0.5),
transforms.ToTensor(),
transforms.Normalize(res[0].numpy(), res[1].numpy()),
])
# Validation pipeline: deterministic — resize + tensor + normalization
# only, no random augmentation.
# NOTE(review): uses the same undefined `res` mean/std as the training
# pipeline; confirm it is defined upstream.
val_transforms = transforms.Compose([
transforms.Resize((224,224)),
transforms.ToTensor(),
transforms.Normalize(res[0].numpy(), res[1].numpy()),
])
初始化數據集:
在“資源”文件夾中有兩個文件夾,名稱代表標簽(BinaryClassification)。
# Build train/val views over the same folder tree; each split carries its
# own transform pipeline.
# NOTE(review): CustomDataset.__init__ stores the callable as
# self.transform, but __getitem__ tests self.sourceTransform — as written,
# the transforms passed here raise AttributeError instead of being applied.
train_set=CustomDataset(root="resources/",split='train', transform=train_transforms)
val_set=CustomDataset(root="resources/",split='val', transform=val_transforms)
將數據集提供給數據加載器:
# Batched loaders; num_workers=4 forks/spawns four worker processes.
# On Windows the spawn start method re-imports the main module per worker,
# which can make loader startup appear very slow (the issue discussed below).
# NOTE(review): the loader over val_set is named `testloader` — confusing
# but harmless.
trainloader = torch.utils.data.DataLoader(train_set, shuffle = True, batch_size=32, num_workers=4)
testloader = torch.utils.data.DataLoader(val_set, shuffle = True, batch_size=32, num_workers=4)
以更簡潔的方式解決評論:
創建幾個工人需要大量時間。 似乎在 windows 上,進程的創建在時間上可能會有奇怪的行為。
由於 `__getitem__()` 尚未被調用,問題不在於數據加載本身;請嘗試移除 `num_workers` 參數。
testloader = torch.utils.data.DataLoader(val_set, shuffle = True, batch_size=32)
然后,如果這有效,請嘗試增加它並檢查行為。
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.