如何使用 Keras 實現 CNN-LSTM

Question

我正在嘗試實現一個 CNN-LSTM，該 CNN-LSTM 對代表帕金森病/健康控制患者的語音的梅爾譜圖圖像進行分類。 我正在嘗試使用 LSTM model 實現預先存在的 model (DenseNet-169)，但是我遇到了以下錯誤： ValueError: Input 0 of layer zero_padding2d is incompatible with the layer: expected ndim=4, found ndim=3. Full shape received: [None, 216, 1]. ValueError: Input 0 of layer zero_padding2d is incompatible with the layer: expected ndim=4, found ndim=3. Full shape received: [None, 216, 1]. 誰能告訴我哪里出錯了？

import librosa
import os
import glob
import IPython.display as ipd
from pathlib import Path
import timeit
import time, sys

%matplotlib inline
import matplotlib.pyplot as plt
import librosa.display

import pandas as pd
from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
import numpy as np
import cv2
import seaborn as sns

%tensorflow_version 1.x #version 1 works without problems
import tensorflow

from tensorflow.keras import models
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import TimeDistributed

import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dropout, Dense, BatchNormalization, Activation, GaussianNoise, LSTM
from sklearn.metrics import accuracy_score

DATA_DIR = Path('/content/drive/MyDrive/PhD_Project_Experiments/Spontaneous_Dialogue_PD_Dataset') 
diagnosis = [x.name for x in DATA_DIR.glob('*') if x.is_dir()]
diagnosis

def create_paths_ds(paths: Path, label: str) -> list:
    EXTENSION_TYPE = '.wav'
    return [(x, label) for x in paths.glob('*' + EXTENSION_TYPE)]

from collections import Counter

categories_to_use = [
    'Parkinsons_Disease',
    'Healthy_Control',
]

NUM_CLASSES = len(categories_to_use)

print(f'Number of classes: {NUM_CLASSES}')

paths_all_labels = []
for cat in categories_to_use:
    paths_all_labels += create_paths_ds(DATA_DIR / cat, cat)
 
X_train, X_test = train_test_split(paths_all_labels,test_size=0.1, stratify = [paths_all_labels[y][1] for y in range(len(paths_all_labels))] ) #fix stratified sampling for test data
X_train, X_val = train_test_split(X_train, test_size=0.2, stratify = [X_train[y][1] for y in range(len(X_train))] ) 

for i in categories_to_use:
  print('Number of train samples for '+i+': '+ str([X_train[y][1] for y in range(len(X_train))].count(i))) #checks whether train samples are equally divided
  print('Number of test samples for '+i+': '+ str([X_test[y][1] for y in range(len(X_test))].count(i))) #checks whether test samples are equally divided
  print('Number of validation samples for '+i+': '+ str([X_val[y][1] for y in range(len(X_val))].count(i))) #checks whether val samples are equally divided

print(f'Train length: {len(X_train)}')
print(f'Validation length: {len(X_val)}')
print(f'Test length: {len(X_test)}')

def load_and_preprocess_lstm(dataset, SAMPLE_SIZE = 30):
    IMG_SIZE = (216,128) 
    progress=0

    data = []
    labels = []
    for (path, label) in dataset:
        audio, sr = librosa.load(path)
        dur = librosa.get_duration(audio, sr = sr)
        sampleNum = int(dur / SAMPLE_SIZE)
        offset = (dur % SAMPLE_SIZE) / 2
        for i in range(sampleNum):
            audio, sr = librosa.load(path, offset= offset+i, duration=SAMPLE_SIZE)
            sample = librosa.feature.melspectrogram(audio, sr=sr)
            # print(sample.shape)
            sample = cv2.resize(sample, dsize=IMG_SIZE)
            sample = np.expand_dims(sample,-1)
            print(sample.shape)
            data += [(sample, label)]
            labels += [label]
        progress +=1
        print('\r Progress: '+str(round(100*progress/len(dataset))) + '%', end='')
    return data, labels

def retrieve_samples(sample_size, model_type):

    if model_type == 'cnn':
  
        print("\nLoading train samples")
        X_train_samples, train_labels = load_and_preprocess_cnn(X_train,sample_size)
        print("\nLoading test samples")
        X_test_samples, test_labels = load_and_preprocess_cnn(X_test,sample_size)
        print("\nLoading val samples")
        X_val_samples, val_labels = load_and_preprocess_cnn(X_val,sample_size)
        print('\n')

    elif model_type == 'lstm':

        print("\nLoading train samples")
        X_train_samples, train_labels = load_and_preprocess_lstm(X_train,sample_size)
        print("\nLoading test samples")
        X_test_samples, test_labels = load_and_preprocess_lstm(X_test,sample_size)
        print("\nLoading val samples")
        X_val_samples, val_labels = load_and_preprocess_lstm(X_val,sample_size)      
        print('\n')

    elif model_type == "cnnlstm":

        print("\nLoading train samples")
        X_train_samples, train_labels = load_and_preprocess_lstm(X_train,sample_size)
        print("\nLoading test samples")
        X_test_samples, test_labels = load_and_preprocess_lstm(X_test,sample_size)
        print("\nLoading val samples")
        X_val_samples, val_labels = load_and_preprocess_lstm(X_val,sample_size)      
        print('\n')

    print("shape: " + str(X_train_samples[0][0].shape))
    print("number of training samples: "+ str(len(X_train_samples)))
    print("number of validation samples: "+ str(len(X_val_samples)))
    print("number of test samples: "+ str(len(X_test_samples)))


    return X_train_samples, X_test_samples, X_val_samples

def create_cnn_lstm_model(input_shape):

    model = Sequential()
    cnn = tensorflow.keras.applications.DenseNet169(include_top=True, weights=None, input_tensor=None, input_shape=input_shape, pooling=None, classes=2)
    # define LSTM model
    model.add(tensorflow.keras.layers.TimeDistributed(cnn, input_shape=input_shape))
    model.add(LSTM(units = 512, dropout=0.5, recurrent_dropout=0.3, return_sequences = True, input_shape = input_shape))
    model.add(LSTM(units = 512, dropout=0.5, recurrent_dropout=0.3, return_sequences = False))
    model.add(Dense(units=NUM_CLASSES, activation='sigmoid'))#Compile

    model.compile(loss=tensorflow.keras.losses.binary_crossentropy, optimizer='adam', metrics=['accuracy'])
    print(model.summary())

    return model

def create_model_data_and_labels(X_train_samples, X_val_samples, X_test_samples):
    #Prepare samples to work for training the model
    labelizer = LabelEncoder()

    #prepare training data and labels
    x_train = np.array([x[0] for x in X_train_samples])
    y_train = np.array([x[1] for x in X_train_samples])
    y_train = labelizer.fit_transform(y_train) 
    y_train = to_categorical(y_train)

    #prepare validation data and labels
    x_val = np.array([x[0] for x in X_val_samples])
    y_val = np.array([x[1] for x in X_val_samples])
    y_val = labelizer.transform(y_val)
    y_val = to_categorical(y_val)

    #prepare test data and labels
    x_test = np.array([x[0] for x in X_test_samples])
    y_test = np.array([x[1] for x in X_test_samples])
    y_test = labelizer.transform(y_test)
    y_test = to_categorical(y_test)

    return x_train, y_train, x_val, y_val, x_test, y_test, labelizer


#Main loop for testing multiple sample sizes

#choose model type: 'cnn' or 'lstm'
model_type = 'cnnlstm'

n_epochs = 20
patience= 20
es = EarlyStopping(patience=20)
fragment_sizes = [5,10]
start = timeit.default_timer()

ModelData = pd.DataFrame(columns = ['Model Type','Fragment size (s)', 'Time to Compute (s)',  'Early Stopping epoch', 'Training accuracy', 'Validation accuracy', 'Test Accuracy']) #create a DataFrame for storing the results 

conf_matrix_data = []

for i in fragment_sizes:

    start_per_size = timeit.default_timer()

    print(f'\n---------- Model trained on fragments of size: {i} seconds ----------------')
    X_train_samples, X_test_samples, X_val_samples = retrieve_samples(i,model_type)
    x_train, y_train, x_val, y_val, x_test, y_test, labelizer = create_model_data_and_labels(X_train_samples, X_val_samples, X_test_samples)

    if model_type == 'cnn':
        model = create_cnn_model(X_train_samples[0][0].shape)
    elif model_type == 'lstm':
        model = create_lstm_model(X_train_samples[0][0].shape)
    elif model_type == 'cnnlstm':
        model = create_cnn_lstm_model(X_train_samples[0][0].shape)


    history = model.fit(x_train, y_train, 
              batch_size = 8, 
              epochs=n_epochs,
              verbose=1, 
              callbacks=[es],
              validation_data=(x_val, y_val))
    print('Finished training')


    early_stopping_epoch = len(history.history['accuracy'])
    training_accuracy = history.history['accuracy'][early_stopping_epoch-1-patience]
    validation_accuracy = history.history['val_accuracy'][early_stopping_epoch-1-patience]

    plot_data(history, i)

    predictions = model.predict(x_test)
    score = accuracy_score(labelizer.inverse_transform(y_test.argmax(axis=1)), labelizer.inverse_transform(predictions.argmax(axis=1)))

    print('Fragment size = ' + str(i) + ' seconds')
    print('Accuracy on test samples: ' + str(score))
    
    conf_matrix_data += [(predictions, y_test, i)]

    stop_per_size = timeit.default_timer()
    time_to_compute = round(stop_per_size - start_per_size)

    print ('Time to compute: '+str(time_to_compute))

    ModelData.loc[len(ModelData)] = [model_type, i, time_to_compute, early_stopping_epoch, training_accuracy, validation_accuracy, score] #store particular settings configuration, early stoppping epoch and accuracies in dataframe

stop = timeit.default_timer()
print ('\ntime to compute: '+str(stop-start))

Answer 1

我相信 input_shape 是 (128, 216, 1)

這里的問題是您沒有時間軸來時間分布您的 CNN (DenseNet169) 層。

在這一步 -

tensorflow.keras.layers.TimeDistributed(cnn, input_shape=(128,216,1)))

您將 128 維軸作為時間軸傳遞。 這意味着每個 CNN (DenseNet169) 的輸入形狀為(216,1) ，這不是圖像，因此會引發錯誤，因為它期望 3D 張量（圖像）而不是 2D 張量。

您的輸入形狀需要是 4D 張量，例如 - (10, 128, 216, 1) ，因此10成為時間軸（用於時間分布），並且(128, 216, 1)成為圖像輸入CNN (DenseNet169)。

具有不規則張量和時間分布層的解決方案

IIUC，您的數據包含 n 個音頻文件，每個文件包含可變數量的梅爾譜圖圖像。

您需要使用tf.raggedtensors才能使用可變張量形狀作為 model 的輸入
這需要明確定義輸入層，您在其中設置ragged=True
這允許您將每個音頻文件作為單個樣本傳遞，其中包含可變圖像，每個圖像都將是時間分布的。
在定義 model 時，您必須使用None作為時間分布軸形狀

1. 創建一個虛擬數據集

讓我們從一個示例數據集開始 -

import tensorflow as tf
from tensorflow.keras import layers, Model, utils, applications


#Assuming there are 5 audio files
num_audio = 5

data = []

#Create a random number of mel-spectrograms for each audio file
for i in range(num_audio):
    n_images = np.random.randint(4,10)
    data.append(np.random.random((n_images,128,216,1)))
    
print([i.shape for i in data])

[(5, 128, 216, 1), 
 (5, 128, 216, 1), 
 (9, 128, 216, 1), 
 (6, 128, 216, 1), 
 (4, 128, 216, 1)]

因此，您的數據應該看起來像這樣。 在這里，我有一個包含 5 個音頻文件的虛擬數據集，第一個有 5 個形狀(128,216,1)的圖像，而最后一個有 4 個相同形狀的圖像。

2. 將它們轉換為不規則張量

接下來，讓我們轉換並存儲這些參差不齊的張量。 參差不齊的張量允許存儲可變長度的對象，在這種情況下，可以存儲可變數量的圖像。 在此處閱讀有關它們的更多信息。

#Convert each set of images (for each audio) to tensors and then a ragged tensor
tensors = [tensorflow.convert_to_tensor(i) for i in data]
X_train = tensorflow.ragged.stack(tensors).to_tensor()

#Creating dummy y_train, one for each audio files
y_train = tensorflow.convert_to_tensor(np.random.randint(0,2,(5,2)))

3.創建model

我正在使用functional API ，因為我發現它更具可讀性並且與顯式輸入層配合得更好，但是您也可以在Sequential API中使用輸入層。 隨意將其轉換為您的偏好。

請注意，我使用(None,128,216,1)作為輸入形狀。 這將創建 5 個通道（第一個用於批次的隱式通道）作為 - (Batch, audio_files, h, w, channels)

我有一個虛擬 LSTM 層來展示架構的工作原理，隨意堆疊更多層。 另外，請注意，您的DenseNet169僅返回 2 個特征。 因此，您的TimeDistributed層返回(None, None, 2)形狀的張量，其中第一個None是音頻文件的數量，第二個None是圖像的數量（時間軸）。 因此，請相應地選擇下一層，因為 512 個 LSTM 單元可能太多了:)

#Create model
inp = layers.Input((None,128,216,1), ragged=True)

cnn = tensorflow.keras.applications.DenseNet169(include_top=True, 
                                                weights=None, 
                                                input_tensor=None, 
                                                input_shape=(128,216,1), #<----- input shape for cnn is just the image
                                                pooling=None, classes=2)


#Feel free to modify these layers!
x = layers.TimeDistributed(cnn)(inp)
x = layers.LSTM(8)(x)
out = layers.Dense(2)(x)

model = Model(inp, out)
model.compile(loss='binary_crossentropy', 
              optimizer='adam', 
              metrics='accuracy')

utils.plot_model(model, show_shapes=True, show_layer_names=False)

4. 訓練！

下一步就是簡單地訓練。 隨意添加您自己的參數。

model.fit(X_train, y_train, epochs=2)

Epoch 1/2
WARNING:tensorflow:5 out of the last 5 calls to <function Model.make_train_function.<locals>.train_function at 0x7f8e55b4fe50> triggered tf.function retracing. Tracing is expensive and the excessive number of tracings could be due to (1) creating @tf.function repeatedly in a loop, (2) passing tensors with different shapes, (3) passing Python objects instead of tensors. For (1), please define your @tf.function outside of the loop. For (2), @tf.function has experimental_relax_shapes=True option that relaxes argument shapes that can avoid unnecessary retracing. For (3), please refer to https://www.tensorflow.org/guide/function#controlling_retracing and https://www.tensorflow.org/api_docs/python/tf/function for  more details.
1/1 [==============================] - 37s 37s/step - loss: 3.4057 - accuracy: 0.4000
Epoch 2/2
1/1 [==============================] - 16s 16s/step - loss: 3.3544 - accuracy: 0.4000

希望有幫助。

如何使用 Keras 實現 CNN-LSTM

問題描述

1 個解決方案

解決方案1
2 已采納 2021-03-09 11:33:15

具有不規則張量和時間分布層的解決方案

1. 創建一個虛擬數據集

2. 將它們轉換為不規則張量

3.創建model

4. 訓練！

如何使用 Keras 實現 CNN-LSTM

問題描述

1 個解決方案

解決方案1 2 已采納 2021-03-09 11:33:15

具有不規則張量和時間分布層的解決方案

1. 創建一個虛擬數據集

2. 將它們轉換為不規則張量

3.創建model

4. 訓練！

解決方案1
2 已采納 2021-03-09 11:33:15