
TensorFlow time series classification with metadata: preprocessing layers and dataset integration

I am working on a neural net for time series classification with metadata.

My problem is that I have a data frame with columns of different datatypes, which all need to be preprocessed. In doing so I generate a lot of different input layers, each with its own name. But in my tf.data.Dataset the data frame is stored as a whole, so the dictionary I pass to model.fit does not provide one entry per named input.

Do you have any idea how I can get model.fit to accept the data I have, and maybe even end up with a nicer software architecture?

import numpy as np
import pandas as pd
import tensorflow as tf


def get_normalization_layer(
    dataset, input_name, all_inputs, encoded_features, axis=None
):
    normalization_layer = tf.keras.layers.Normalization(axis=axis)
    feature_ds = dataset.map(lambda x, y: x[input_name])
    normalization_layer.adapt(feature_ds)
    input_layer = tf.keras.Input(shape=(1,), name=input_name, dtype=tf.float32)
    normalization_layer = normalization_layer(input_layer)
    all_inputs.append(input_layer)
    encoded_features.append(normalization_layer)


def get_category_encoding_layer(
    dataset,
    input_name,
    dtype,
    all_inputs,
    encoded_features,
    vocabulary=None,
    max_tokens=None,
):
    if dtype == "string":
        index = tf.keras.layers.StringLookup(
            max_tokens=max_tokens, vocabulary=vocabulary
        )
        input_layer = tf.keras.Input(shape=(1,), name=input_name, dtype=tf.string)
    elif dtype == "int":
        index = tf.keras.layers.IntegerLookup(
            max_tokens=max_tokens, vocabulary=vocabulary
        )
        input_layer = tf.keras.Input(shape=(1,), name=input_name, dtype=tf.int64)
    if vocabulary is None:
        feature_ds = dataset.map(lambda x, y: x[input_name])
        index.adapt(feature_ds)
    encoder = tf.keras.layers.CategoryEncoding(num_tokens=index.vocabulary_size())
    encoder = encoder(index(input_layer))
    all_inputs.append(input_layer)
    encoded_features.append(encoder)


################################################################################

measurement_data = np.random.rand(150, 11, 2400)

meta_data = pd.DataFrame(
    data={
        "Product": ["A", "B", "C", "D"],
        "Length": [23, 22, 21, 24],
        "Width": [11.2, 23.4, 57.35, 0],
        "Labels": [0, 0, 1, 0],
    }
)

################################################################################

dataframe = meta_data.copy()
labels = dataframe.pop("Labels")
dataframe = {key: np.array(value)[:, tf.newaxis] for key, value in dataframe.items()}

dataset_measurement = tf.data.Dataset.from_tensor_slices(measurement_data)
dataset_meta = tf.data.Dataset.from_tensor_slices((dataframe, labels))

################################################################################

all_inputs = []
encoded_features = []

normalization_layer = tf.keras.layers.Normalization(axis=1)
feature_ds = dataset_measurement.map(lambda x: x)
normalization_layer.adapt(feature_ds)
input_layer = tf.keras.Input(
    shape=list(feature_ds)[0].shape,
    name="measurement_input",
    dtype=tf.float32,
)
normalization_layer = normalization_layer(input_layer)
all_inputs.append(input_layer)
encoded_features.append(normalization_layer)

get_category_encoding_layer(
    dataset_meta, "Product", "string", all_inputs, encoded_features
)
get_category_encoding_layer(dataset_meta, "Length", "int", all_inputs, encoded_features)
get_normalization_layer(dataset_meta, "Width", all_inputs, encoded_features)

dataset = tf.data.Dataset.zip((dataset_measurement, dataset_meta))
dataset = dataset.map(
    lambda x, y: ({"measurement_input": x, "meta_input": y[:-1]}, y[-1])
)

dataset = dataset.batch(64)
dataset = dataset.prefetch(64)

################################################################################

conv1 = tf.keras.layers.Conv1D(filters=64, kernel_size=3, padding="same")(
    encoded_features[0]
)
conv1 = tf.keras.layers.BatchNormalization()(conv1)
conv1 = tf.keras.layers.ReLU()(conv1)
gap = tf.keras.layers.GlobalAveragePooling1D()(conv1)

all_features = tf.keras.layers.concatenate(encoded_features[1:])
x1 = tf.keras.layers.Dense(128, activation="relu")(all_features)
x1 = tf.keras.layers.Dropout(0.5)(x1)

meta_and_measurement = tf.keras.layers.concatenate([gap, x1])
f1 = tf.keras.layers.Dense(128, activation="relu")(meta_and_measurement)
f1 = tf.keras.layers.Dropout(0.5)(f1)

output_layer = tf.keras.layers.Dense(2, activation="softmax")(f1)

################################################################################

model = tf.keras.models.Model(inputs=all_inputs, outputs=output_layer)

model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["sparse_categorical_accuracy"],
)

# ValueError: Missing data for input "Product".
# You passed a data dictionary with keys ['measurement_input', 'meta_input'].
# Expected the following keys: ['measurement_input', 'Product', 'Length', 'Width']
history = model.fit(
    dataset,
    epochs=50,
    verbose=1,
)

I found an answer, but it feels rather hacky, and it would be lovely if someone had a better idea. You can use a mapping function that strips apart and recombines the dataset into the structure TensorFlow expects:

import numpy as np
import pandas as pd
import tensorflow as tf
import os


def get_normalization_layer(
    dataset, input_name, all_inputs, encoded_features, axis=None
):
    normalization_layer = tf.keras.layers.Normalization(axis=axis)
    feature_ds = dataset.map(lambda x, y: x[input_name])
    normalization_layer.adapt(feature_ds)
    input_layer = tf.keras.Input(shape=(1,), name=input_name, dtype=tf.float32)
    normalization_layer = normalization_layer(input_layer)
    all_inputs.append(input_layer)
    encoded_features.append(normalization_layer)


def get_category_encoding_layer(
    dataset,
    input_name,
    dtype,
    all_inputs,
    encoded_features,
    vocabulary=None,
    max_tokens=None,
):
    if dtype == "string":
        index = tf.keras.layers.StringLookup(
            max_tokens=max_tokens, vocabulary=vocabulary
        )
        input_layer = tf.keras.Input(shape=(1,), name=input_name, dtype=tf.string)
    elif dtype == "int":
        index = tf.keras.layers.IntegerLookup(
            max_tokens=max_tokens, vocabulary=vocabulary
        )
        input_layer = tf.keras.Input(shape=(1,), name=input_name, dtype=tf.int64)
    if vocabulary is None:
        feature_ds = dataset.map(lambda x, y: x[input_name])
        index.adapt(feature_ds)
    encoder = tf.keras.layers.CategoryEncoding(num_tokens=index.vocabulary_size())
    encoder = encoder(index(input_layer))
    all_inputs.append(input_layer)
    encoded_features.append(encoder)


################################################################################

measurement_data = np.random.rand(150, 11, 2400)

meta_data = pd.DataFrame(
    data={
        "Product": ["A", "A", "C", "D"],
        "Length": [23, 22, 21, 24],
        "Width": [11.2, 23.4, 57.35, 0],
        "Labels": [0, 0, 1, 0],
    }
)

################################################################################

dataframe = meta_data.copy()
labels = dataframe.pop("Labels")
dataframe = {key: np.array(value)[:, tf.newaxis] for key, value in dataframe.items()}

dataset_measurement = tf.data.Dataset.from_tensor_slices(measurement_data)
dataset_meta = tf.data.Dataset.from_tensor_slices((dataframe, labels))

################################################################################

all_inputs = []
encoded_features = []

normalization_layer = tf.keras.layers.Normalization(axis=1)
feature_ds = dataset_measurement.map(lambda x: x)
normalization_layer.adapt(feature_ds)
input_layer = tf.keras.Input(
    shape=list(feature_ds)[0].shape,
    name="measurement_input",
    dtype=tf.float32,
)
normalization_layer = normalization_layer(input_layer)
all_inputs.append(input_layer)
encoded_features.append(normalization_layer)

get_category_encoding_layer(
    dataset_meta, "Product", "string", all_inputs, encoded_features
)
get_category_encoding_layer(dataset_meta, "Length", "int", all_inputs, encoded_features)
get_normalization_layer(dataset_meta, "Width", all_inputs, encoded_features)

dataset = tf.data.Dataset.zip((dataset_measurement, dataset_meta))


def map_func(x, y):
    # x is the measurement tensor; y is the (feature_dict, label) tuple coming
    # from dataset_meta. Merge the metadata features with the measurement input
    # under the names the model's Input layers expect.
    meta_features, label = y
    features = {"measurement_input": x}
    features.update(meta_features)
    return features, label


data_set = dataset.map(map_func)

data_set = data_set.batch(64)
data_set = data_set.prefetch(64)

################################################################################

conv1 = tf.keras.layers.Conv1D(filters=64, kernel_size=3, padding="same")(
    encoded_features[0]
)
conv1 = tf.keras.layers.BatchNormalization()(conv1)
conv1 = tf.keras.layers.ReLU()(conv1)
gap = tf.keras.layers.GlobalAveragePooling1D()(conv1)

all_features = tf.keras.layers.concatenate(encoded_features[1:])
x1 = tf.keras.layers.Dense(128, activation="relu")(all_features)
x1 = tf.keras.layers.Dropout(0.5)(x1)

meta_and_measurement = tf.keras.layers.concatenate([gap, x1])
f1 = tf.keras.layers.Dense(128, activation="relu")(meta_and_measurement)
f1 = tf.keras.layers.Dropout(0.5)(f1)

output_layer = tf.keras.layers.Dense(2, activation="softmax")(f1)

################################################################################

model = tf.keras.models.Model(inputs=all_inputs, outputs=output_layer)

model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["sparse_categorical_accuracy"],
)

tf.keras.utils.plot_model(
    model,
    to_file=os.path.join(os.getcwd(), "model.png"),
    show_shapes=True,
    show_dtype=True,
)

history = model.fit(
    data_set,
    epochs=50,
    verbose=1,
)
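
A possibly less hacky variant (just a sketch, reusing the variables defined above) would be to skip the zip and map step entirely and build a single dataset from a feature dictionary whose keys already match the input-layer names:

# Alternative sketch: build one dataset directly from a feature dict whose keys
# already match the Input names, so no re-mapping of a zipped dataset is needed.
# measurement_data is sliced to the number of labelled rows in this toy example.
features = {"measurement_input": measurement_data[: len(labels)]}
features.update(dataframe)  # dataframe is already {column: np.array[:, None]}

alt_dataset = tf.data.Dataset.from_tensor_slices((features, labels))
alt_dataset = alt_dataset.batch(64).prefetch(tf.data.AUTOTUNE)

# alt_dataset can then be passed to model.fit in place of data_set.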
