I am working on a neural network for time-series classification with metadata.
My problem is that I have a data frame with columns of different data types, all of which need to be preprocessed. In doing so, I generate many different input layers, each with its own name. But in my tf.data.Dataset
the data frame is stored as a whole, so model.fit
receives a single metadata input instead of the many named inputs the model expects.
Do you have any idea how I can get model.fit
to accept the data I have, and maybe even fit it all into a nicer software architecture?
import numpy as np
import pandas as pd
import tensorflow as tf
def get_normalization_layer(
    dataset, input_name, all_inputs, encoded_features, axis=None
):
    """Register one numeric feature: a named input plus its normalized output.

    A ``Normalization`` layer is adapted on the ``input_name`` feature of
    ``dataset`` and then applied to a new float32 ``Input`` of shape ``(1,)``
    carrying the same name. The input and the normalized tensor are appended
    to ``all_inputs`` and ``encoded_features``; nothing is returned.

    Args:
        dataset: Dataset of ``(features, label)`` elements where ``features``
            can be indexed by ``input_name``.
        input_name: Feature key in ``dataset``; also the name of the ``Input``.
        all_inputs: List that receives the new ``Input`` (mutated in place).
        encoded_features: List that receives the normalized tensor (mutated
            in place).
        axis: Forwarded unchanged to ``tf.keras.layers.Normalization``.
    """
    normalizer = tf.keras.layers.Normalization(axis=axis)
    # Adapt on this single feature only; the label of each element is dropped.
    normalizer.adapt(dataset.map(lambda features, _label: features[input_name]))

    feature_input = tf.keras.Input(shape=(1,), name=input_name, dtype=tf.float32)
    normalized = normalizer(feature_input)

    all_inputs.append(feature_input)
    encoded_features.append(normalized)
def get_category_encoding_layer(
    dataset,
    input_name,
    dtype,
    all_inputs,
    encoded_features,
    vocabulary=None,
    max_tokens=None,
):
    """Register one categorical feature: a named input plus its encoding.

    A lookup layer (``StringLookup`` or ``IntegerLookup``) maps raw values to
    indices, and a ``CategoryEncoding`` layer sized to the lookup's vocabulary
    encodes those indices. The new ``Input`` and the encoded tensor are
    appended to ``all_inputs`` and ``encoded_features``; nothing is returned.

    Args:
        dataset: Dataset of ``(features, label)`` elements where ``features``
            can be indexed by ``input_name``. Only read when ``vocabulary``
            is ``None``.
        input_name: Feature key in ``dataset``; also the name of the ``Input``.
        dtype: ``"string"`` or ``"int"``; selects the lookup layer and the
            dtype of the created ``Input``.
        all_inputs: List that receives the new ``Input`` (mutated in place).
        encoded_features: List that receives the encoded tensor (mutated in
            place).
        vocabulary: Optional fixed vocabulary forwarded to the lookup layer.
            When ``None``, the lookup layer is adapted on ``dataset`` instead.
        max_tokens: Forwarded unchanged to the lookup layer.

    Raises:
        ValueError: If ``dtype`` is neither ``"string"`` nor ``"int"``.
    """
    if dtype == "string":
        lookup_cls, input_dtype = tf.keras.layers.StringLookup, tf.string
    elif dtype == "int":
        lookup_cls, input_dtype = tf.keras.layers.IntegerLookup, tf.int64
    else:
        # Fail early with a clear message instead of hitting an unbound local
        # further down; no layer is created and neither list is modified.
        raise ValueError(
            f"Unsupported dtype {dtype!r} for input {input_name!r}; "
            "expected 'string' or 'int'."
        )

    index = lookup_cls(max_tokens=max_tokens, vocabulary=vocabulary)
    input_layer = tf.keras.Input(shape=(1,), name=input_name, dtype=input_dtype)

    if vocabulary is None:
        # No fixed vocabulary supplied: learn it from this feature's values.
        feature_ds = dataset.map(lambda x, y: x[input_name])
        index.adapt(feature_ds)

    # The encoder width must match the (possibly adapted) vocabulary size.
    encoder = tf.keras.layers.CategoryEncoding(num_tokens=index.vocabulary_size())
    encoded = encoder(index(input_layer))

    all_inputs.append(input_layer)
    encoded_features.append(encoded)
################################################################################
# Toy data: 150 random measurement arrays of shape (11, 2400) and a 4-row
# metadata table whose "Labels" column holds the class label.
measurement_data = np.random.rand(150, 11, 2400)
meta_data = pd.DataFrame(
data={
"Product": ["A", "B", "C", "D"],
"Length": [23, 22, 21, 24],
"Width": [11.2, 23.4, 57.35, 0],
"Labels": [0, 0, 1, 0],
}
)
################################################################################
# Split off the labels and turn every remaining column into an (n, 1) array,
# then build two datasets: measurements alone, and (metadata dict, label) pairs.
dataframe = meta_data.copy()
labels = dataframe.pop("Labels")
dataframe = {key: np.array(value)[:, tf.newaxis] for key, value in dataframe.items()}
dataset_measurement = tf.data.Dataset.from_tensor_slices(measurement_data)
dataset_meta = tf.data.Dataset.from_tensor_slices((dataframe, labels))
################################################################################
# Collect one named Input and one preprocessed tensor per feature.
all_inputs = []
encoded_features = []
# Measurement feature: adapt a Normalization layer on the measurement dataset
# and create the matching "measurement_input" Input.
normalization_layer = tf.keras.layers.Normalization(axis=1)
feature_ds = dataset_measurement.map(lambda x: x)
normalization_layer.adapt(feature_ds)
# The per-sample shape is read from the first element; list(...) iterates the
# whole dataset to get it.
input_layer = tf.keras.Input(
shape=list(feature_ds)[0].shape,
name="measurement_input",
dtype=tf.float32,
)
# From here on normalization_layer names the layer's output tensor.
normalization_layer = normalization_layer(input_layer)
all_inputs.append(input_layer)
encoded_features.append(normalization_layer)
# Metadata features: each helper call appends one Input named after the column
# ("Product", "Length", "Width") and its encoded output to the two lists.
get_category_encoding_layer(
dataset_meta, "Product", "string", all_inputs, encoded_features
)
get_category_encoding_layer(dataset_meta, "Length", "int", all_inputs, encoded_features)
get_normalization_layer(dataset_meta, "Width", all_inputs, encoded_features)
# Zipped elements have the structure (measurement, (metadata dict, label)).
dataset = tf.data.Dataset.zip((dataset_measurement, dataset_meta))
# NOTE: this packs all metadata under the single key "meta_input" (y[:-1] is a
# 1-tuple holding the metadata dict), while the model's inputs are named
# "Product", "Length" and "Width". That key mismatch is what model.fit reports
# in the ValueError quoted further down.
dataset = dataset.map(
lambda x, y: ({"measurement_input": x, "meta_input": y[:-1]}, y[-1])
)
dataset = dataset.batch(64)
dataset = dataset.prefetch(64)
################################################################################
# Measurement branch: Conv1D -> BatchNormalization -> ReLU -> global average
# pooling, applied to the normalized measurement (encoded_features[0]).
conv1 = tf.keras.layers.Conv1D(filters=64, kernel_size=3, padding="same")(
encoded_features[0]
)
conv1 = tf.keras.layers.BatchNormalization()(conv1)
conv1 = tf.keras.layers.ReLU()(conv1)
gap = tf.keras.layers.GlobalAveragePooling1D()(conv1)
# Metadata branch: concatenate the remaining encoded features, then Dense and
# Dropout.
all_features = tf.keras.layers.concatenate(encoded_features[1:])
x1 = tf.keras.layers.Dense(128, activation="relu")(all_features)
x1 = tf.keras.layers.Dropout(0.5)(x1)
# Merge both branches and classify with a 2-unit softmax.
meta_and_measurement = tf.keras.layers.concatenate([gap, x1])
f1 = tf.keras.layers.Dense(128, activation="relu")(meta_and_measurement)
f1 = tf.keras.layers.Dropout(0.5)(f1)
output_layer = tf.keras.layers.Dense(2, activation="softmax")(f1)
################################################################################
# The model takes every collected Input; labels are integer class ids, hence
# the sparse loss and metric.
model = tf.keras.models.Model(inputs=all_inputs, outputs=output_layer)
model.compile(
optimizer="adam",
loss="sparse_categorical_crossentropy",
metrics=["sparse_categorical_accuracy"],
)
# Error raised by the fit call below:
# ValueError: Missing data for input "Product".
# You passed a data dictionary with keys ['measurement_input', 'meta_input'].
# Expected the following keys: ['measurement_input', 'Product', 'Length', 'Width']
history = model.fit(
dataset,
epochs=50,
verbose=1,
)
I found an answer, but it feels rather hacky, and it would be lovely if someone had a better idea. You can use a function to unpack and recombine the dataset elements in a way that TensorFlow likes:
import numpy as np
import pandas as pd
import tensorflow as tf
import os
def get_normalization_layer(
    dataset, input_name, all_inputs, encoded_features, axis=None
):
    """Register one numeric feature: a named input plus its normalized output.

    A ``Normalization`` layer is adapted on the ``input_name`` feature of
    ``dataset`` and then applied to a new float32 ``Input`` of shape ``(1,)``
    carrying the same name. The input and the normalized tensor are appended
    to ``all_inputs`` and ``encoded_features``; nothing is returned.

    Args:
        dataset: Dataset of ``(features, label)`` elements where ``features``
            can be indexed by ``input_name``.
        input_name: Feature key in ``dataset``; also the name of the ``Input``.
        all_inputs: List that receives the new ``Input`` (mutated in place).
        encoded_features: List that receives the normalized tensor (mutated
            in place).
        axis: Forwarded unchanged to ``tf.keras.layers.Normalization``.
    """
    normalizer = tf.keras.layers.Normalization(axis=axis)
    # Adapt on this single feature only; the label of each element is dropped.
    normalizer.adapt(dataset.map(lambda features, _label: features[input_name]))

    feature_input = tf.keras.Input(shape=(1,), name=input_name, dtype=tf.float32)
    normalized = normalizer(feature_input)

    all_inputs.append(feature_input)
    encoded_features.append(normalized)
def get_category_encoding_layer(
    dataset,
    input_name,
    dtype,
    all_inputs,
    encoded_features,
    vocabulary=None,
    max_tokens=None,
):
    """Register one categorical feature: a named input plus its encoding.

    A lookup layer (``StringLookup`` or ``IntegerLookup``) maps raw values to
    indices, and a ``CategoryEncoding`` layer sized to the lookup's vocabulary
    encodes those indices. The new ``Input`` and the encoded tensor are
    appended to ``all_inputs`` and ``encoded_features``; nothing is returned.

    Args:
        dataset: Dataset of ``(features, label)`` elements where ``features``
            can be indexed by ``input_name``. Only read when ``vocabulary``
            is ``None``.
        input_name: Feature key in ``dataset``; also the name of the ``Input``.
        dtype: ``"string"`` or ``"int"``; selects the lookup layer and the
            dtype of the created ``Input``.
        all_inputs: List that receives the new ``Input`` (mutated in place).
        encoded_features: List that receives the encoded tensor (mutated in
            place).
        vocabulary: Optional fixed vocabulary forwarded to the lookup layer.
            When ``None``, the lookup layer is adapted on ``dataset`` instead.
        max_tokens: Forwarded unchanged to the lookup layer.

    Raises:
        ValueError: If ``dtype`` is neither ``"string"`` nor ``"int"``.
    """
    if dtype == "string":
        lookup_cls, input_dtype = tf.keras.layers.StringLookup, tf.string
    elif dtype == "int":
        lookup_cls, input_dtype = tf.keras.layers.IntegerLookup, tf.int64
    else:
        # Fail early with a clear message instead of hitting an unbound local
        # further down; no layer is created and neither list is modified.
        raise ValueError(
            f"Unsupported dtype {dtype!r} for input {input_name!r}; "
            "expected 'string' or 'int'."
        )

    index = lookup_cls(max_tokens=max_tokens, vocabulary=vocabulary)
    input_layer = tf.keras.Input(shape=(1,), name=input_name, dtype=input_dtype)

    if vocabulary is None:
        # No fixed vocabulary supplied: learn it from this feature's values.
        feature_ds = dataset.map(lambda x, y: x[input_name])
        index.adapt(feature_ds)

    # The encoder width must match the (possibly adapted) vocabulary size.
    encoder = tf.keras.layers.CategoryEncoding(num_tokens=index.vocabulary_size())
    encoded = encoder(index(input_layer))

    all_inputs.append(input_layer)
    encoded_features.append(encoded)
################################################################################
# Toy data: 150 random measurement arrays of shape (11, 2400) and a 4-row
# metadata table whose "Labels" column holds the class label.
measurement_data = np.random.rand(150, 11, 2400)
meta_data = pd.DataFrame(
    data={
        "Product": ["A", "A", "C", "D"],
        "Length": [23, 22, 21, 24],
        "Width": [11.2, 23.4, 57.35, 0],
        "Labels": [0, 0, 1, 0],
    }
)
################################################################################
# Split off the labels and turn every remaining column into an (n, 1) array,
# then build two datasets: measurements alone, and (metadata dict, label) pairs.
dataframe = meta_data.copy()
labels = dataframe.pop("Labels")
dataframe = {key: np.array(value)[:, tf.newaxis] for key, value in dataframe.items()}
dataset_measurement = tf.data.Dataset.from_tensor_slices(measurement_data)
dataset_meta = tf.data.Dataset.from_tensor_slices((dataframe, labels))
################################################################################
# Collect one named Input and one preprocessed tensor per feature.
all_inputs = []
encoded_features = []

# Measurement feature. The layer is adapted directly on the measurement
# dataset (an identity map adds nothing).
# NOTE(review): adapt() sees unbatched (11, 2400) elements while the model
# input is (batch, 11, 2400) -- confirm that axis=1 selects the intended axis.
measurement_normalizer = tf.keras.layers.Normalization(axis=1)
measurement_normalizer.adapt(dataset_measurement)
# element_spec carries the static per-sample shape, so nothing has to be
# iterated or held in memory just to find it.
measurement_input = tf.keras.Input(
    shape=dataset_measurement.element_spec.shape,
    name="measurement_input",
    dtype=tf.float32,
)
all_inputs.append(measurement_input)
encoded_features.append(measurement_normalizer(measurement_input))

# Metadata features: each helper call appends one Input named after the column
# and its encoded output to the two lists.
get_category_encoding_layer(
    dataset_meta, "Product", "string", all_inputs, encoded_features
)
get_category_encoding_layer(dataset_meta, "Length", "int", all_inputs, encoded_features)
get_normalization_layer(dataset_meta, "Width", all_inputs, encoded_features)

# Zipped elements have the structure (measurement, (metadata dict, label)).
# NOTE: zip stops at the shorter dataset, so with this toy data only the first
# 4 of the 150 measurement samples are paired with metadata and trained on.
dataset = tf.data.Dataset.zip((dataset_measurement, dataset_meta))
def map_func(x, y):
    """Flatten one zipped element into the ``(features, label)`` pair for fit.

    The measurement is stored under ``"measurement_input"`` and every metadata
    entry keeps its own key, so the feature dict is keyed the same way as the
    model's named ``Input`` layers.

    Args:
        x: Measurement part of the zipped element.
        y: Sequence whose first item is the metadata mapping (feature name to
            value) and whose last item is the label.

    Returns:
        A ``(features, label)`` tuple where ``features`` is a new dict; the
        metadata mapping passed in is not modified. A metadata entry that is
        itself named ``"measurement_input"`` takes precedence over ``x``.
    """
    meta, label = y[0], y[-1]
    return {"measurement_input": x, **meta}, label
# Flatten every zipped element into (feature dict, label), then batch and
# prefetch for training.
data_set = dataset.map(map_func).batch(64).prefetch(64)
################################################################################
layers = tf.keras.layers

# Measurement branch: Conv1D -> BatchNormalization -> ReLU -> global average
# pooling, applied to the normalized measurement (encoded_features[0]).
measurement_branch = layers.Conv1D(filters=64, kernel_size=3, padding="same")(
    encoded_features[0]
)
measurement_branch = layers.BatchNormalization()(measurement_branch)
measurement_branch = layers.ReLU()(measurement_branch)
pooled_measurement = layers.GlobalAveragePooling1D()(measurement_branch)

# Metadata branch: concatenate the remaining encoded features, then Dense and
# Dropout.
meta_features = layers.concatenate(encoded_features[1:])
meta_branch = layers.Dense(128, activation="relu")(meta_features)
meta_branch = layers.Dropout(0.5)(meta_branch)

# Merge both branches and classify with a 2-unit softmax.
merged = layers.concatenate([pooled_measurement, meta_branch])
head = layers.Dense(128, activation="relu")(merged)
head = layers.Dropout(0.5)(head)
output_layer = layers.Dense(2, activation="softmax")(head)
################################################################################
# The model takes every collected Input; labels are integer class ids, hence
# the sparse loss and metric.
model = tf.keras.Model(inputs=all_inputs, outputs=output_layer)
model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["sparse_categorical_accuracy"],
)

# Write a diagram of the model graph to model.png in the working directory.
model_plot_path = os.path.join(os.getcwd(), "model.png")
tf.keras.utils.plot_model(
    model,
    to_file=model_plot_path,
    show_shapes=True,
    show_dtype=True,
)

history = model.fit(data_set, epochs=50, verbose=1)
The technical posts on this site are licensed under CC BY-SA 4.0. If you reprint them, please cite the site URL or the original address. For any questions, please contact: yoyou2525@163.com.