Nan Loss when training Deep neural Recommender model using tensorflow

Question

I am trying to follow the TensorFlow documentation and apply the same technique to a toy dataset.

During training, the loss is NaN at every step. I tried to debug this using Debugger V2, and I could see that tf.keras.layers.GlobalAveragePooling1D produces NaN due to a division by 0, which causes all values to become NaN during backpropagation. However, it is not clear from the Debugger V2 GUI why the sum becomes 0. I did try reducing the number of features and the size of the dataset, but each of these changes gave me a new error (I will probably start a separate question thread for each of those issues later).

Below is the code for reference. I am providing the dataset here as well. I ran the code below on Google Colab.

import os
import pprint
import tempfile

from typing import Dict, Text

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_datasets as tfds

# Write Debugger V2 dumps to ./tfdbg2_logdir so that the tensors produced
# during training can be inspected for NaN/Inf values in the Debugger V2 GUI.
tf.debugging.experimental.enable_dump_debug_info(
    "./tfdbg2_logdir",
    tensor_debug_mode="FULL_HEALTH",
    circular_buffer_size=-1)

!pip install -q tensorflow-recommenders
import tensorflow_recommenders as tfrs

Preparing Data

# Load the raw listening history.
ds = pd.read_csv('train_recom.csv')

# Replace year 0 with 1. The result is assigned back to the column instead of
# calling replace(..., inplace=True) on ds['year']: the in-place call acts on
# an intermediate object, which pandas warns about and which does not update
# ds when Copy-on-Write is enabled.
ds['year'] = ds['year'].replace(0, 1)

# One row per distinct (song_id, title, release, artist_name, year)
# combination, with the number of matching rows stored in 'count'.
ds_song = (
    ds.groupby(['song_id', 'title', 'release', 'artist_name', 'year'])
    .size()
    .reset_index()
    .rename(columns={0: 'count'})
)

# Persist both tables; they are read back as tf.data datasets later on.
ds_song.to_csv('songs_details.csv')
ds.to_csv('train_recom_transformed.csv')

Reading data to tensorflow dataset

# Interaction rows, read from the transformed ratings CSV in batches of 5.
ratings = tf.data.experimental.make_csv_dataset(
    "./train_recom_transformed.csv",
    batch_size=5,
    select_columns=[
        'user_id', 'song_id', 'listen_count', 'title', 'release',
        'artist_name', 'year',
    ],
    header=True,
    num_epochs=1,
    ignore_errors=False,
)

# Song catalogue, read from the per-song CSV in batches of 128.
songs = tf.data.experimental.make_csv_dataset(
    "./songs_details.csv",
    batch_size=128,
    select_columns=['song_id', 'title', 'release', 'artist_name', 'year'],
    num_epochs=1,
    ignore_errors=True,
)

# Feature keys kept for every rating row.
_RATING_KEYS = (
    "song_id", "user_id", "release", "artist_name", "title", "year",
    "listen_count",
)


def _select_rating_features(row):
    """Return a dict holding only the rating features used downstream."""
    return {key: row[key] for key in _RATING_KEYS}


def _song_id_only(row):
    """Return just the song id of a catalogue row."""
    return row["song_id"]


# Undo the CSV batching so that both datasets yield one example at a time.
ratings = ratings.unbatch().map(_select_rating_features)
songs = songs.unbatch().map(_song_id_only)

Preparing train and test dataset

_SEED = 42
_TRAIN_SIZE = 12000
_TEST_SIZE = 4000


def _as_numpy(batched_dataset):
    """Concatenate every batch of a dataset into a single NumPy array."""
    return np.concatenate(list(batched_dataset))


def _user_id_of(row):
    return row["user_id"]


def _year_of(row):
    return row['year']


# Shuffle once with a fixed seed so the train/test split stays stable.
tf.random.set_seed(_SEED)
shuffled = ratings.shuffle(
    _TRAIN_SIZE + _TEST_SIZE, seed=_SEED, reshuffle_each_iteration=False)

# The first 12,000 shuffled rows are used for training, the next 4,000 for
# testing.
train = shuffled.take(_TRAIN_SIZE)
test = shuffled.skip(_TRAIN_SIZE).take(_TEST_SIZE)
cached_train = train.shuffle(100_000).batch(1200).cache()
cached_test = test.batch(400).cache()

# Vocabularies for the lookup layers. `songs` yields song ids at this point,
# so `title` / `unique_song_titles` hold song ids despite their names.
title = songs.batch(1000)
user_ids = ratings.batch(1_000_000).map(_user_id_of)
unique_song_titles = np.unique(_as_numpy(title))
unique_user_ids = np.unique(_as_numpy(user_ids))
year_data = _as_numpy(ratings.map(_year_of).batch(4000))

User model class

class UserModel(tf.keras.Model):
    """Query-side embeddings: user id plus song metadata features.

    ``call`` expects a dict with the keys ``user_id``, ``release``, ``year``,
    ``artist_name`` and ``title`` and returns the five 32-wide embeddings
    concatenated along axis 1.
    """

    def __init__(self):
        super().__init__()

        max_tokens = 1_000_000
        embedding_dimension = 32

        # One embedding per known user id (+1 row for out-of-vocabulary ids).
        self.user_embedding = tf.keras.Sequential([
            tf.keras.layers.StringLookup(
                vocabulary=unique_user_ids, mask_token=None),
            tf.keras.layers.Embedding(
                len(unique_user_ids) + 1, embedding_dimension),
        ])

        # Free-text features share one construction recipe.
        self.release_vectorizer, self.release_text_embedding = (
            self._build_text_tower('release', max_tokens, embedding_dimension))
        self.artist_vectorizer, self.artist_text_embedding = (
            self._build_text_tower(
                'artist_name', max_tokens, embedding_dimension))
        self.title_vectorizer, self.title_text_embedding = (
            self._build_text_tower('title', max_tokens, embedding_dimension))

        # Year values are fed to the embedding directly as row indices, so the
        # table must have more rows than the largest year. The original sized
        # it by the number of rows in year_data only; that size is kept as a
        # lower bound so the table never shrinks.
        largest_year = max((int(year) for year in year_data), default=0)
        year_vocab_size = max(len(year_data), largest_year) + 1
        self.year_embedding = tf.keras.Sequential([
            tf.keras.layers.Embedding(year_vocab_size, embedding_dimension),
        ])

    @staticmethod
    def _build_text_tower(feature, max_tokens, embedding_dimension):
        """Build and adapt the text vectorizer and embedding for one feature.

        Args:
          feature: Key of the text feature in the ``ratings`` dataset.
          max_tokens: Vocabulary size limit, also the embedding table size.
          embedding_dimension: Width of the token embeddings.

        Returns:
          A ``(vectorizer, model)`` tuple; ``model`` maps raw strings to the
          masked average of their token embeddings.
        """
        vectorizer = tf.keras.layers.experimental.preprocessing.TextVectorization(
            max_tokens=max_tokens)
        # NOTE: index 0 is masked (mask_zero=True). A string that produces no
        # tokens is therefore fully masked and leaves the pooling layer with
        # nothing to average, so text features must not be blank.
        model = tf.keras.Sequential([
            vectorizer,
            tf.keras.layers.Embedding(
                max_tokens, embedding_dimension, mask_zero=True),
            tf.keras.layers.GlobalAveragePooling1D(),
        ])
        vectorizer.adapt(np.concatenate(
            list(ratings.map(lambda x: x[feature]).batch(4000))))
        return vectorizer, model

    def call(self, inputs):
        """Return the concatenated feature embeddings for a feature dict."""
        return tf.concat([
            self.user_embedding(inputs['user_id']),
            self.release_text_embedding(inputs['release']),
            self.year_embedding(inputs['year']),
            self.artist_text_embedding(inputs['artist_name']),
            self.title_text_embedding(inputs['title']),
        ], axis=1)

Item model

class ItemModel(tf.keras.Model):
    """Candidate-side embedding: one vector per entry of unique_song_titles."""

    def __init__(self):
        super().__init__()

        embedding_dimension = 32

        # String lookup followed by an embedding table (+1 row for
        # out-of-vocabulary strings).
        self.title_embedding = tf.keras.Sequential([
            tf.keras.layers.StringLookup(
                vocabulary=unique_song_titles, mask_token=None),
            tf.keras.layers.Embedding(
                len(unique_song_titles) + 1, embedding_dimension),
        ])

    def call(self, inputs):
        """Return the embedding for a tensor of vocabulary strings."""
        return self.title_embedding(inputs)

Query model: creating the deep model

class QueryModel(tf.keras.Model):
  """Query tower: user-side embeddings followed by a stack of dense layers."""

  def __init__(self, layer_sizes):
    """Build the embedding model and the dense stack.

    Args:
      layer_sizes:
        A list of integers; the i-th entry is the number of units of the
        i-th dense layer.
    """
    super().__init__()

    # Feature embeddings come from the user model.
    self.embedding_model = UserModel()

    # Every layer but the last uses ReLU; the last one has no activation.
    hidden_layers = [
        tf.keras.layers.Dense(units, activation="relu")
        for units in layer_sizes[:-1]
    ]
    output_layers = [
        tf.keras.layers.Dense(units) for units in layer_sizes[-1:]
    ]
    self.dense_layers = tf.keras.Sequential(hidden_layers + output_layers)

  def call(self, inputs):
    """Embed the input features and pass them through the dense stack."""
    return self.dense_layers(self.embedding_model(inputs))

Creating deep model for the Item model

class CandidateModel(tf.keras.Model):
  """Candidate tower: song embeddings followed by a stack of dense layers."""

  def __init__(self, layer_sizes):
    """Build the embedding model and the dense stack.

    Args:
      layer_sizes:
        A list of integers; the i-th entry is the number of units of the
        i-th dense layer.
    """
    super().__init__()

    # Feature embeddings come from the item model.
    self.embedding_model = ItemModel()

    # Every layer but the last uses ReLU; the last one has no activation.
    hidden_layers = [
        tf.keras.layers.Dense(units, activation="relu")
        for units in layer_sizes[:-1]
    ]
    output_layers = [
        tf.keras.layers.Dense(units) for units in layer_sizes[-1:]
    ]
    self.dense_layers = tf.keras.Sequential(hidden_layers + output_layers)

  def call(self, inputs):
    """Embed the input features and pass them through the dense stack."""
    return self.dense_layers(self.embedding_model(inputs))

Combining both query and candidate model

class SongModel(tfrs.models.Model):
    """Two-tower retrieval model pairing QueryModel with CandidateModel."""

    def __init__(self, layer_sizes):
        """Create both towers and the retrieval task.

        Args:
          layer_sizes: Dense layer sizes, passed to both towers.
        """
        super().__init__()
        self.query_model = QueryModel(layer_sizes)
        self.candidate_model = CandidateModel(layer_sizes)

        # Candidate embeddings used by the top-K metrics.
        candidate_embeddings = songs.batch(128).map(self.candidate_model)
        self.task = tfrs.tasks.Retrieval(
            metrics=tfrs.metrics.FactorizedTopK(
                candidates=candidate_embeddings,
            ),
        )

    def compute_loss(self, features, training=False):
        """Return the retrieval loss for one batch of features."""
        print('type of feature ----',type(features))

        # Only these features are forwarded to the query tower.
        query_keys = ("user_id", "release", "artist_name", "title", "year")
        query_inputs = {key: features[key] for key in query_keys}

        query_embeddings = self.query_model(query_inputs)
        item_embeddings = self.candidate_model(features["song_id"])

        return self.task(query_embeddings, item_embeddings)

Training the model

# A single dense layer of 32 units per tower, trained with Adagrad for
# 9 epochs on the cached training batches.
model = SongModel(layer_sizes=[32])
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))
model_hist = model.fit(cached_train, epochs=9)

Below is the output that I got

WARNING:tensorflow:Failed to read source code from path: /content/<ipython-input-26-fdc864fc30cf>. Reason: Source path neither exists nor can be loaded as a .par file: /content/<ipython-input-26-fdc864fc30cf>
WARNING:tensorflow:Failed to read source code from path: /content/<ipython-input-25-e3009db55439>. Reason: Source path neither exists nor can be loaded as a .par file: /content/<ipython-input-25-e3009db55439>
Epoch 1/9
type of feature ---- <class 'dict'>
WARNING:tensorflow:Model was constructed with shape (None, None) for input KerasTensor(type_spec=TensorSpec(shape=(None, None), dtype=tf.float32, name='embedding_10_input'), name='embedding_10_input', description="created by layer 'embedding_10_input'"), but it was called on an input with incompatible shape (None,).
type of feature ---- <class 'dict'>
WARNING:tensorflow:Model was constructed with shape (None, None) for input KerasTensor(type_spec=TensorSpec(shape=(None, None), dtype=tf.float32, name='embedding_10_input'), name='embedding_10_input', description="created by layer 'embedding_10_input'"), but it was called on an input with incompatible shape (None,).
10/10 [==============================] - 63s 1s/step - factorized_top_k/top_1_categorical_accuracy: 0.0000e+00 - factorized_top_k/top_5_categorical_accuracy: 0.0022 - factorized_top_k/top_10_categorical_accuracy: 0.0033 - factorized_top_k/top_50_categorical_accuracy: 0.0073 - factorized_top_k/top_100_categorical_accuracy: 0.0103 - loss: nan - regularization_loss: 0.0000e+00 - total_loss: nan
Epoch 2/9
10/10 [==============================] - 9s 945ms/step - factorized_top_k/top_1_categorical_accuracy: 0.0000e+00 - factorized_top_k/top_5_categorical_accuracy: 0.0000e+00 - factorized_top_k/top_10_categorical_accuracy: 0.0000e+00 - factorized_top_k/top_50_categorical_accuracy: 0.0000e+00 - factorized_top_k/top_100_categorical_accuracy: 0.0000e+00 - loss: nan - regularization_loss: 0.0000e+00 - total_loss: nan
Epoch 3/9
10/10 [==============================] - 10s 953ms/step - factorized_top_k/top_1_categorical_accuracy: 0.0000e+00 - factorized_top_k/top_5_categorical_accuracy: 0.0000e+00 - factorized_top_k/top_10_categorical_accuracy: 0.0000e+00 - factorized_top_k/top_50_categorical_accuracy: 0.0000e+00 - factorized_top_k/top_100_categorical_accuracy: 0.0000e+00 - loss: nan - regularization_loss: 0.0000e+00 - total_loss: nan
Epoch 4/9
10/10 [==============================] - 9s 948ms/step - factorized_top_k/top_1_categorical_accuracy: 0.0000e+00 - factorized_top_k/top_5_categorical_accuracy: 0.0000e+00 - factorized_top_k/top_10_categorical_accuracy: 0.0000e+00 - factorized_top_k/top_50_categorical_accuracy: 0.0000e+00 - factorized_top_k/top_100_categorical_accuracy: 0.0000e+00 - loss: nan - regularization_loss: 0.0000e+00 - total_loss: nan
Epoch 5/9
10/10 [==============================] - 10s 966ms/step - factorized_top_k/top_1_categorical_accuracy: 0.0000e+00 - factorized_top_k/top_5_categorical_accuracy: 0.0000e+00 - factorized_top_k/top_10_categorical_accuracy: 0.0000e+00 - factorized_top_k/top_50_categorical_accuracy: 0.0000e+00 - factorized_top_k/top_100_categorical_accuracy: 0.0000e+00 - loss: nan - regularization_loss: 0.0000e+00 - total_loss: nan
Epoch 6/9
10/10 [==============================] - 10s 955ms/step - factorized_top_k/top_1_categorical_accuracy: 0.0000e+00 - factorized_top_k/top_5_categorical_accuracy: 0.0000e+00 - factorized_top_k/top_10_categorical_accuracy: 0.0000e+00 - factorized_top_k/top_50_categorical_accuracy: 0.0000e+00 - factorized_top_k/top_100_categorical_accuracy: 0.0000e+00 - loss: nan - regularization_loss: 0.0000e+00 - total_loss: nan
Epoch 7/9
10/10 [==============================] - 10s 955ms/step - factorized_top_k/top_1_categorical_accuracy: 0.0000e+00 - factorized_top_k/top_5_categorical_accuracy: 0.0000e+00 - factorized_top_k/top_10_categorical_accuracy: 0.0000e+00 - factorized_top_k/top_50_categorical_accuracy: 0.0000e+00 - factorized_top_k/top_100_categorical_accuracy: 0.0000e+00 - loss: nan - regularization_loss: 0.0000e+00 - total_loss: nan
Epoch 8/9
10/10 [==============================] - 10s 958ms/step - factorized_top_k/top_1_categorical_accuracy: 0.0000e+00 - factorized_top_k/top_5_categorical_accuracy: 0.0000e+00 - factorized_top_k/top_10_categorical_accuracy: 0.0000e+00 - factorized_top_k/top_50_categorical_accuracy: 0.0000e+00 - factorized_top_k/top_100_categorical_accuracy: 0.0000e+00 - loss: nan - regularization_loss: 0.0000e+00 - total_loss: nan
Epoch 9/9
10/10 [==============================] - 10s 971ms/step - factorized_top_k/top_1_categorical_accuracy: 0.0000e+00 - factorized_top_k/top_5_categorical_accuracy: 0.0000e+00 - factorized_top_k/top_10_categorical_accuracy: 0.0000e+00 - factorized_top_k/top_50_categorical_accuracy: 0.0000e+00 - factorized_top_k/top_100_categorical_accuracy: 0.0000e+00 - loss: nan - regularization_loss: 0.0000e+00 - total_loss: nan

Answer 1

I got a similar error when using tfrs on a custom dataset. It turned out that I had some non-printable characters and symbols in the data. I simply searched for and removed the symbols (manually, plus some regex), and I also limited the text columns in the dataframe to printable characters only.

from string import printable as pt

# Characters that may stay in the text; every other character becomes a space.
allowed_set = set(pt)


def _blank_out_unprintable(text):
    """Return text with every character outside allowed_set replaced by ' '."""
    return ''.join(ch if ch in allowed_set else ' ' for ch in text)


df[col] = df[col].apply(_blank_out_unprintable)

I hope it helps.

Answer 2

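The issue was that when we replaced special characters with blank space, the whole value became empty (NULL) for one record (in the release field). An empty string produces no tokens, so every position is masked and the average pooling layer divides by zero. In conclusion, it was a data issue rather than a code issue. We then added two lines to deal with such cases: blank strings are first turned into NaN with ds.replace(r'^\s*$', np.nan, regex=True), and the missing values are then filled with the string 'None'. Below is the whole code with all the changes made.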

import os
import pprint
import tempfile

from typing import Dict, Text

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_datasets as tfds

!pip install -q tensorflow-recommenders
import tensorflow_recommenders as tfrs  

ds = pd.read_csv('train_recom.csv')

# Diagnostics: report missing values in the raw data before cleaning.
print(ds['release'].isnull().sum())
print(ds['title'].isnull().sum())
print(ds['artist_name'].isnull().sum())
print(ds['year'].isnull().sum())
print(ds.isna().any(axis=None))
print(any(ds[c].hasnans for c in ds))
for c in ds:
    if ds[c].hasnans:
        print(c)

# Results are assigned back to the columns instead of using
# replace(..., inplace=True) on a column selection: the in-place call acts on
# an intermediate object, which pandas warns about and which does not update
# ds when Copy-on-Write is enabled.
ds['year'] = ds['year'].replace(0, 1)

# Strip everything except ASCII letters, digits and spaces from the text
# columns.
text_columns = ['release', 'artist_name', 'title']
for column in text_columns:
    ds[column] = ds[column].replace({r'[^a-zA-Z0-9 ]+': ''}, regex=True)

# Stripping can leave a value empty or whitespace-only. Such a value yields no
# tokens and makes the masked average pooling divide by zero, so blanks are
# turned into NaN and then filled with a placeholder. All three text columns
# are filled, not only 'release', because each feeds the same kind of
# text-embedding tower.
ds2 = ds.replace(r'^\s*$', np.nan, regex=True)
ds2[text_columns] = ds2[text_columns].fillna('None')
ds = ds2

# One row per distinct song, with the number of matching rows in 'count'.
ds_song = (
    ds.groupby(['song_id', 'title', 'release', 'artist_name', 'year'])
    .size()
    .reset_index()
    .rename(columns={0: 'count'})
)

ds_song.to_csv('songs_details.csv')
ds.to_csv('train_recom_transformed.csv')

# Interaction rows, read from the transformed ratings CSV in batches of 5.
ratings = tf.data.experimental.make_csv_dataset(
    "./train_recom_transformed.csv",
    batch_size=5,
    select_columns=[
        'user_id', 'song_id', 'listen_count', 'title', 'release',
        'artist_name', 'year',
    ],
    header=True,
    num_epochs=1,
    ignore_errors=False,
)

# Song catalogue, read from the per-song CSV in batches of 128.
songs = tf.data.experimental.make_csv_dataset(
    "./songs_details.csv",
    batch_size=128,
    select_columns=['song_id', 'title', 'release', 'artist_name', 'year'],
    num_epochs=1,
    ignore_errors=True,
)

# Feature keys kept for rating rows and for catalogue rows.
_RATING_KEYS = (
    "song_id", "user_id", "release", "artist_name", "title", "year",
    "listen_count",
)
_SONG_KEYS = ("song_id", "release", "artist_name", "title", "year")


def _select_rating_features(row):
    """Return a dict holding only the rating features used downstream."""
    return {key: row[key] for key in _RATING_KEYS}


def _select_song_features(row):
    """Return a dict holding only the song features used downstream."""
    return {key: row[key] for key in _SONG_KEYS}


# Undo the CSV batching so that both datasets yield one example at a time.
ratings = ratings.unbatch().map(_select_rating_features)
songs = songs.unbatch().map(_select_song_features)

_SEED = 42
_TRAIN_SIZE = 12000
_TEST_SIZE = 4000


def _as_numpy(batched_dataset):
    """Concatenate every batch of a dataset into a single NumPy array."""
    return np.concatenate(list(batched_dataset))


def _title_of(row):
    return row["title"]


def _user_id_of(row):
    return row["user_id"]


def _year_of(row):
    return row['year']


# Shuffle once with a fixed seed so the train/test split stays stable.
tf.random.set_seed(_SEED)
shuffled = ratings.shuffle(
    _TRAIN_SIZE + _TEST_SIZE, seed=_SEED, reshuffle_each_iteration=False)

# The first 12,000 shuffled rows are used for training, the next 4,000 for
# testing.
train = shuffled.take(_TRAIN_SIZE)
test = shuffled.skip(_TRAIN_SIZE).take(_TEST_SIZE)
cached_train = train.shuffle(100_000).batch(1200).cache()
cached_test = test.batch(400).cache()

# Vocabularies for the lookup layers, plus one year entry per catalogue song.
title = songs.batch(1000).map(_title_of)
user_ids = ratings.batch(1_000_000).map(_user_id_of)
unique_song_titles = np.unique(_as_numpy(title))
unique_user_ids = np.unique(_as_numpy(user_ids))
year_data = list(songs.map(_year_of))

class UserModel(tf.keras.Model):
    """Query-side embedding: one vector per known user id."""

    def __init__(self):
        super().__init__()

        embedding_dimension = 32

        # String lookup followed by an embedding table (+1 row for
        # out-of-vocabulary ids).
        self.user_embedding = tf.keras.Sequential([
            tf.keras.layers.StringLookup(
                vocabulary=unique_user_ids, mask_token=None),
            tf.keras.layers.Embedding(
                len(unique_user_ids) + 1, embedding_dimension),
        ])

    def call(self, inputs):
        """Return the embedding of ``inputs['user_id']``."""
        return self.user_embedding(inputs['user_id'])

class ItemModel(tf.keras.Model):
    """Candidate-side embeddings built from song metadata.

    ``call`` expects a dict with the keys ``title``, ``release``, ``year`` and
    ``artist_name`` and returns five 32-wide embeddings concatenated along
    axis 1 (title lookup, release text, year, artist text, title text).
    """

    def __init__(self):
        super().__init__()

        # Same value as the original literal 10_000_00, with conventional
        # digit grouping.
        max_tokens = 1_000_000
        embedding_dimension = 32

        # Whole-title lookup: one embedding per distinct title string (+1 row
        # for out-of-vocabulary titles).
        self.title_embedding = tf.keras.Sequential([
            tf.keras.layers.StringLookup(
                vocabulary=unique_song_titles, mask_token=None),
            tf.keras.layers.Embedding(
                len(unique_song_titles) + 1, embedding_dimension),
        ])

        # Free-text features share one construction recipe.
        self.release_vectorizer, self.release_text_embedding = (
            self._build_text_tower('release', max_tokens, embedding_dimension))
        self.artist_vectorizer, self.artist_text_embedding = (
            self._build_text_tower(
                'artist_name', max_tokens, embedding_dimension))
        self.title_vectorizer, self.title_text_embedding = (
            self._build_text_tower('title', max_tokens, embedding_dimension))

        # Year values are fed to the embedding directly as row indices, so the
        # table must have more rows than the largest year. The original sized
        # it by the number of songs only; that size is kept as a lower bound
        # so the table never shrinks.
        largest_year = max((int(year) for year in year_data), default=0)
        year_vocab_size = max(len(year_data), largest_year) + 1
        self.year_embedding = tf.keras.Sequential([
            tf.keras.layers.Embedding(year_vocab_size, embedding_dimension),
        ])

    @staticmethod
    def _build_text_tower(feature, max_tokens, embedding_dimension):
        """Build and adapt the text vectorizer and embedding for one feature.

        Args:
          feature: Key of the text feature in the ``songs`` dataset.
          max_tokens: Vocabulary size limit, also the embedding table size.
          embedding_dimension: Width of the token embeddings.

        Returns:
          A ``(vectorizer, model)`` tuple; ``model`` maps raw strings to the
          masked average of their token embeddings.
        """
        vectorizer = tf.keras.layers.experimental.preprocessing.TextVectorization(
            max_tokens=max_tokens)
        # NOTE: index 0 is masked (mask_zero=True). A string that produces no
        # tokens is therefore fully masked and leaves the pooling layer with
        # nothing to average, so text features must not be blank.
        model = tf.keras.Sequential([
            vectorizer,
            tf.keras.layers.Embedding(
                max_tokens, embedding_dimension, mask_zero=True),
            tf.keras.layers.GlobalAveragePooling1D(),
        ])
        vectorizer.adapt(songs.map(lambda x: x[feature]))
        return vectorizer, model

    def call(self, inputs):
        """Return the concatenated feature embeddings for a feature dict."""
        return tf.concat([
            self.title_embedding(inputs['title']),
            self.release_text_embedding(inputs['release']),
            self.year_embedding(inputs['year']),
            self.artist_text_embedding(inputs['artist_name']),
            self.title_text_embedding(inputs['title']),
        ], axis=1)

class QueryModel(tf.keras.Model):
  """Query tower: user embeddings followed by a stack of dense layers."""

  def __init__(self, layer_sizes):
    """Build the embedding model and the dense stack.

    Args:
      layer_sizes:
        A list of integers; the i-th entry is the number of units of the
        i-th dense layer.
    """
    super().__init__()

    # Feature embeddings come from the user model.
    self.embedding_model = UserModel()

    # Every layer but the last uses ReLU; the last one has no activation.
    hidden_layers = [
        tf.keras.layers.Dense(units, activation="relu")
        for units in layer_sizes[:-1]
    ]
    output_layers = [
        tf.keras.layers.Dense(units) for units in layer_sizes[-1:]
    ]
    self.dense_layers = tf.keras.Sequential(hidden_layers + output_layers)

  def call(self, inputs):
    """Embed the input features and pass them through the dense stack."""
    return self.dense_layers(self.embedding_model(inputs))

class CandidateModel(tf.keras.Model):
  """Candidate tower: song embeddings followed by a stack of dense layers."""

  def __init__(self, layer_sizes):
    """Build the embedding model and the dense stack.

    Args:
      layer_sizes:
        A list of integers; the i-th entry is the number of units of the
        i-th dense layer.
    """
    super().__init__()

    # Feature embeddings come from the item model.
    self.embedding_model = ItemModel()

    # Every layer but the last uses ReLU; the last one has no activation.
    hidden_layers = [
        tf.keras.layers.Dense(units, activation="relu")
        for units in layer_sizes[:-1]
    ]
    output_layers = [
        tf.keras.layers.Dense(units) for units in layer_sizes[-1:]
    ]
    self.dense_layers = tf.keras.Sequential(hidden_layers + output_layers)

  def call(self, inputs):
    """Embed the input features and pass them through the dense stack."""
    return self.dense_layers(self.embedding_model(inputs))

class SongModel(tfrs.models.Model):
    """Two-tower retrieval model pairing QueryModel with CandidateModel."""

    def __init__(self, layer_sizes):
        """Create both towers and the retrieval task.

        Args:
          layer_sizes: Dense layer sizes, passed to both towers.
        """
        super().__init__()
        self.query_model = QueryModel(layer_sizes)
        self.candidate_model = CandidateModel(layer_sizes)
        self.task = tfrs.tasks.Retrieval(
            metrics=tfrs.metrics.FactorizedTopK(
                # Candidate embeddings used by the top-K metrics.
                candidates=songs.batch(128).map(self.candidate_model),
            ),
        )

    def compute_loss(self, features, training=False):
        """Return the retrieval loss for one batch of features."""
        # The query tower only sees the user id.
        query_embeddings = self.query_model({
            "user_id": features["user_id"],
        })

        # The candidate tower receives the song metadata. Each key appears
        # once; the original dict literal listed "title" twice.
        item_embeddings = self.candidate_model({
            "song_id": features["song_id"],
            "title": features["title"],
            "release": features["release"],
            "artist_name": features["artist_name"],
            "year": features["year"],
        })

        return self.task(query_embeddings, item_embeddings)

# A single dense layer of 32 units per tower, trained with Adagrad for
# 9 epochs on the cached training batches.
model = SongModel(layer_sizes=[32])
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))
model_hist = model.fit(cached_train, epochs=9)

Nan Loss when training Deep neural Recommender model using tensorflow

Question

2 answers

solution1
1 ACCEPTED 2022-05-31 11:16:31

solution2
0 2022-06-06 16:14:49

Nan Loss when training Deep neural Recommender model using tensorflow

Question

2 answers

solution1 1 ACCEPTED 2022-05-31 11:16:31

solution2 0 2022-06-06 16:14:49

solution1
1 ACCEPTED 2022-05-31 11:16:31

solution2
0 2022-06-06 16:14:49