
How to fix "zero-size array to reduction operation maximum which has no identity" in keras pad_sequences

I recently ran into this problem: ValueError: zero-size array to reduction operation maximum which has no identity.

import tensorflow as tf
import unicodedata
import string
import numpy as np
import re
import matplotlib.pyplot as plt
keras = tf.keras
class Lang(object):
    def __init__(self, name):
        self.name = name
        self.word2int = {} #maps words to integers
        self.word2count = {} #maps words to their total number in the corpus
        # maps integers back to tokens (the inverse of word2int). Starts with two
        # special tokens: SOS ("Start of Sentence") marks the beginning of a
        # sentence, EOS ("End of Sentence") marks its end. Every sentence
        # will end with an EOS token.
        self.int2word = {0 : "SOS", 1 : "EOS"}
        self.n_words = 2 # initial number of tokens (SOS and EOS)
        
    def addWord(self, word):
        if word not in self.word2int:
            self.word2int[word] = self.n_words
            self.word2count[word] = 1
            self.int2word[self.n_words] = word
            self.n_words += 1
            
        else:
            self.word2count[word] += 1
    
    def addSentence(self, sentence):
        for word in sentence.split(" "):
            self.addWord(word)
def unicodeToAscii(s):
    return "".join(c for c in unicodedata.normalize("NFD", s) \
                   if unicodedata.category(c) != "Mn")
def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([!.?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z?.!]+", " ", s)
    return s
def load_dataset():
    with open("en_fr.txt",'r') as f:
        lines = f.readlines()
    
    pairs = [[normalizeString(pair) for pair in 
              line.strip().split('\t')] for line in lines]
    return pairs
def sentencetoIndexes(sentence, lang):
    indexes = [lang.word2int[word] for word in sentence.split()]
    indexes.append(EOS_token)
    return indexes
SOS_token = 0
EOS_token = 1
pairs = load_dataset()
MAX_LENGTH = 50

def filterPair(p):
    try:
        return len(p[0].split(' ')) < MAX_LENGTH and len(p[1].split(' ')) < MAX_LENGTH
    except:
        return False


def filterPairs(pairs):
    return [pair for pair in range(len(pairs)) if filterPair(pair)]

pairs = filterPairs(pairs)
def build_lang(lang1, lang2, max_length=50):
    input_lang = Lang(lang1)
    output_lang = Lang(lang2)
    input_seq = []
    output_seq = []
    
    for pair in pairs:
        input_lang.addSentence(pair[1])
        output_lang.addSentence(pair[0])
    for pair in pairs:
        input_seq.append(sentencetoIndexes(pair[1], input_lang))
        output_seq.append(sentencetoIndexes(pair[0], output_lang))
    return ( 
             keras.preprocessing.sequence.pad_sequences(input_seq, maxlen=max_length, padding='post', truncating='post'), 
             keras.preprocessing.sequence.pad_sequences(output_seq, padding='post', truncating='post'), 
             input_lang, output_lang
           )

When I run:

input_tensor, output_tensor, input_lang, output_lang = build_lang('en', 'fra')

I get this error:

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-12-259feac15862> in <module>
----> 1 input_tensor, output_tensor, input_lang, output_lang = build_lang('en', 'fra')

<ipython-input-10-d20934657bc2> in build_lang(lang1, lang2, max_length)
     12         output_seq.append(sentencetoIndexes(pair[0], output_lang))
     13     return keras.preprocessing.sequence.pad_sequences(input_seq, maxlen=max_length, padding='post',
---> 14                                                       truncating='post'), keras.preprocessing.sequence.pad_sequences(output_seq, padding='post', truncating='post'), input_lang, output_lang

c:\users\zealottv\appdata\local\programs\python\python38\lib\site-packages\tensorflow\python\keras\preprocessing\sequence.py in pad_sequences(sequences, maxlen, dtype, padding, truncating, value)
    154           or in case of invalid shape for a `sequences` entry.
    155   """
--> 156   return sequence.pad_sequences(
    157       sequences, maxlen=maxlen, dtype=dtype,
    158       padding=padding, truncating=truncating, value=value)

c:\users\zealottv\appdata\local\programs\python\python38\lib\site-packages\keras_preprocessing\sequence.py in pad_sequences(sequences, maxlen, dtype, padding, truncating, value)
     75 
     76     if maxlen is None:
---> 77         maxlen = np.max(lengths)
     78 
     79     is_dtype_str = np.issubdtype(dtype, np.str_) or np.issubdtype(dtype, np.unicode_)

<__array_function__ internals> in amax(*args, **kwargs)

c:\users\zealottv\appdata\local\programs\python\python38\lib\site-packages\numpy\core\fromnumeric.py in amax(a, axis, out, keepdims, initial, where)
   2665     5
   2666     """
-> 2667     return _wrapreduction(a, np.maximum, 'max', axis, None, out,
   2668                           keepdims=keepdims, initial=initial, where=where)
   2669 

c:\users\zealottv\appdata\local\programs\python\python38\lib\site-packages\numpy\core\fromnumeric.py in _wrapreduction(obj, ufunc, method, axis, dtype, out, **kwargs)
     88                 return reduction(axis=axis, out=out, **passkwargs)
     89 
---> 90     return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
     91 
     92 

ValueError: zero-size array to reduction operation maximum which has no identity

The full code can be downloaded here.

filterPairs() as written iterates over range(len(pairs)), so filterPair() is called with an integer index instead of a pair. Inside filterPair(), p[0] then raises a TypeError, the bare except catches it and returns False, and filterPairs() returns an empty list. With pairs empty, input_seq and output_seq are empty too, so the second pad_sequences call (the one with no maxlen) ends up taking np.max over an empty list of lengths, which is exactly the error in the traceback. The fix is to iterate over the pairs themselves:

def filterPairs(pairs):
    return [pair for pair in pairs if filterPair(pair)]

This returns the pairs that should be kept, rather than their indices.
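
The failure can be reproduced in isolation. When maxlen is None, pad_sequences computes maxlen = np.max(lengths) (visible in the traceback above), so an empty list of sequences triggers exactly this ValueError. A minimal sketch:

import tensorflow as tf

keras = tf.keras

# With no maxlen, pad_sequences takes np.max over the per-sequence
# lengths; an empty input makes that array zero-size.
keras.preprocessing.sequence.pad_sequences([], padding='post', truncating='post')
# ValueError: zero-size array to reduction operation maximum which has no identity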

I tried manually overwriting pairs with:

pairs = [["test","test2"],["test3","test4"]]

and was able to run the code without error. build_lang expects pairs to be a list of lists, each sublist containing two strings, so verify that pairs matches this shape before passing it to build_lang; a sketch of such a check follows.
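
A quick guard before the call fails early with a clear message instead of letting pad_sequences raise deep inside keras (this check is my addition, not part of the original code):

# pairs must be a non-empty list of [str, str] pairs before build_lang is called.
assert len(pairs) > 0, "pairs is empty, check filterPairs()"
assert all(len(p) == 2 and all(isinstance(s, str) for s in p) for p in pairs), \
    "each pair must be a list of two strings"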

As an aside, you should pass pairs as an argument to build_lang rather than relying on it having been defined in a previous code block. Depending on notebook state works until you run the blocks out of order, at which point it can produce stale results or errors.

def build_lang(lang1, lang2, pairs, max_length=50):
    ...
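
The call at the end then becomes, with pairs passed explicitly:

input_tensor, output_tensor, input_lang, output_lang = build_lang('en', 'fra', pairs)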
