I am trying to run the following code on Google Compute Engine:
import itertools
import math
import pandas as pd
import os
import numpy as np
# Load the one-token-per-row dataset; only the 'Sentence #', 'Word' and 'Tag'
# columns are used below.
ner_df = pd.read_csv('ner_dataset.csv', encoding='ISO-8859-1')

# Parallel lists: sentences_words[k] and sentences_tags[k] hold the words and
# the tags of the k-th collected sentence.
sentences_words = []
sentences_tags = []
curr_sent_num = -1
current_sentence_words = []
current_sentence_tags = []

for sent_num, word, tag in ner_df[['Sentence #', 'Word', 'Tag']].values:
    # A row whose 'Sentence #' cell is a string containing 'Sentence: ' opens
    # a new sentence; any other row continues the current one.
    if isinstance(sent_num, str) and 'Sentence: ' in sent_num:
        curr_sent_num = int(sent_num.split(':')[1].strip())
        # Store the sentence that just ended (nothing to store before the
        # first marker row).
        if current_sentence_words and current_sentence_tags:
            sentences_words.append(current_sentence_words)
            sentences_tags.append(current_sentence_tags)
        current_sentence_words = []
        current_sentence_tags = []
    current_sentence_words.append(word)
    current_sentence_tags.append(tag)

# The loop only stores a sentence when the *next* marker row is seen, so the
# final sentence has to be flushed explicitly or it is silently dropped.
if current_sentence_words and current_sentence_tags:
    sentences_words.append(current_sentence_words)
    sentences_tags.append(current_sentence_tags)

# Bare expression kept from the notebook: it only displays the two counts when
# it is the last line of an interactive cell.
len(sentences_words), len(sentences_tags)
# Sequential 80/20 split: the first 80% of the sentences are used for
# training, the remainder for testing (no shuffling).
train_size = int(len(sentences_words) * 0.8)
train_sentences_words, test_sentences_words = (
    sentences_words[:train_size],
    sentences_words[train_size:],
)
train_sentences_tags, test_sentences_tags = (
    sentences_tags[:train_size],
    sentences_tags[train_size:],
)
print('Train:', len(train_sentences_words), len(train_sentences_tags))
print('Test:', len(test_sentences_words), len(test_sentences_tags))

# Distinct words and distinct tags seen in the training split only.
vocab = set(itertools.chain.from_iterable(train_sentences_words))
tags = set(itertools.chain.from_iterable(train_sentences_tags))

# map() returns a one-shot iterator; the list() call in the print below
# consumes it, so it is empty afterwards.
sentenecs_lens = map(len, train_sentences_words)
print(len(vocab), len(tags), len(list(sentenecs_lens)))

MAX_LEN = 75  # fixed sequence length; max(sentenecs_lens) is left disabled
VOCAB_SIZE = len(vocab)  # number of distinct word strings, compared case-sensitively
print('VOCAB_SIZE:', VOCAB_SIZE)
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
# The tokenizer is fed whole strings, so each sentence (a list of words) is
# re-joined with single spaces first.
train_texts = [' '.join(sentence) for sentence in train_sentences_words]
test_texts = [' '.join(sentence) for sentence in test_sentences_words]

# Fit on the training split only; words not seen during fitting are encoded
# as the '__UNKNOWN__' token.
# NOTE(review): `lower` is left at its default here, unlike the tag tokenizer,
# which passes lower=False - confirm that case folding of words is intended.
words_tokenizer = Tokenizer(num_words=VOCAB_SIZE, filters=[], oov_token='__UNKNOWN__')
words_tokenizer.fit_on_texts(train_texts)

# word_index is the tokenizer's own dict; index 0 is registered in it under a
# '__PADDING__' name so that the reverse map can decode padded positions.
word_index = words_tokenizer.word_index
word_index['__PADDING__'] = 0
index_word = {idx: token for token, idx in word_index.items()}
print('Unique tokens:', len(word_index))

# Encode both splits as integer sequences and pad them to MAX_LEN.
train_sequences = words_tokenizer.texts_to_sequences(train_texts)
test_sequences = words_tokenizer.texts_to_sequences(test_texts)
train_sequences_padded = pad_sequences(train_sequences, maxlen=MAX_LEN)
test_sequences_padded = pad_sequences(test_sequences, maxlen=MAX_LEN)
print(train_sequences_padded.shape, test_sequences_padded.shape)
# Tokenizer for the tag sequences. Tags are case-sensitive and contain '-',
# so lower-casing and character filtering are both disabled.
#
# num_words must be len(tags) + 2: the tokenizer only emits indices strictly
# below num_words, index 0 is reserved (used for padding here) and index 1 is
# given to the OOV token. With num_words=len(tags) the two least frequent
# tags would be silently encoded as '__UNKNOWN__'.
tags_tokenizer = Tokenizer(num_words=len(tags) + 2, filters='', oov_token='__UNKNOWN__', lower=False)
tags_tokenizer.fit_on_texts(map(lambda s: ' '.join(s), train_sentences_tags))

# tag_index is the tokenizer's own dict; register index 0 under a
# '__PADDING__' name so that padded positions can be decoded.
tag_index = tags_tokenizer.word_index
tag_index['__PADDING__'] = 0
index_tag = {i: w for w, i in tag_index.items()}

# Copy of the reverse map in which the padding index decodes to '0'.
# NOTE(review): this is the digit '0'; if the letter 'O' tag was meant,
# confirm against the code that consumes this mapping.
index_tag_wo_padding = dict(index_tag)
index_tag_wo_padding[tag_index['__PADDING__']] = '0'
print('Unique tags:', len(tag_index))

# Encode and pad the tag sequences exactly like the word sequences.
train_tags = tags_tokenizer.texts_to_sequences(map(lambda s: ' '.join(s), train_sentences_tags))
test_tags = tags_tokenizer.texts_to_sequences(map(lambda s: ' '.join(s), test_sentences_tags))
train_tags_padded = pad_sequences(train_tags, maxlen=MAX_LEN)
test_tags_padded = pad_sequences(test_tags, maxlen=MAX_LEN)

# Append a trailing axis of size 1 to both tag arrays.
train_tags_padded = np.expand_dims(train_tags_padded, -1)
test_tags_padded = np.expand_dims(test_tags_padded, -1)
print(train_tags_padded.shape, test_tags_padded.shape)

# Sanity check: print one training sentence decoded back to (word, tag) pairs.
for w, t in zip(train_sequences_padded[123], train_tags_padded[123]):
    print(index_word[w], index_tag[t[0]])
from keras.layers import Dense, Input, LSTM, Embedding, Bidirectional, Dropout
from keras.models import Model
from keras.initializers import Constant
# Parse the pre-trained vectors: each line is a word followed by its
# coefficients, separated by whitespace.
embeddings = {}
# The encoding is given explicitly so that the machine's default locale
# cannot change how the file is decoded (GloVe is distributed as UTF-8 text).
with open('glove.6B.50d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings[word] = coefs
print('# vectors:', len(embeddings))

# prepare embedding matrix
# One row per word index; rows of words without a pre-trained vector stay zero.
num_words = min(VOCAB_SIZE, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, 50))
for word, i in word_index.items():
    # Guard against indices that do not fit into the matrix (num_words is its
    # actual row count and is never larger than VOCAB_SIZE).
    if i >= num_words:
        continue
    embedding_vector = embeddings.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# NOTE(review): input_dim is VOCAB_SIZE here, while embedding_matrix has
# num_words rows. Whenever num_words < VOCAB_SIZE the Constant initializer
# cannot fill a (VOCAB_SIZE, 50) weight and applying the layer fails with a
# shape TypeError. Passing num_words as input_dim makes the two agree. The
# call is kept as posted because it reproduces the reported error.
pretrained_embedding_layer = Embedding(VOCAB_SIZE, 50, embeddings_initializer=Constant(embedding_matrix), input_length=MAX_LEN, trainable=False)
sequence_input = Input(shape=(MAX_LEN,), dtype='int32')
embedded_sequences = pretrained_embedding_layer(sequence_input)
...
I get the following error at the last statement:
TypeError: Eager execution of tf.constant with unsupported shape (value has 1441550 elements, shape is (31815, 50) with 1590750 elements).
However, I can run the code without any problem in Colab. How can I fix this problem?
It looks like the error is due to the Embedding layer. Probably Keras's Constant initializer does not work with TensorFlow's eager execution. If you are going to use trainable=False, I'd recommend using tf.nn.embedding_lookup, which is similar to Keras's Embedding API.
The error occurs because the `input_dim` of the Embedding layer (i.e. the vocabulary size) does not match the number of words (rows) in the embedding matrix passed to `embeddings_initializer`. In other words, to solve this issue you need to ensure that `embedding_matrix.shape[0] == input_dim`.
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.