python - Keras google word2vec CNN model InvalidArgumentError
问题描述
I built a text classification model for imbalanced class classification data. Instead of using keras word vector, i used embedding by using googlenews word2vec vector as the baseline in the embedding layer.
import pandas as pd
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Activation, Embedding, SpatialDropout1D, Bidirectional, LSTM, Input, concatenate, Conv1D, GlobalMaxPooling1D, BatchNormalization
from keras.optimizers import SGD, Adam
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
import keras.backend as K
from keras import backend as K
from keras import metrics
import numpy as np
from itertools import chain
from collections import Counter
from sklearn.utils import shuffle
import nltk
import gensim
from gensim.models import KeyedVectors
from sklearn.utils import class_weight
dat = pd.read_csv('/home/data.csv',encoding='latin',delimiter='\t')
dat = shuffle(dat)
dat.reset_index(drop=True,inplace=True)
Since this is a class imbalance problem, i used f1 metric.
def f1_metric(y_true, y_pred):
def recall(y_true, y_pred):
"""Recall metric.
Only computes a batch-wise average of recall.
Computes the recall, a metric for multi-label classification of
how many relevant items are selected.
"""
true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
recall = true_positives / (possible_positives + K.epsilon())
return recall
def precision(y_true, y_pred):
"""Precision metric.
Only computes a batch-wise average of precision.
Computes the precision, a metric for multi-label classification of
how many selected items are relevant.
"""
true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
precision = true_positives / (predicted_positives + K.epsilon())
return precision
precision = precision(y_true, y_pred)
recall = recall(y_true, y_pred)
return 2*((precision*recall)/(precision+recall+K.epsilon()))
I processed text and created word vector as below
def preprocess(dat):
return [nltk.word_tokenize(row) for row in dat]
x_train, x_test, y_train, y_test= train_test_split(dat.text,dat.labels,test_size=0.20)
X = preprocess(x_train)
model = KeyedVectors.load_word2vec_format('/home/user/Downloads/GoogleNews-vectors-negative300.bin', binary=True,limit=100000)
I use this function to convert an array of text to number values from word2vec model.
def word2idx(word):
return model.wv.vocab[word].index
vocab_size, emdedding_size = model.wv.syn0.shape
pretrained_weights = model.wv.syn0
print(vocab_size, emdedding_size)
100000 300
I created the matrix
max_sentence_len = 50
train_x = np.zeros([len(X), max_sentence_len], dtype=np.int32)
And replace 0 with index values from word2vec model against corresponding tokenized words, upto maximum of 50 words.
for i in range(len(X)):
for j in range(len(X[i])):
try:
train_x[i][j] = word2idx(X[i][j])
except:
pass
I computed class weights using sklearn function since this is a class imbalance problem.
class_weights = class_weight.compute_class_weight('balanced',np.unique(y_train),y_train)
This is the function to create multiConvnet model.
def model_architecture(vocab_size,emdedding_size,pretrained_weights):
# vector-space embedding:
n_dim = 64
n_unique_words = 5000
max_review_length = 50
pad_type = trunc_type = 'pre'
drop_embed = 0.2
# convolutional layer architecture:
n_conv_1 = n_conv_2 = n_conv_3 = n_conv_4= 256
k_conv_1 = 3
k_conv_2 = 2
k_conv_3 = 4
k_conv_4 = 5
# dense layer architecture:
n_dense = 256
dropout = 0.2
input_layer = Input(shape=(max_review_length,), dtype='int16', name='input') # supports integers +/- 32.7k
# embedding_layer = Embedding(n_unique_words, n_dim, input_length=max_review_length, name='embedding')(input_layer)
embedding_layer = Embedding(input_dim=vocab_size, output_dim=emdedding_size, weights=[pretrained_weights], name='embedding')(input_layer)
drop_embed_layer = SpatialDropout1D(drop_embed, name='drop_embed')(embedding_layer)
conv_1 = Conv1D(n_conv_1, k_conv_1, activation='relu', name='conv_1')(drop_embed_layer)
maxp_1 = GlobalMaxPooling1D(name='maxp_1')(conv_1)
conv_2 = Conv1D(n_conv_2, k_conv_2, activation='relu', name='conv_2')(drop_embed_layer)
maxp_2 = GlobalMaxPooling1D(name='maxp_2')(conv_2)
conv_3 = Conv1D(n_conv_3, k_conv_3, activation='relu', name='conv_3')(drop_embed_layer)
maxp_3 = GlobalMaxPooling1D(name='maxp_3')(conv_3)
concat = concatenate([maxp_1, maxp_2, maxp_3])
dense_layer = Dense(n_dense, activation='relu', name='dense')(concat)
drop_dense_layer = Dropout(dropout, name='drop_dense')(dense_layer)
dense_2 = Dense(64, activation='relu', name='dense_2')(drop_dense_layer)
dropout_2 = Dropout(dropout, name='drop_dense_2')(dense_2)
predictions = Dense(units=1, activation='sigmoid', name='output')(dropout_2)
model = Model(input_layer, predictions)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=[f1_metric])
return model
My model is below
mod_keras = model_architecture(vocab_size,emdedding_size,pretrained_weights)
mod_keras.fit(train_x,y_train,batch_size=32,epochs=2,verbose=1,validation_split=0.2,class_weight=class_weights)
when i run this, i am getting below error.
Train on 287895 samples, validate on 71974 samples
Epoch 1/2
---------------------------------------------------------------------------
InvalidArgumentError Traceback (most recent call last)
<ipython-input-25-fcb6fa008311> in <module>
----> 1 mod_Access.fit(train_x,y_train_Access,batch_size=32,epochs=2,verbose=1,validation_split=0.2,class_weight=class_weights)
~/.local/lib/python3.5/site-packages/keras/engine/training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, **kwargs)
1037 initial_epoch=initial_epoch,
1038 steps_per_epoch=steps_per_epoch,
-> 1039 validation_steps=validation_steps)
1040
1041 def evaluate(self, x=None, y=None,
~/.local/lib/python3.5/site-packages/keras/engine/training_arrays.py in fit_loop(model, f, ins, out_labels, batch_size, epochs, verbose, callbacks, val_f, val_ins, shuffle, callback_metrics, initial_epoch, steps_per_epoch, validation_steps)
197 ins_batch[i] = ins_batch[i].toarray()
198
--> 199 outs = f(ins_batch)
200 outs = to_list(outs)
201 for l, o in zip(out_labels, outs):
~/.local/lib/python3.5/site-packages/keras/backend/tensorflow_backend.py in __call__(self, inputs)
2713 return self._legacy_call(inputs)
2714
-> 2715 return self._call(inputs)
2716 else:
2717 if py_any(is_tensor(x) for x in inputs):
~/.local/lib/python3.5/site-packages/keras/backend/tensorflow_backend.py in _call(self, inputs)
2673 fetched = self._callable_fn(*array_vals, run_metadata=self.run_metadata)
2674 else:
-> 2675 fetched = self._callable_fn(*array_vals)
2676 return fetched[:len(self.outputs)]
2677
~/.local/lib/python3.5/site-packages/tensorflow/python/client/session.py in __call__(self, *args, **kwargs)
1437 ret = tf_session.TF_SessionRunCallable(
1438 self._session._session, self._handle, args, status,
-> 1439 run_metadata_ptr)
1440 if run_metadata:
1441 proto_data = tf_session.TF_GetBuffer(run_metadata_ptr)
~/.local/lib/python3.5/site-packages/tensorflow/python/framework/errors_impl.py in __exit__(self, type_arg, value_arg, traceback_arg)
526 None, None,
527 compat.as_text(c_api.TF_Message(self.status.status)),
--> 528 c_api.TF_GetCode(self.status.status))
529 # Delete the underlying status object from memory otherwise it stays alive
530 # as there is a reference to status from this from the traceback due to
InvalidArgumentError: indices[26,0] = -3338 is not in [0, 100000)
[[{{node embedding/embedding_lookup}} = GatherV2[Taxis=DT_INT32, Tindices=DT_INT32, Tparams=DT_FLOAT, _class=["loc:@training/Adam/Assign_2"], _device="/job:localhost/replica:0/task:0/device:CPU:0"](embedding/embeddings/read, embedding/Cast, training/Adam/gradients/embedding/embedding_lookup_grad/concat/axis)]]
I did read this post InvalidArgumentError (see above for traceback): indices[1] = 10 is not in [0, 10)
As per this post i need to set vocabulary. In my case, this is exactly what i have done by using the parameter vocab_size
.
解决方案
推荐阅读
- linux - 如何防止 docker 在错误时停止/删除容器
- c# - 动态创建的用户控件内的事件不会触发
- ruby - Ruby - 按列对 CSV 文件中的数据进行分组
- sails.js - 在 Sail.js 的控制器上创建用户定义的角色和动态应用 ACL 的最佳方法是什么?
- android - 在 libGDX 中缩放相机不保存尺寸
- android - Kotlin 中的图像共享意图问题
- azure-storage - 在 Azure Functions 中管理大量应用程序设置
- c++ - Eigen:如果我只能计算 Aty 和 Ax,是否可以创建类似 LeastSquareDiagonalPreconditioner 的调节器?
- windows - 可以在 Windows 上检测到文件副本吗?
- qt - Qt:如何找到鼠标点击相对于图像的位置