Keras google word2vec CNN model InvalidArgumentError

Problem Description

I built a text classification model on class-imbalanced data. Instead of learning the word embeddings from scratch in Keras, I initialized the embedding layer with the pretrained GoogleNews word2vec vectors as a baseline.

import pandas as pd
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Activation, Embedding, SpatialDropout1D, Bidirectional, LSTM, Input, concatenate, Conv1D, GlobalMaxPooling1D, BatchNormalization


from keras.optimizers import SGD, Adam
from sklearn.model_selection import train_test_split

from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras import backend as K
from keras import metrics

import numpy as np
from itertools import chain
from collections import Counter
from sklearn.utils import shuffle

import nltk
import gensim
from gensim.models import KeyedVectors

from sklearn.utils import class_weight


dat = pd.read_csv('/home/data.csv',encoding='latin',delimiter='\t')

dat = shuffle(dat)
dat.reset_index(drop=True,inplace=True)

Since this is a class-imbalance problem, I used an F1 metric:

def f1_metric(y_true, y_pred):
    def recall(y_true, y_pred):
        """Recall metric.

        Only computes a batch-wise average of recall.

        Computes the recall, a metric for multi-label classification of
        how many relevant items are selected.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        recall = true_positives / (possible_positives + K.epsilon())
        return recall

    def precision(y_true, y_pred):
        """Precision metric.

        Only computes a batch-wise average of precision.

        Computes the precision, a metric for multi-label classification of
        how many selected items are relevant.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        precision = true_positives / (predicted_positives + K.epsilon())
        return precision
    precision = precision(y_true, y_pred)
    recall = recall(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))
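
For intuition, the same batch-wise formula can be reproduced in plain NumPy on a toy batch (the arrays below are made up for illustration, not taken from the dataset):

import numpy as np

# Toy batch: 1 = positive class, 0 = negative class.
y_true = np.array([1, 0, 1, 1, 0, 1], dtype=float)
y_pred = np.array([1, 0, 0, 1, 1, 1], dtype=float)  # already-rounded sigmoid outputs

tp = np.sum(y_true * y_pred)      # true positives: 3
precision = tp / np.sum(y_pred)   # 3 / 4 = 0.75
recall = tp / np.sum(y_true)      # 3 / 4 = 0.75
f1 = 2 * precision * recall / (precision + recall)
print(precision, recall, f1)      # 0.75 0.75 0.75

As the docstrings note, this is only a batch-wise approximation; the average of per-batch F1 scores is not the same as the F1 computed over the whole epoch.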

I tokenized the text and loaded the pretrained word vectors as below:

def preprocess(dat):
    return [nltk.word_tokenize(row) for row in dat]

x_train, x_test, y_train, y_test = train_test_split(dat.text, dat.labels, test_size=0.20)

X = preprocess(x_train)
model = KeyedVectors.load_word2vec_format('/home/user/Downloads/GoogleNews-vectors-negative300.bin', binary=True,limit=100000)

I use this function to map a token to its integer index in the word2vec model's vocabulary:

def word2idx(word):
    return model.wv.vocab[word].index

vocab_size, embedding_size = model.wv.syn0.shape
pretrained_weights = model.wv.syn0
print(vocab_size, embedding_size)
100000 300
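
A side note: wv.vocab and wv.syn0 belong to the gensim 3.x API and were removed in gensim 4.x in favor of key_to_index and vectors. A version-tolerant sketch of the same definitions (my addition, assuming the model object loaded above):

# gensim 4.x removed wv.vocab / wv.syn0; key_to_index / vectors replace them.
try:
    pretrained_weights = model.wv.syn0            # gensim 3.x
    def word2idx(word):
        return model.wv.vocab[word].index
except AttributeError:
    pretrained_weights = model.vectors            # gensim 4.x
    def word2idx(word):
        return model.key_to_index[word]
vocab_size, embedding_size = pretrained_weights.shape

Also remember that limit=100000 loads only the 100,000 most frequent words, so any other token is out of vocabulary.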

I created the index matrix, initialized with zeros:

max_sentence_len = 50
train_x = np.zeros([len(X), max_sentence_len], dtype=np.int32)

Then I replaced the zeros with the word2vec indices of the corresponding tokens, up to a maximum of 50 words per row:

for i in range(len(X)):
    for j in range(min(len(X[i]), max_sentence_len)):  # truncate at max_sentence_len tokens
        try:
            train_x[i][j] = word2idx(X[i][j])
        except KeyError:
            pass  # out-of-vocabulary token: leave the 0 padding index
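
Before training, it is worth sanity-checking that every index is a valid row of the embedding matrix; this check is my addition, not part of the original code:

# Every index must lie in [0, vocab_size) for the embedding lookup to succeed.
print(train_x.min(), train_x.max(), vocab_size)
assert train_x.min() >= 0 and train_x.max() < vocab_size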

I computed class weights with scikit-learn's helper, since the classes are imbalanced:

class_weights = class_weight.compute_class_weight('balanced',np.unique(y_train),y_train)
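
One caveat: compute_class_weight returns a NumPy array ordered by np.unique(y_train), while Keras's fit() expects class_weight as a dict mapping class index to weight; depending on the Keras version, passing the raw array can fail or be silently ignored. A minimal conversion (assuming the labels are the class indices themselves):

# fit() expects {class_label: weight}; pair the weights back with their classes.
class_weights = dict(zip(np.unique(y_train), class_weights))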

This is the function that builds the multi-branch convnet model:

def model_architecture(vocab_size, embedding_size, pretrained_weights):

    # vector-space embedding: 
    n_dim = 64
    n_unique_words = 5000 
    max_review_length = 50
    pad_type = trunc_type = 'pre'
    drop_embed = 0.2 

    # convolutional layer architecture:
    n_conv_1 = n_conv_2 = n_conv_3 = n_conv_4= 256
    k_conv_1 = 3
    k_conv_2 = 2
    k_conv_3 = 4
    k_conv_4 = 5

    # dense layer architecture: 
    n_dense = 256
    dropout = 0.2

    input_layer = Input(shape=(max_review_length,), dtype='int16', name='input') # supports integers +/- 32.7k

#    embedding_layer = Embedding(n_unique_words, n_dim, input_length=max_review_length, name='embedding')(input_layer)
    embedding_layer = Embedding(input_dim=vocab_size, output_dim=embedding_size, weights=[pretrained_weights], name='embedding')(input_layer)
    drop_embed_layer = SpatialDropout1D(drop_embed, name='drop_embed')(embedding_layer)

    conv_1 = Conv1D(n_conv_1, k_conv_1, activation='relu', name='conv_1')(drop_embed_layer)
    maxp_1 = GlobalMaxPooling1D(name='maxp_1')(conv_1)

    conv_2 = Conv1D(n_conv_2, k_conv_2, activation='relu', name='conv_2')(drop_embed_layer)
    maxp_2 = GlobalMaxPooling1D(name='maxp_2')(conv_2)

    conv_3 = Conv1D(n_conv_3, k_conv_3, activation='relu', name='conv_3')(drop_embed_layer)
    maxp_3 = GlobalMaxPooling1D(name='maxp_3')(conv_3)

    concat = concatenate([maxp_1, maxp_2, maxp_3])

    dense_layer = Dense(n_dense, activation='relu', name='dense')(concat)
    drop_dense_layer = Dropout(dropout, name='drop_dense')(dense_layer)
    dense_2 = Dense(64, activation='relu', name='dense_2')(drop_dense_layer)
    dropout_2 = Dropout(dropout, name='drop_dense_2')(dense_2)

    predictions = Dense(units=1, activation='sigmoid', name='output')(dropout_2)
    model = Model(input_layer, predictions)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=[f1_metric])
    return model
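
An optional variation that is not in the original code: with pretrained vectors it is common to freeze the embedding layer so the word2vec weights are not updated during training. The embedding line above would become:

# trainable=False keeps the pretrained word2vec rows fixed during training.
embedding_layer = Embedding(input_dim=vocab_size, output_dim=embedding_size,
                            weights=[pretrained_weights], trainable=False,
                            name='embedding')(input_layer)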

I then build and fit the model:

mod_keras = model_architecture(vocab_size, embedding_size, pretrained_weights)

mod_keras.fit(train_x,y_train,batch_size=32,epochs=2,verbose=1,validation_split=0.2,class_weight=class_weights)

When I run this, I get the error below.

Train on 287895 samples, validate on 71974 samples
Epoch 1/2
---------------------------------------------------------------------------
InvalidArgumentError                      Traceback (most recent call last)
<ipython-input-25-fcb6fa008311> in <module>
----> 1 mod_keras.fit(train_x,y_train,batch_size=32,epochs=2,verbose=1,validation_split=0.2,class_weight=class_weights)

~/.local/lib/python3.5/site-packages/keras/engine/training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, **kwargs)
   1037                                         initial_epoch=initial_epoch,
   1038                                         steps_per_epoch=steps_per_epoch,
-> 1039                                         validation_steps=validation_steps)
   1040 
   1041     def evaluate(self, x=None, y=None,

~/.local/lib/python3.5/site-packages/keras/engine/training_arrays.py in fit_loop(model, f, ins, out_labels, batch_size, epochs, verbose, callbacks, val_f, val_ins, shuffle, callback_metrics, initial_epoch, steps_per_epoch, validation_steps)
    197                     ins_batch[i] = ins_batch[i].toarray()
    198 
--> 199                 outs = f(ins_batch)
    200                 outs = to_list(outs)
    201                 for l, o in zip(out_labels, outs):

~/.local/lib/python3.5/site-packages/keras/backend/tensorflow_backend.py in __call__(self, inputs)
   2713                 return self._legacy_call(inputs)
   2714 
-> 2715             return self._call(inputs)
   2716         else:
   2717             if py_any(is_tensor(x) for x in inputs):

~/.local/lib/python3.5/site-packages/keras/backend/tensorflow_backend.py in _call(self, inputs)
   2673             fetched = self._callable_fn(*array_vals, run_metadata=self.run_metadata)
   2674         else:
-> 2675             fetched = self._callable_fn(*array_vals)
   2676         return fetched[:len(self.outputs)]
   2677 

~/.local/lib/python3.5/site-packages/tensorflow/python/client/session.py in __call__(self, *args, **kwargs)
   1437           ret = tf_session.TF_SessionRunCallable(
   1438               self._session._session, self._handle, args, status,
-> 1439               run_metadata_ptr)
   1440         if run_metadata:
   1441           proto_data = tf_session.TF_GetBuffer(run_metadata_ptr)

~/.local/lib/python3.5/site-packages/tensorflow/python/framework/errors_impl.py in __exit__(self, type_arg, value_arg, traceback_arg)
    526             None, None,
    527             compat.as_text(c_api.TF_Message(self.status.status)),
--> 528             c_api.TF_GetCode(self.status.status))
    529     # Delete the underlying status object from memory otherwise it stays alive
    530     # as there is a reference to status from this from the traceback due to

InvalidArgumentError: indices[26,0] = -3338 is not in [0, 100000)
     [[{{node embedding/embedding_lookup}} = GatherV2[Taxis=DT_INT32, Tindices=DT_INT32, Tparams=DT_FLOAT, _class=["loc:@training/Adam/Assign_2"], _device="/job:localhost/replica:0/task:0/device:CPU:0"](embedding/embeddings/read, embedding/Cast, training/Adam/gradients/embedding/embedding_lookup_grad/concat/axis)]]

I did read this post: InvalidArgumentError (see above for traceback): indices[1] = 10 is not in [0, 10)

As per that post, I need to set the vocabulary size. In my case, that is exactly what I have done via the vocab_size parameter.

Tags: python, keras, deep-learning, word2vec, word-embedding

Solution
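
The most likely culprit is the dtype of the input layer, not the vocabulary size. Input is declared with dtype='int16', which can only represent values up to 32767, while vocab_size is 100000. When Keras casts the int32 train_x into that int16 placeholder, every index above 32767 wraps around to a negative number: the offending index was presumably 62198, which becomes 62198 - 65536 = -3338, exactly the value in the error message. Widening the input dtype should fix it:

# int32 covers the whole index range [0, 100000); int16 overflows above 32767.
input_layer = Input(shape=(max_review_length,), dtype='int32', name='input')

Alternatively, loading the vectors with a limit below 32768 would also avoid the overflow, but widening the dtype is the cleaner fix.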

