首页 > 解决方案 > Python throwing list index out of range error for data generator Tensorflow Keras Functional API

问题描述

我正在使用 Keras 功能 API 制作英语到西班牙语的翻译程序,我的第一个错误是正在加载的数据导致 RAM 过载。我通过添加 DataGenerator 修复了这个问题(或者更确切地说,我正在尝试修复它)类一次只能分批加载 32 个数据点,而不是完整的 112k 数据点。我遇到的问题是代码由于 List Index out of Range 错误而不断崩溃,我不确定它来自哪里。我知道模型本身的代码很好,因为它可以正确编译,但是当它即将开始训练时,它会崩溃。我尝试过移动代码,更改生成器的加载方式,我认为甚至更改了 DataGen 输出的暗淡,但它一直在崩溃。

确切的错误信息:

---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
<ipython-input-5-cad2048bcf92> in <module>
    132 list_ids = sorted(np.load('../input/spa-eng-separated/archive/list_ids.npy'))
    133 train_gen = DataGenerator(list_ids, n_eng, n_es)
--> 134 ED2.train(train_gen, EPOCHS)
    135 
    136 test_string = "hi how are you doing"

<ipython-input-4-5d44382e543b> in train(self, train_gen, epochs)
     48             x=train_gen, # train_gen will return tuple ([encoder_in, decoder_in], decoder_out)
     49             epochs=epochs,
---> 50             verbose = 2#,
     51 #             workers = 6,
     52 #             use_multiprocessing = True

/opt/conda/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py in _method_wrapper(self, *args, **kwargs)
    106   def _method_wrapper(self, *args, **kwargs):
    107     if not self._in_multi_worker_mode():  # pylint: disable=protected-access
--> 108       return method(self, *args, **kwargs)
    109 
    110     # Running inside `run_distribute_coordinator` already.

/opt/conda/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_batch_size, validation_freq, max_queue_size, workers, use_multiprocessing)
   1061           use_multiprocessing=use_multiprocessing,
   1062           model=self,
-> 1063           steps_per_execution=self._steps_per_execution)
   1064 
   1065       # Container that configures and calls `tf.keras.Callback`s.

/opt/conda/lib/python3.7/site-packages/tensorflow/python/keras/engine/data_adapter.py in __init__(self, x, y, sample_weight, batch_size, steps_per_epoch, initial_epoch, epochs, shuffle, class_weight, max_queue_size, workers, use_multiprocessing, model, steps_per_execution)
   1115         use_multiprocessing=use_multiprocessing,
   1116         distribution_strategy=ds_context.get_strategy(),
-> 1117         model=model)
   1118 
   1119     strategy = ds_context.get_strategy()

/opt/conda/lib/python3.7/site-packages/tensorflow/python/keras/engine/data_adapter.py in __init__(self, x, y, sample_weights, shuffle, workers, use_multiprocessing, max_queue_size, model, **kwargs)
    914         max_queue_size=max_queue_size,
    915         model=model,
--> 916         **kwargs)
    917 
    918   @staticmethod

/opt/conda/lib/python3.7/site-packages/tensorflow/python/keras/engine/data_adapter.py in __init__(self, x, y, sample_weights, workers, use_multiprocessing, max_queue_size, model, **kwargs)
    794           lambda x: model(x, training=False), args=(concrete_x,))
    795 
--> 796     self._first_batch_size = int(nest.flatten(peek)[0].shape[0])
    797 
    798     def _get_dynamic_shape(t):

/opt/conda/lib/python3.7/site-packages/tensorflow/python/framework/tensor_shape.py in __getitem__(self, key)
    885       else:
    886         if self._v2_behavior:
--> 887           return self._dims[key].value
    888         else:
    889           return self._dims[key]

IndexError: list index out of range

我所能弄清楚的是,由于与 model.fit() 和火车生成器有关的某些东西,代码正在崩溃。代码如下。

import tensorflow.keras as keras
import numpy as np

"""
Borrowed code from https://stanford.edu/~shervine/blog/keras-how-to-generate-data-on-the-fly
(modified some of it but used that site for base)
"""

class DataGenerator(keras.utils.Sequence):
# class DataGenerator():
# tried removing the superclass keras.utils.Sequence

    def __init__(self, list_IDs, n_en, n_es, batch_size=32, shuffle=True):
        # Initializing the generator
        
#         self.dim=dim 
        self.batch_size=batch_size
        self.dim = (32, 51, 53)
#         self.labels=labels
        self.list_id=list_IDs
#         self.n_chan=n_channels
#         self.n_class=n_classes
        self.shuffle=shuffle
        
        self.n_eng = n_en
        self.n_esp = n_es
        
        self.on_epoch_end()
        
    def on_epoch_end(self):
        self.idx = np.arange(len(self.list_id))
        
        # Makes sure that batches change between epochs
        if self.shuffle:
            np.random.shuffle(self.idx)
            
    def __len__(self):
        return int(np.floor(len(self.list_id)) // self.batch_size) # n_epochs per batch | the // operator divides but turns it into an int as well w/o the explicit int(x) call
    
    def __getitem__(self, idx):
        # generate indices of bat
        idxs = self.idx[(idx*self.batch_size):((idx+1)*self.batch_size)]
    
        temp_idx = [self.list_id[k] for k in idxs]
        
        enc, d_in, d_out = self.__data_generation(temp_idx) 
        
        return ([enc, d_in], d_out)
    
    def __data_generation(self, temp_ids):
            
        """
        The way this will work is basically by loading in the data at the id in a saved np file
        Then we'll read it in as the 2d matrix (a list of tensors where each tensor is a list of indices/values) and transform it into a 3d matrix in the style of the sep_data method
        """
        
#         en = np.empty((self.bat, *self.dim, self.n_chan))
#         es = np.empty((self.bat, *self.dim, self.n_chan))
        en = []
        es = []
        
        # data gen
        for i, ID in enumerate(temp_ids):
            # load samples
            en.append(np.load('../input/spa-eng-separated/archive/' + str(ID) + '_en.npy'))
            es.append(np.load('../input/spa-eng-separated/archive/' + str(ID) + '_es.npy'))
            
        enc_in = np.ndarray([])
        dec_in = np.ndarray([])
        dec_out = np.ndarray([])
        for i, x in enumerate(en):
            e_temp, d_temp, o_temp = sep_data(en[i], es[i], self.n_eng, self.n_esp)

            np.append(enc_in, e_temp)
            np.append(dec_in, d_temp)
            np.append(dec_out, o_temp)
            # separating the data into an encoder inp, decoder inp, decoder out

        enc_in.astype('int')
        dec_in.astype('int')
        dec_out.astype('int')

        return enc_in, dec_in, dec_out

import tensorflow.keras as keras
import keras.layers as layers
import numpy as np

class EnDe2():
    def __init__(self, inp_size, targ_size, embedding_dim, units, bat, input_vocab, target_vocab):
        
        self.bat_size = bat
        
        # input -> encoder embedding -> encoder GRU -> 
        # save states -> decoder input -> decoder GRU w encoder states -> 
        # decoder LSTM -> dense w softmax activ
        
        
        enc_in = layers.Input(shape=(None,))
#         print(input_vocab, embedding_dim)
        enc_out = layers.Embedding(input_dim=input_vocab+1, output_dim=embedding_dim//2)(enc_in)
        enc_out, state = layers.GRU(units//2, 
                                    return_state=True,
                                    recurrent_initializer='glorot_uniform')(enc_out)

        dec_in = layers.Input(shape=(None,))
#         print(target_vocab, embedding_dim
        dec_out = layers.Embedding(input_dim=target_vocab+1, output_dim=embedding_dim//2)(dec_in)
        dec_out = layers.GRU(units//2)(dec_out, initial_state=state)
        dec_out = layers.Dense(targ_size, activation='softmax')(dec_out)
        
        self.model = keras.models.Model([enc_in, dec_in], dec_out)
        
        self.model.compile(optimizer='adam', loss='categorical_crossentropy')
        self.model.summary()
        
    def train(self, train_gen, epochs):
        self.model.compile(
            optimizer="rmsprop", 
            loss="categorical_crossentropy", 
            metrics=["accuracy"]
        )
#         self.model.fit(
#             [e, d_in],
#             d_out,
#             batch_size=self.bat_size,
#             epochs=epochs,
#             validation_split=0.2, 
#         )
    
        self.model.fit(
            x=train_gen, # train_gen will return tuple ([encoder_in, decoder_in], decoder_out)
            epochs=epochs, 
#             verbose = 2#,
            workers = 6,
            use_multiprocessing = True
            
        )
    
#         self.model.fit_generator(
#             generator=train_gen, 
#             validation_data=eval_gen, 
#             use_multiprocessing=True,
#             workers=6
#         )

        self.model.save("./s2s")
        
        

我正在尝试使用 Seq2Seq 模型(编码器解码器),除了单词而不是基本字母。抱歉,如果上面的代码难以阅读 - 这是我用来运行的在线笔记本的(链接)[https://www.kaggle.com/aaravgupta4/engtospanish-translation/edit?rvi=1]代码(并将其划分为单元格;https://www.kaggle.com/aaravgupta4/engtospanish-translation/edit?rvi=1)。

我的猜测是超类(keras.utils.Sequence)需要 DataGenerator 子类具有 dims 属性,但我在文档中找不到任何内容,最重要的是,我不记得添加了 dims 变量到 DataGenerator 类的帮助。

任何帮助,将不胜感激。谢谢!

标签: pythontensorflowkerasdeep-learning

解决方案


推荐阅读