python - 如何将 kfold.split() 应用于列表字典?
问题描述
我想通过使用 CrossValidation 来训练 Keras 模型,但我的数据是列表的字典。
我想做 10 折交叉验证:每个验证步骤取 dict 键中 10% 的子集作为验证集,下一步再取接下来的 10%(并且打乱顺序)。
示例:对于第一个验证步骤:
# First validation step: nine keys are used for training, the remaining key for validation.
# list1 ... list10 are placeholders for the real list values.
pairs_train = {
    '0': list1,
    '1': list2,
    '2': list3,
    '3': list4,
    '4': list5,
    '5': list6,
    '6': list7,
    '7': list8,
    '8': list9,
}
pairs_val = {
    '9': list10,
}
这是我的函数:
def crossValidation(self, k_folds=10):
    """Fit and evaluate ``self.model`` once per cross-validation fold.

    The module-level ``pairs`` dict (values are lists) is wrapped in a
    ``pd.Series`` so that the index arrays produced by ``kfold.split`` can
    select whole dict entries for the training and validation generators.

    Args:
        k_folds: Number of folds shown in the progress banner.
            NOTE(review): the split itself comes from the externally defined
            ``kfold`` object; confirm it was built with the same number of
            splits.

    Returns:
        A tuple ``(cv_accuracy_train, cv_accuracy_val, cv_loss_train,
        cv_loss_val)``; each list holds the last-epoch value of the
        corresponding history entry (``mae``, ``val_mae``, ``loss``,
        ``val_loss``) for every fold.
    """
    cv_accuracy_train = []
    cv_accuracy_val = []
    cv_loss_train = []
    cv_loss_val = []

    s = pd.Series(pairs)

    def make_generator(subset):
        # Both generators share every setting except the subset of pairs.
        return DataGenerator(pairs=subset, batch_size=self.param_grid['batch_size'],
                             nr_files=len(self.Data.all_files), nr_tests=len(self.Data.all_tests),
                             negative_ratio=self.param_grid['negative_ratio'])

    # Steps are numbered from 1 so the banner reads "1/10" ... "10/10".
    for step, (train_idx, val_idx) in enumerate(kfold.split(s), start=1):
        print("=========================================")
        print("====== K Fold Validation step => %d/%d =======" % (step, k_folds))
        print("=========================================")

        # kfold.split yields integer positions, but the Series is indexed by
        # the dict's keys. Plain s[...] performs a label lookup and raises
        # KeyError ("None of [Int64Index(...)] are in the [index]"), so the
        # selection has to be positional.
        train_gen = make_generator(s.iloc[train_idx])
        val_gen = make_generator(s.iloc[val_idx])

        # Train
        # NOTE(review): the same self.model instance is fitted in every fold,
        # so later folds presumably start from weights that already saw their
        # validation data - consider re-creating the model per fold.
        h = self.model.fit(train_gen,
                           validation_data=val_gen,
                           epochs=self.param_grid['nb_epochs'],
                           verbose=2)

        # Keep only the value reached at the final epoch of this fold.
        cv_accuracy_train.append(np.array(h.history['mae'])[-1])
        cv_accuracy_val.append(np.array(h.history['val_mae'])[-1])
        cv_loss_train.append(np.array(h.history['loss'])[-1])
        cv_loss_val.append(np.array(h.history['val_loss'])[-1])

    return cv_accuracy_train, cv_accuracy_val, cv_loss_train, cv_loss_val
错误回溯(Traceback):
File "/Users/joaolousada/Documents/5ºAno/Master-Thesis/main/Prioritizer/Prioritizer.py", line 173, in crossValidation
train_gen = DataGenerator(pairs=s[train_idx], batch_size=self.param_grid['batch_size'],
File "/Users/joaolousada/opt/anaconda3/lib/python3.7/site-packages/pandas/core/series.py", line 908, in __getitem__
return self._get_with(key)
File "/Users/joaolousada/opt/anaconda3/lib/python3.7/site-packages/pandas/core/series.py", line 943, in _get_with
return self.loc[key]
File "/Users/joaolousada/opt/anaconda3/lib/python3.7/site-packages/pandas/core/indexing.py", line 879, in __getitem__
return self._getitem_axis(maybe_callable, axis=axis)
File "/Users/joaolousada/opt/anaconda3/lib/python3.7/site-packages/pandas/core/indexing.py", line 1099, in _getitem_axis
return self._getitem_iterable(key, axis=axis)
File "/Users/joaolousada/opt/anaconda3/lib/python3.7/site-packages/pandas/core/indexing.py", line 1037, in _getitem_iterable
keyarr, indexer = self._get_listlike_indexer(key, axis, raise_missing=False)
File "/Users/joaolousada/opt/anaconda3/lib/python3.7/site-packages/pandas/core/indexing.py", line 1254, in _get_listlike_indexer
self._validate_read_indexer(keyarr, indexer, axis, raise_missing=raise_missing)
File "/Users/joaolousada/opt/anaconda3/lib/python3.7/site-packages/pandas/core/indexing.py", line 1298, in _validate_read_indexer
raise KeyError(f"None of [{key}] are in the [{axis_name}]")
KeyError: "None of [Int64Index([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,\n ...\n 3257, 3258, 3261, 3262, 3263, 3265, 3266, 3267, 3268, 3269],\n dtype='int64', length=2943)] are in the [index]"
解决方案
假设有一个值为 list 的 dict,例如:
# Example input: ten string keys, each mapped to a short list of values.
pairs = {
    '0': [1, 2, 3],
    '1': [1, 2, 3],
    '2': [4, 6, 8],
    '3': [2, 1, 9],
    '4': [9, 7, 8],
    '5': [4, 6, 8],
    '6': [9, 7, 8],
    '7': [9, 7, 8],
    '8': [1, 2, 3],
    '9': [4, 6, 8],
}
以下函数返回一个列表,其中每个元素是一折的(训练键,验证键)二元组,可据此按键拆分字典:
def kfold_split(pairs: dict, perc: float, shuffle: bool) -> list:
    """Split the keys of *pairs* into disjoint train/validation folds.

    Each fold holds out ``int(len(pairs) * perc)`` keys for validation and
    uses all remaining keys for training. Folds are cut from the end of the
    key sequence towards the start; if the number of keys is not a multiple
    of the fold size, the leftover keys at the start are never held out.

    Args:
        pairs: Dictionary whose keys are to be split.
        perc: Fraction of the keys held out in each fold (e.g. ``0.2``).
        shuffle: If true, the keys are randomly permuted once (using NumPy's
            global random state) before being cut into folds, so the
            validation folds stay disjoint. If false, the dictionary's own
            key order is used.

    Returns:
        A list of ``(train_keys, validation_keys)`` tuples, one per fold.
        Returns an empty list for an empty dictionary.

    Raises:
        ValueError: If *perc* is so small that a fold would contain no keys.
    """
    keys = list(pairs)
    if not keys:
        return []

    fold_size = int(len(keys) * perc)
    if fold_size < 1:
        raise ValueError(
            "perc=%r selects no keys out of %d; increase perc" % (perc, len(keys))
        )

    if shuffle:
        # Permute positions rather than the keys themselves so the keys keep
        # their original Python type (np.random.choice on the keys would
        # return NumPy scalars and, by default, sample with replacement).
        order = np.random.permutation(len(keys))
        keys = [keys[i] for i in order]

    n_folds = len(keys) // fold_size
    indices = []
    for fold in range(n_folds):
        # Walk backwards through the key sequence in fold_size-wide windows.
        stop = len(keys) - fold * fold_size
        fold_keys = keys[stop - fold_size:stop]
        held_out = set(fold_keys)
        # A list comprehension keeps the training keys in a deterministic order.
        other_keys = [k for k in keys if k not in held_out]
        indices.append((other_keys, fold_keys))
    return indices
您可以得到随机划分的键:
kfold_split(pairs, perc=.2, shuffle=True)
>>>
[(['6', '2', '1', '5', '4', '7', '0', '3'], ['9', '8']),
(['6', '1', '9', '5', '4', '7', '0', '3'], ['8', '2']),
(['2', '1', '8', '9', '5', '4', '7', '3'], ['6', '0']),
(['2', '8', '9', '5', '4', '7', '0', '3'], ['1', '6']),
(['6', '2', '8', '5', '4', '7', '0', '3'], ['9', '1'])]
或者按原始顺序划分的键:
kfold_split(pairs, perc=.2, shuffle=False)
>>>
[(['6', '2', '1', '5', '4', '7', '0', '3'], ['8', '9']),
(['2', '1', '8', '9', '5', '4', '0', '3'], ['6', '7']),
(['6', '2', '1', '8', '9', '7', '0', '3'], ['4', '5']),
(['6', '1', '8', '9', '5', '4', '7', '0'], ['2', '3']),
(['6', '2', '8', '9', '5', '4', '7', '3'], ['0', '1'])]
然后您可以根据这些索引(即 kfold_split 的返回值,下面记为 result)过滤您的字典,如下所示:
# `result` is expected to hold the list of (train_keys, test_keys) tuples
# produced by kfold_split(pairs, ...).
for train_indices, test_indices in result:
    # Sets make the membership checks below O(1) per key.
    train_keys = set(train_indices)
    test_keys = set(test_indices)
    # Filter dict by indices
    pair_test = {k: v for k, v in pairs.items() if k in test_keys}
    # Train data: keep the keys that ARE in the training split (the original
    # `not in` condition selected the validation keys instead).
    pair_train = {k: v for k, v in pairs.items() if k in train_keys}
    # Some other stuff here
推荐阅读
- google-apps-script - 如何使用 Google Apps 脚本检索用户为 Google 表单中的其他字段输入的值?
- php - Nginx 502 坏网关 + PHP 7.2 FPM
- python - 重塑 pandas 数据框(DataFrame)
- html - 如何在 html 中为 AngularJS 中的多个 json 值添加公共标题
- python - a.transpose().ravel()[0]=x 不能改变numpy中原始数组的值?
- excel - 在 Numbers/Excel 中,如何仅对匹配特定条件的行执行 SUMPRODUCT
- javascript - 如何对所有分页项目应用过滤器并仅显示过滤后的项目?
- apache-spark - pyspark creating BlockMatrix from matrices of different size
- r - Shiny 图表(plot)没有显示
- javascript - 如何在没有 jquery 的情况下制作滑块图片库