首页 > 解决方案 > 基于内部值的 Numpy 数组操作

问题描述

我正在尝试完成一项奇怪的任务。我需要在不使用 sklearn 的情况下完成以下操作,最好使用 numpy:

  1. 给定一个数据集,将数据分成 5 个相等的“折叠”或分区
  2. 在每个分区内,将数据拆分为“训练”和“测试”集,拆分为 80/20
  3. 难点在于:您的数据集带有类别标签。以一个有 100 个实例的数据集为例,A 类有 33 个样本,B 类有 67 个样本。我应该创建 5 个折叠,每个折叠包含 20 个数据实例,其中每个折叠里 A 类有 6 或 7 个(约 1/3),其余为 B 类

我的问题是:虽然我能够把数据适当地拆分成折叠,但我不知道如何正确地为每个折叠返回一个测试集和一个训练集;更重要的是,我不知道如何正确划分每个类的元素数量。

我当前的代码如下。我卡住的地方已在注释中标出:

import numpy

def csv_to_array(file):
    """Load a comma-separated file of numbers into a numpy array.

    Args:
        file: Path of the CSV file to read.

    Returns:
        The float array built by ``numpy.loadtxt`` from the file contents.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: Raised by ``numpy.loadtxt`` when a field cannot be
            parsed as a number.
    """
    # The context manager closes the handle even if parsing fails.
    with open(file, 'r') as handle:
        # loadtxt already yields floats, so no per-row float() pass is
        # needed. Such a pass would also break on 1-D results (single-row
        # or single-column files), where each element is a scalar rather
        # than a row.
        return numpy.loadtxt(handle, delimiter=',')

def five_cross_fold_validation(dataset):
    """Shuffle ``dataset`` and cut it into five folds, each split 80/20.

    The rows are shuffled in place, so the caller's array is reordered.
    The first four folds each hold ``rows // 5`` rows and the last fold
    also takes any remainder. This function does not balance class labels
    across folds; the shuffle only makes the folds random.

    Args:
        dataset: numpy array with one sample per row.

    Returns:
        A list of five ``[training, testing]`` pairs. ``training`` holds the
        first 80% of the fold's rows (rounded down) and ``testing`` the
        rest. Both are slices of the shuffled ``dataset``.
    """
    numpy.random.shuffle(dataset)
    fold_size = dataset.shape[0] // 5

    folds = []
    for k in range(5):
        start = k * fold_size
        # The last fold runs to the end so leftover rows are not dropped.
        stop = start + fold_size if k < 4 else None
        fold = dataset[start:stop]

        cut = int(fold.shape[0] * .8)
        # ndarrays cannot be emptied or appended to in place, so each fold
        # is stored as a fresh [training, testing] pair instead.
        folds.append([fold[:cut], fold[cut:]])

    return folds

def confirm_size(folds):
    """Return the sum of ``len(fold)`` over every fold in ``folds``."""
    return sum(map(len, folds))


def main():
    """Load the ecoli CSV, build the folds and print the size checks."""
    print("BEGINNING CFV")
    dataset = csv_to_array('Classification/ecoli.csv')
    # Row count before folding, for comparison with the total below.
    print(len(dataset))
    print(confirm_size(five_cross_fold_validation(dataset)))

# Run the demo as soon as this file is executed.
main()

此外,作为参考,我附上了我正在使用的 csv(它是对UCI Ecoli Dataset的修改。)这里的类是最后一列中的值。所以 0, 1, 2, 3, 4。重要的是要注意每个类别的数量不相等。

        0.61,0.45,0.48,0.5,0.48,0.35,0.41,0
        0.17,0.38,0.48,0.5,0.45,0.42,0.5,0
        0.44,0.35,0.48,0.5,0.55,0.55,0.61,0
        0.43,0.4,0.48,0.5,0.39,0.28,0.39,0
        0.42,0.35,0.48,0.5,0.58,0.15,0.27,0
        0.23,0.33,0.48,0.5,0.43,0.33,0.43,0
        0.37,0.52,0.48,0.5,0.42,0.42,0.36,0
        0.29,0.3,0.48,0.5,0.45,0.03,0.17,0
        0.22,0.36,0.48,0.5,0.35,0.39,0.47,0
        0.23,0.58,0.48,0.5,0.37,0.53,0.59,0
        0.47,0.47,0.48,0.5,0.22,0.16,0.26,0
        0.54,0.47,0.48,0.5,0.28,0.33,0.42,0
        0.51,0.37,0.48,0.5,0.35,0.36,0.45,0
        0.4,0.35,0.48,0.5,0.45,0.33,0.42,0
        0.44,0.34,0.48,0.5,0.3,0.33,0.43,0
        0.44,0.49,0.48,0.5,0.39,0.38,0.4,0
        0.43,0.32,0.48,0.5,0.33,0.45,0.52,0
        0.49,0.43,0.48,0.5,0.49,0.3,0.4,0
        0.47,0.28,0.48,0.5,0.56,0.2,0.25,0
        0.32,0.33,0.48,0.5,0.6,0.06,0.2,0
        0.34,0.35,0.48,0.5,0.51,0.49,0.56,0
        0.35,0.34,0.48,0.5,0.46,0.3,0.27,0
        0.38,0.3,0.48,0.5,0.43,0.29,0.39,0
        0.38,0.44,0.48,0.5,0.43,0.2,0.31,0
        0.41,0.51,0.48,0.5,0.58,0.2,0.31,0
        0.34,0.42,0.48,0.5,0.41,0.34,0.43,0
        0.51,0.49,0.48,0.5,0.53,0.14,0.26,0
        0.25,0.51,0.48,0.5,0.37,0.42,0.5,0
        0.29,0.28,0.48,0.5,0.5,0.42,0.5,0
        0.25,0.26,0.48,0.5,0.39,0.32,0.42,0
        0.24,0.41,0.48,0.5,0.49,0.23,0.34,0
        0.17,0.39,0.48,0.5,0.53,0.3,0.39,0
        0.04,0.31,0.48,0.5,0.41,0.29,0.39,0
        0.61,0.36,0.48,0.5,0.49,0.35,0.44,0
        0.34,0.51,0.48,0.5,0.44,0.37,0.46,0
        0.28,0.33,0.48,0.5,0.45,0.22,0.33,0
        0.4,0.46,0.48,0.5,0.42,0.35,0.44,0
        0.23,0.34,0.48,0.5,0.43,0.26,0.37,0
        0.37,0.44,0.48,0.5,0.42,0.39,0.47,0
        0,0.38,0.48,0.5,0.42,0.48,0.55,0
        0.39,0.31,0.48,0.5,0.38,0.34,0.43,0
        0.3,0.44,0.48,0.5,0.49,0.22,0.33,0
        0.27,0.3,0.48,0.5,0.71,0.28,0.39,0
        0.17,0.52,0.48,0.5,0.49,0.37,0.46,0
        0.36,0.42,0.48,0.5,0.53,0.32,0.41,0
        0.3,0.37,0.48,0.5,0.43,0.18,0.3,0
        0.26,0.4,0.48,0.5,0.36,0.26,0.37,0
        0.4,0.41,0.48,0.5,0.55,0.22,0.33,0
        0.22,0.34,0.48,0.5,0.42,0.29,0.39,0
        0.44,0.35,0.48,0.5,0.44,0.52,0.59,0
        0.27,0.42,0.48,0.5,0.37,0.38,0.43,0
        0.16,0.43,0.48,0.5,0.54,0.27,0.37,0
        0.06,0.61,0.48,0.5,0.49,0.92,0.37,1
        0.44,0.52,0.48,0.5,0.43,0.47,0.54,1
        0.63,0.47,0.48,0.5,0.51,0.82,0.84,1
        0.23,0.48,0.48,0.5,0.59,0.88,0.89,1
        0.34,0.49,0.48,0.5,0.58,0.85,0.8,1
        0.43,0.4,0.48,0.5,0.58,0.75,0.78,1
        0.46,0.61,0.48,0.5,0.48,0.86,0.87,1
        0.27,0.35,0.48,0.5,0.51,0.77,0.79,1

标签: python python-3.x numpy machine-learning numpy-ndarray

解决方案


编辑:我将 np.random.shuffle(A) 替换为 A = np.random.permutation(A),唯一的区别是后者不会修改输入数组。这对这段代码的结果没有任何影响,但总的来说更安全。

思路是使用 numpy.random.permutation 对输入进行随机打乱。一旦行被打乱,我们只需要遍历所有可能的测试集(所需大小的滑动窗口,这里是输入大小的 20%)。相应的训练集就由所有剩余的元素组成。

这将保留所有子集上的原始类分布,即使我们按顺序选择它们,因为我们打乱了输入。

以下代码迭代测试/训练集组合:

import numpy as np

def csv_to_array(file):
  """Read the comma-separated file at path `file` via np.loadtxt."""
  with open(file, 'r') as handle:
    return np.loadtxt(handle, delimiter=',')

def classes_distribution(A):
  """Print the relative frequency of each class label found in array A.

  The class label is read from the last column of A. One line is printed
  per label that actually occurs, in ascending label order, in the form
  "\t P(class_<label>) = <fraction>". Nothing is printed for an empty array.
  """
  total_size = A.shape[0]
  # Count the labels that really occur rather than assuming they are
  # exactly 0..k-1: a subset may lack a class, and a range-based loop
  # would then report the wrong labels.
  labels, counts = np.unique(A[:, -1], return_counts=True)
  for label, count in zip(labels, counts):
    # ":g" renders whole-number float labels such as 2.0 as "2".
    print(f"\t P(class_{float(label):g}) = {count / total_size:.3f}")

def random_samples(A, test_set_p=0.2):
  """Yield test/training splits of a shuffled copy of array A.

  The rows of A are permuted into a new array (A itself is left untouched)
  and a window of about test_set_p * len(A) rows, at least one row, slides
  over it without overlap. Each step yields a dict with the window under
  "test" and all remaining rows under "train". Iteration ends once every
  row has been part of a test set exactly once; the last test set may be
  smaller than the others. An empty A yields nothing.

  Raises ValueError (on first iteration) if test_set_p is not positive.
  """
  if test_set_p <= 0:
    raise ValueError(f"test_set_p must be positive, got {test_set_p}")
  A = np.random.permutation(A)
  n_rows = A.shape[0]
  if n_rows == 0:
    return
  # Clamp to one row so small inputs do not produce a zero step for range().
  sample_size = max(1, int(test_set_p * n_rows))
  for start in range(0, n_rows, sample_size):
    end = start + sample_size
    yield {
      "test": A[start:end],
      "train": np.concatenate((A[:start], A[end:]), axis=0),
    }

def main():
  """Load ecoli.csv and print the class distribution of each training set."""
  ecoli = csv_to_array('ecoli.csv')
  print("Input set shape: ", ecoli.shape)
  print("Input set class distribution:")
  classes_distribution(ecoli)
  print("Training sets class distributions:")
  for split in random_samples(ecoli):
    training_set, test_set = split["train"], split["test"]
    classes_distribution(training_set)
    print("---")
    # ... Do what ever with these two sets

# Run the demo as soon as this file is executed.
main()

它产生以下形式的输出:

Input set shape:  (169, 8)
Input set class distribution:
     P(class_0) = 0.308
     P(class_1) = 0.213
     P(class_2) = 0.207
     P(class_3) = 0.118
     P(class_4) = 0.154
Training sets class distributions:
     P(class_0) = 0.316
     P(class_1) = 0.206
     P(class_2) = 0.199
     P(class_3) = 0.118
     P(class_4) = 0.162
...

推荐阅读