# -*- coding: utf-8 -*-
"""
Created on Wed Jan 24 19:01:40 2018

@author: markli

Decision tree built with information gain (ID3) as the
feature-selection criterion.
"""
import numpy as np
import pandas as pd


class DecisionTree(object):
    def __init__(self, features):
        """
        features: the names of the sample features, e.g. for two
        features A1 and A2 pass features=[A1, A2].
        """
        self.features = features

    def fit(self, TrainData):
        """
        Build and return the dict tree.

        TrainData: training samples, a 2-D m*n array --
        m samples, n-1 features, the last column is the class label.
        """
        # Pass a copy so tree construction (which consumes features as
        # it recurses) cannot mutate self.features.
        tree = self.GetDicTree(TrainData, list(self.features))
        return tree

    def SetEntropy(self, Data):
        """
        Empirical entropy H(D) of the data set.

        Data: 2-D array whose last column holds the class labels.
        Returns 0.0 for an empty data set.
        """
        labels = Data[:, -1]
        N = len(labels)
        if N == 0:
            return 0.0
        # np.unique yields every class together with its count, so no
        # KeyError handling is needed (the original looked counts up per
        # label and had a dead `except KeyError` branch).
        _, counts = np.unique(labels, return_counts=True)
        probs = counts / N
        return float(-np.sum(probs * np.log2(probs)))

    def ConditionEntropy(self, Data, index):
        """
        Empirical conditional entropy H(D|A) of the feature stored in
        column `index` (0-based).

            H(D|A) = sum_a |D_a|/|D| * H(D_a)

        Bug fix: the original started from 1, MULTIPLIED the per-value
        terms together and negated the product; the definition is a
        weighted SUM of the subset entropies.
        """
        N = len(Data)
        if N == 0:
            return 0.0
        ConEntropy = 0.0
        for a in set(Data[:, index]):
            d = Data[np.where(Data[:, index] == a)]
            # len(d) > 0 is guaranteed because `a` was observed in Data.
            ConEntropy += (len(d) / N) * self.SetEntropy(d)
        return ConEntropy

    def SelectBestFeature(self, Data):
        """
        Return the index of the feature with the largest information
        gain g(D, A) = H(D) - H(D|A).
        """
        entropy = self.SetEntropy(Data)  # empirical entropy of the set
        feature_num = len(Data[0]) - 1   # number of feature columns
        gains = [entropy - self.ConditionEntropy(Data, i)
                 for i in range(feature_num)]
        return int(np.argmax(gains))

    def VoteClass(self, classlist):
        """
        Majority vote: when all features are exhausted but the labels
        are still mixed, pick the most frequent class.

        Bug fix: the original iterated over the *unique* labels (so
        every count stayed 0) and returned max(dic), which is the
        largest KEY, not the label with the largest count.
        """
        counts = {}
        for c in classlist:
            counts[c] = counts.get(c, 0) + 1
        return max(counts, key=counts.get)

    def GetDicTree(self, TrainData, features):
        """
        Recursively build the dict tree.

        TrainData: training subset (last column = labels).
        features: names of the remaining feature columns; copied
        internally so the caller's list is never mutated.
        """
        classlabel = list(set(TrainData[:, -1]))

        # All samples share one class: leaf node.
        if len(classlabel) == 1:
            return classlabel[0]

        # Only the label column is left: vote for the majority class.
        if len(TrainData[0]) == 1:
            return self.VoteClass(TrainData[:, -1])

        features = features[:]  # work on a copy, not the caller's list
        bestfeature_index = self.SelectBestFeature(TrainData)
        bestfeature = features[bestfeature_index]  # best feature's name
        dictree = {bestfeature: {}}  # subtree rooted at the best feature
        del features[bestfeature_index]  # feature is consumed

        # Split on every observed value of the best feature and recurse.
        for value in set(TrainData[:, bestfeature_index]):
            subdata = self.SplitData(TrainData, bestfeature_index, value)
            dictree[bestfeature][value] = self.GetDicTree(subdata,
                                                         features[:])

        return dictree

    def SplitData(self, Data, feature_index, feature_value):
        """
        Rows of Data whose `feature_index` column equals feature_value,
        with that column removed.
        """
        subdata = Data[np.where(Data[:, feature_index] == feature_value)]
        return np.delete(subdata, feature_index, axis=1)
该算法基于信息增益基本实现了决策树（即 ID3 算法）；只要把选取最优特征的函数改为按信息增益率选取，就得到 C4.5 算法。剪枝操作尚未实现，如有实现欢迎交流。下面给出测试代码。
# -*- coding: utf-8 -*-
"""
Created on Thu Jan 25 16:55:25 2018

@author: markli

Smoke test for the information-gain decision tree.
"""
import numpy as np

from DecisionTree_InformationAdd import DecisionTree

# Two features; the last column of each row is the class label.
samples = np.array([[1, 1, 'yes'],
                    [1, 1, 'yes'],
                    [1, 0, 'no'],
                    [0, 1, 'no'],
                    [0, 1, 'no']])

classifier = DecisionTree(['no surfaceing', 'flippers'])
print(classifier.fit(samples))
{'no surfaceing': {'1': {'flippers': {'1': 'yes', '0': 'no'}}, '0': 'no'}}