Python 3 Decision Tree: ID3 Algorithm Implementation

FightLi 2018-01-25 19:20

# -*- coding: utf-8 -*-
"""
Created on Wed Jan 24 19:01:40 2018

@author: markli

Build a decision tree using information gain as the feature selection criterion.
"""
import numpy as np
import pandas as pd

class DecisionTree(object):
    def __init__(self, features):
        """
        features: names of the sample features, e.g. with two features A1 and A2,
        features = [A1, A2]
        """
        self.features = features

    def fit(self, TrainData):
        """
        TrainData: training samples as an m*n 2-D array,
        m samples, n-1 features, the last column holds the class label
        """
        # pass a copy of the feature names so self.features is not modified
        tree = self.GetDicTree(TrainData, self.features[:])
        return tree

    def SetEntropy(self, Data):
        """
        Empirical entropy of the data set.
        Data: data set whose last column holds the class labels
        """
        N = len(Data)                                 # size of the data set
        Numoflabel = pd.value_counts(Data[:, -1])     # number of samples per class
        classlabel = list(set(Data[:, -1]))
        entropy = 0                                   # empirical entropy of the data set
        for c in classlabel:
            Ck = Numoflabel[c]                        # number of samples in class c
            entropy = entropy - Ck / N * np.log2(Ck / N)

        return entropy

    def ConditionEntropy(self, Data, index):
        """
        Empirical conditional entropy of the data set given one feature.
        Data: data set, same layout as TrainData
        index: position of the feature in the data set, starting from 0
        """
        ConEntropy = 0
        feature_value = list(set(Data[:, index]))
        N = len(Data)
        for a in feature_value:
            d = Data[np.where(Data[:, index] == a)]
            d_n = len(d)
            if d_n == 0:
                continue
            # empirical entropy of the subset where the feature takes value a
            d_entropy = self.SetEntropy(d)
            ConEntropy += (d_n / N) * d_entropy

        return ConEntropy

    def SelectBestFeature(self, Data):
        """
        Select the feature with the largest information gain.
        Data: data set, same layout as TrainData
        """
        AddEntropy = []
        entropy = self.SetEntropy(Data)               # empirical entropy of the data set
        feature_num = len(Data[0]) - 1                # number of features in the data set
        for i in range(feature_num):
            ConEntropy = self.ConditionEntropy(Data, i)  # conditional entropy of each feature
            adden = entropy - ConEntropy                 # information gain of the feature
            AddEntropy.append(adden)

        index = np.argmax(AddEntropy)
        return index


    def VoteClass(self, classlist):
        """
        When all features have been used but the class is still ambiguous,
        decide the class by majority vote.
        """
        dic = {}
        for c in classlist:
            dic[c] = dic.get(c, 0) + 1
        return max(dic, key=dic.get)

    def GetDicTree(self, TrainData, features):
        """
        Build the dictionary tree.
        TrainData: training data set
        features: names of the remaining candidate features
        """
        classlabel = list(set(TrainData[:, -1]))      # class labels present in the data set

        # all samples belong to one class: return it as a leaf
        if len(classlabel) == 1:
            return classlabel[0]

        # only the label column is left: decide by majority vote
        if len(TrainData[0]) == 1:
            return self.VoteClass(TrainData[:, -1])

        bestfeature_index = self.SelectBestFeature(TrainData)
        bestfeature = features[bestfeature_index]     # best feature by information gain
        dictree = {bestfeature: {}}                   # build a subtree rooted at the best feature
        del features[bestfeature_index]               # remove the feature that has been used

        # split the data set on each value of the best feature and recurse
        feature_attr = list(set(TrainData[:, bestfeature_index]))
        for value in feature_attr:
            sub_features = features[:]
            subdata = self.SplitData(TrainData, bestfeature_index, value)
            dictree[bestfeature][value] = self.GetDicTree(subdata, sub_features)

        return dictree

    def SplitData(self, Data, feature_index, feature_value):
        """
        Return the rows whose feature equals feature_value,
        with that feature column removed.
        """
        subdata = Data[np.where(Data[:, feature_index] == feature_value)]
        n = len(Data[0])
        subdata = [[row[i] for i in range(n) if i != feature_index] for row in subdata]
        return np.array(subdata)
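
For reference, the quantities that SetEntropy, ConditionEntropy and SelectBestFeature compute are the standard ID3 definitions: the empirical entropy H(D) = -Σ_k (|Ck|/|D|) * log2(|Ck|/|D|) over the classes Ck, the empirical conditional entropy H(D|A) = Σ_i (|Di|/|D|) * H(Di) over the subsets Di on which feature A takes its i-th value, and the information gain g(D,A) = H(D) - H(D|A). On the test data below (2 'yes' and 3 'no' samples), H(D) = -(2/5)*log2(2/5) - (3/5)*log2(3/5) ≈ 0.971; splitting on 'no surfaceing' gives H(D|A) ≈ 0.551 and a gain of about 0.420, versus about 0.171 for 'flippers', which is why 'no surfaceing' becomes the root of the tree.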

This code implements a decision tree that uses information gain as the splitting criterion. Replacing the best-feature selection function with one based on the information gain ratio turns it into the C4.5 algorithm; pruning is still not implemented, so if you have implemented it, feel free to get in touch. A sketch of the gain-ratio selection step follows, and the test code is given after it.
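
Below is a minimal sketch of how the selection step could be switched to the information gain ratio, assuming it is added as a method of the DecisionTree class above; the name SelectBestFeatureByGainRatio is illustrative and not part of the original code.

    def SelectBestFeatureByGainRatio(self, Data):
        """
        Select the feature with the largest information gain ratio (C4.5-style).
        Data: data set, same layout as TrainData
        """
        GainRatio = []
        entropy = self.SetEntropy(Data)                      # empirical entropy H(D)
        feature_num = len(Data[0]) - 1
        N = len(Data)
        for i in range(feature_num):
            gain = entropy - self.ConditionEntropy(Data, i)  # information gain g(D, A)
            # split information (intrinsic value) of the feature
            split_info = 0
            for a in set(Data[:, i]):
                p = len(Data[np.where(Data[:, i] == a)]) / N
                split_info -= p * np.log2(p)
            # a feature with a single value carries no split information
            GainRatio.append(gain / split_info if split_info > 0 else 0)
        return np.argmax(GainRatio)

Replacing the call to SelectBestFeature inside GetDicTree with this method gives a C4.5-style tree; pruning would still have to be added separately.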

# -*- coding: utf-8 -*-
"""
Created on Thu Jan 25 16:55:25 2018

@author: markli
"""
import numpy as np
from DecisionTree_InformationAdd import DecisionTree


tree = DecisionTree(['no surfaceing', 'flippers'])
TrainData = np.array([[1, 1, 'yes'],
                      [1, 1, 'yes'],
                      [1, 0, 'no'],
                      [0, 1, 'no'],
                      [0, 1, 'no']])
print(tree.fit(TrainData))
{'no surfaceing': {'1': {'flippers': {'1': 'yes', '0': 'no'}}, '0': 'no'}}
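
Because the fitted tree is just a nested dict, classifying a new sample only requires walking the dict with the list of feature names. Below is a minimal sketch assuming it runs in the same test script; the classify helper is illustrative and not part of the original code. Note that the feature values are stored as strings, because the mixed-type numpy array is converted to a string dtype.

def classify(tree, feature_names, sample):
    # walk the nested dict until a leaf (a plain class label) is reached
    if not isinstance(tree, dict):
        return tree
    feature = next(iter(tree))                         # feature tested at this node
    value = str(sample[feature_names.index(feature)])  # values are stored as strings
    return classify(tree[feature][value], feature_names, sample)

names = ['no surfaceing', 'flippers']
fitted = DecisionTree(names).fit(TrainData)
print(classify(fitted, names, [1, 0]))                 # expected output: 'no'

A feature value that never appeared in the training data will raise a KeyError; handling that case is left out of the sketch.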

 
