首页 > 技术文章 > 贝叶斯分类

caicaihong 2016-08-13 19:58 原文

  1 # -*- coding: utf-8 -*-
  2 
  3 import sys
  4 import os
  5 import numpy as np
  6 import pickle
  7 from sklearn import metrics
  8 
  9 #导入数据集
 10 def loadDataSet():
 11     postingList = [['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
 12                    ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
 13                    ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him', 'my'],
 14                    ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
 15                    ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
 16                    ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
 17     classVec = [0, 1, 0, 1, 0, 1]  # 1 is abusive, 0 not,分类
 18     return postingList, classVec
 19 
 20 
 21 # 读取文件
 22 def readfile(path):
 23     fp = open(path, "rb")
 24     content = fp.read()
 25     fp.close()
 26     return content
 27 
 28 """
 29 
 30 #计算分类精度:
 31 def metrics_result(actual,predict):
 32     print('精度:{0:.3f}'.format(metrics.precision_score(actual,predict)))
 33     print ('召回:{0:0.3f}'.format(metrics.recall_score(actual,predict)))
 34     print ('f1-score:{0:.3f}'.format(metrics.f1_score(actual,predict)))
 35 
 36 """
 37 
 38 # 读取bunch对象
 39 def readbunchobj(path):
 40     file_obj = open(path, "rb")
 41     bunch = pickle.load(file_obj)
 42     file_obj.close()
 43     return bunch
 44 
 45 
 46 # 写入bunch对象
 47 def writebunchobj(path, bunchobj):
 48     file_obj = open(path, "wb")
 49     pickle.dump(bunchobj, file_obj)
 50     file_obj.close()
 51 
 52 
 53 class NBayes(object):
 54     def __init__(self):
 55         self.vocabulary = []  # 词典
 56         self.idf = 0  # 词典的idf权值向量
 57         self.tf = 0  # 训练集的权值矩阵
 58         self.tdm = 0  # P(x|yi)
 59         self.Pcates = {}  # P(yi)--是个类别字典,这个集合就是p(yi)的值的集合
 60         self.labels = []  # 对应每个文本的分类,是个外部导入的列表
 61         self.doclength = 0  # 训练集文本数
 62         self.vocablen = 0  # 词典词长
 63         self.testset = 0  # 测试集
 64 
 65     #    加载训练集并生成词典,以及tf, idf值
 66     def train_set(self, trainset, classVec):
 67         self.cate_prob(classVec)  # 计算每个分类在数据集中的概率:P(yi)
 68         self.doclength = len(trainset)
 69         tempset = set()
 70         [tempset.add(word) for doc in trainset for word in doc]  # 生成词典
 71         self.vocabulary = list(tempset)
 72         self.vocablen = len(self.vocabulary)
 73         self.calc_wordfreq(trainset)
 74         # self.calc_tfidf(trainset)  # 生成tf-idf权值
 75         self.build_tdm()  # 按分类累计向量空间的每维值:P(x|yi)
 76 
 77     # 生成 tf-idf
 78     def calc_tfidf(self, trainset):
 79         self.idf = np.zeros([1, self.vocablen])
 80         self.tf = np.zeros([self.doclength, self.vocablen])
 81         for indx in range(self.doclength):
 82             for word in trainset[indx]:
 83                 self.tf[indx, self.vocabulary.index(word)] += 1
 84             # 消除不同句长导致的偏差
 85             self.tf[indx] = self.tf[indx] / float(len(trainset[indx]))
 86             for signleword in set(trainset[indx]):
 87                 self.idf[0, self.vocabulary.index(signleword)] += 1
 88         self.idf = np.log(float(self.doclength) / self.idf)
 89         self.tf = np.multiply(self.tf, self.idf)  # 矩阵与向量的点乘
 90 
 91     # 生成普通的词频向量
 92     def calc_wordfreq(self, trainset):
 93         self.idf = np.zeros([1, self.vocablen])  # 1*词典数
 94         self.tf = np.zeros([self.doclength, self.vocablen])  # 训练集文件数*词典数
 95         for indx in range(self.doclength):  # 遍历所有的文本
 96             for word in trainset[indx]:  # 遍历文本中的每个词
 97                 self.tf[indx, self.vocabulary.index(word)] += 1  # 找到文本的词在字典中的位置+1
 98             for signleword in set(trainset[indx]):
 99                 self.idf[0, self.vocabulary.index(signleword)] += 1
100 
101     # 计算每个分类在数据集中的概率:P(yi)
102     def cate_prob(self, classVec):
103         self.labels = classVec#让分类作为相对应的标签
104         labeltemps = set(self.labels)  # 获取全部分类,返回的是一个集合,其值为{0,1}
105         #print('分类的结果:',labeltemps)
106         for labeltemp in labeltemps:
107             # 统计列表中重复的值:self.labels.count(labeltemp)
108             self.Pcates[labeltemp] = float(self.labels.count(labeltemp)) / float(len(self.labels))#求分类列表中重复的值,就是0和1在所有当中所占的比例
109 
110     # 按分类累计向量空间的每维值:P(x|yi)
111     def build_tdm(self):
112         self.tdm = np.zeros([len(self.Pcates), self.vocablen])  # 类别行*词典列
113         sumlist = np.zeros([len(self.Pcates), 1])  # 统计每个分类的总值
114         for indx in range(self.doclength):
115             self.tdm[self.labels[indx]] += self.tf[indx]  # 将同一类别的词向量空间值加总
116             sumlist[self.labels[indx]] = np.sum(self.tdm[self.labels[indx]])  # 统计每个分类的总值--是个标量
117         self.tdm = self.tdm / sumlist  # P(x|yi)
118 
119     # 测试集映射到当前词典
120     def map2vocab(self, testdata):
121         self.testset = np.zeros([1, self.vocablen])
122         for word in testdata:
123             self.testset[0, self.vocabulary.index(word)] += 1
124 
125     # 输出分类类别
126     def predict(self, testset):
127         if np.shape(testset)[1] != self.vocablen:
128             print("输入错误")
129             exit(0)
130         predvalue = 0
131         predclass = ""
132         for tdm_vect, keyclass in zip(self.tdm, self.Pcates):
133             # P(x|yi)P(yi)
134             temp = np.sum(testset * tdm_vect * self.Pcates[keyclass])
135             if temp > predvalue:
136                 predvalue = temp
137                 predclass = keyclass
138         return predclass

 算法的改进:用 tf-idf 权重代替简单词频(见下面的 calc_tfidf):

 # 生成 tf-idf
 78     def calc_tfidf(self, trainset):
 79         self.idf = np.zeros([1, self.vocablen])
 80         self.tf = np.zeros([self.doclength, self.vocablen])
 81         for indx in range(self.doclength):
 82             for word in trainset[indx]:
 83                 self.tf[indx, self.vocabulary.index(word)] += 1
 84             # 消除不同句长导致的偏差
 85             self.tf[indx] = self.tf[indx] / float(len(trainset[indx]))
 86             for signleword in set(trainset[indx]):
 87                 self.idf[0, self.vocabulary.index(signleword)] += 1
 88         self.idf = np.log(float(self.doclength) / self.idf)
 89         self.tf = np.multiply(self.tf, self.idf)  # 矩阵与向量的点乘

 

推荐阅读