首页 > 技术文章 > 数据离散化和归一化

xingnie 2019-01-29 16:50 原文

数据离散化和归一化

在进行数据分析时,通常需要对数据进行归一化和离散化操作。下面的示例先用 MinMaxScaler 对数据做归一化,再用 pandas.cut 按 0.25 的区间宽度进行离散化(分箱)。

from pylab import *
from numpy import *
import codecs
import matplotlib.pyplot as plt
import operator                                      #新加了一个库,用于排序
import pandas as pd
from numpy.random import random
from sklearn import preprocessing 


# Input file: one sample per line, fields separated by commas.
url = "resultData.txt"

# Number of feature fields to read from each line.
FeatureNum = 6
# One hundred samples.
data_num = 100

# Holds every mutual-information value.
nmi_all = []
# Used as a counter.
data_number = 0
# Parsed rows; open_file() appends one list of floats per input line.
data = []
def open_file(url):
    """Read comma-separated numeric rows from ``url`` into the global ``data``.

    Each non-blank line is split on commas and its first ``FeatureNum``
    fields are converted to ``float``; any further fields are ignored.
    Every parsed row is appended to the module-level ``data`` list, so
    repeated calls accumulate rows.

    Args:
        url: Path of the text file to read.

    Returns:
        A NumPy array built from the accumulated ``data`` list. Earlier
        versions built this array and discarded it; callers that ignore
        the return value are unaffected.

    Raises:
        IndexError: A non-blank line has fewer than ``FeatureNum`` fields.
        ValueError: One of the parsed fields is not a valid float.
    """
    # codecs.open() without an encoding simply delegates to the built-in
    # open(), so the built-in is used directly.
    with open(url, "r") as f:
        # Iterate lazily instead of loading every line with readlines().
        for raw_line in f:
            stripped = raw_line.strip()
            if not stripped:
                # Blank lines (e.g. a trailing empty line) used to crash
                # with float('') -> ValueError; skip them instead.
                continue
            fields = stripped.split(',')
            data.append([float(fields[i]) for i in range(FeatureNum)])
    return array(data)

def gui_yi_hua(data):
    """Min-max normalize ``data`` ("gui yi hua" is pinyin for normalization).

    A fresh scikit-learn ``MinMaxScaler`` with default settings is fitted
    on ``data`` and the transformed values are returned.

    Args:
        data: 2-D array-like of numeric samples accepted by
            ``MinMaxScaler.fit_transform``.

    Returns:
        The scaled values produced by ``fit_transform``.
    """
    # NOTE: to persist the result, wrap it in pd.DataFrame and call
    # .to_csv('tseg_out.csv') at the call site.
    return preprocessing.MinMaxScaler().fit_transform(data)

def arry_discretization(tseg_minMax):
    """Bin each row of scaled values into four equal-width ranges and print it.

    For every row this prints, in order: the raw values, the label that
    ``pd.cut`` assigned to each value, and the number of values per label
    (most frequent first).

    Args:
        tseg_minMax: Iterable of 1-D rows (lists or array rows) whose
            values are expected to lie in the closed range [0, 1].

    Returns:
        None. The report is written to standard output.
    """
    # Bin edges and labels do not change between rows, so build them once.
    bins = [0, 0.25, 0.5, 0.75, 1]
    group_names = ['这个属于0-0.25', '这个属于0.25-0.5', '这个属于0.5-0.75', '这个属于0.75-1']
    for row in tseg_minMax:
        print(row)
        # pd.cut uses right-closed intervals, so without include_lowest the
        # first bin is (0, 0.25] and an exact 0.0 -- the minimum of every
        # min-max-scaled column -- would be reported as NaN.
        cuts = pd.cut(row, bins, labels=group_names, include_lowest=True)
        print(cuts)
        # The top-level pd.value_counts() is deprecated; Series.value_counts()
        # gives the same frequency-sorted counts.
        print(pd.Series(cuts).value_counts())
    
    
    
if __name__ == '__main__':
    # Script entry point: load the rows from ``url`` into the module-level
    # ``data`` list, normalize them, then print the binned report.
    open_file(url)
    normalized = gui_yi_hua(data)
    arry_discretization(normalized)
View Code

 

推荐阅读