首页 > 技术文章 > pandas的使用

SpringDays 2020-01-11 12:15 原文

数据结构 Series , DataFrame

Series是定长的字典序列

series在存储时,相当于两个ndarray,定长是它与不定长的字典结构的主要区别
两个基本属性,index和values,index默认是0,1,2,3递增,也可以指定索引,如 index=['x','y','z']

import pandas as pd
from pandas import Series, DataFrame
x1 = Series([7, 8, 9])
x2 = Series(data=[7, 8, 9], index=['x', 'y', 'z'])
dict = {'x': 7, 'y': 8, 'z': 9} #利用字典构建series
x3 = Series(dict)
print(x1)
print(x2)
print(x3)
0    7
1    8
2    9
dtype: int64
x    7
y    8
z    9
dtype: int64
x    7
y    8
z    9
dtype: int64

DataFrame 数据结构类似数据库表

包含行索引和列索引,看成是由相同索引的Series组成的字典类型

import pandas as pd 
from pandas import Series,DataFrame
data = {'English':[55,23,78,34],'Math':[23,88,45,75],'Physical':[10,44,57,88]}
df1 = DataFrame(data)
df2 = DataFrame(data=data,index=['ZhangSan','LiSi','WangWu','ZhaoLiu'])
print (df1)
print (df2)
   English  Math  Physical
0       55    23        10
1       23    88        44
2       78    45        57
3       34    75        88
          English  Math  Physical
ZhangSan       55    23        10
LiSi           23    88        44
WangWu         78    45        57
ZhaoLiu        34    75        88

数据清洗

删除不必要的行或列 - drop

import pandas as pd 
from pandas import Series,DataFrame

data = {'English':[55,23,78,34],'Math':[23,88,45,75],'Physical':[10,44,57,88]}
df = DataFrame(data=data,index=['ZhangSan','LiSi','WangWu','ZhaoLiu'])
df = df.drop(columns=['English'])
df = df.drop(index=["LiSi"])
print (df)
          Math  Physical
ZhangSan    23        10
LiSi        88        44
WangWu      45        57
ZhaoLiu     75        88

重命名列名 - rename(columns=new_names,inplace=True)

import pandas as pd
from pandas import Series, DataFrame

data = {'English': [55, 23, 78, 34], 'Math': [
    23, 88, 45, 75], 'Physical': [10, 44, 57, 88]}
df = DataFrame(data=data, index=['ZhangSan', 'LiSi', 'WangWu', 'ZhaoLiu'])
print('before:')
print(df)
df.rename(columns={'English': 'yingyu', 'Physical': 'wulil'},
          inplace=True)  # inplace 表示是否在原对象上修改
print('\r\nrename after:')
print(df)
before:
          English  Math  Physical
ZhangSan       55    23        10
LiSi           23    88        44
WangWu         78    45        57
ZhaoLiu        34    75        88

rename after:
          yingyu  Math  wulil
ZhangSan      55    23     10
LiSi          23    88     44
WangWu        78    45     57
ZhaoLiu       34    75     88

去重重复行 - drop_duplicates

import pandas as pd
from pandas import Series, DataFrame

data = {'English': [55, 23, 23, 34], 'Math': [55, 88, 88, 55], 'Physical': [10, 44, 44, 55]}
df = DataFrame(data=data, index=['ZhangSan', 'LiSi', 'WangWu', 'ZhaoLiu'])
print('before:')
print(df)

delrow_df = df.drop_duplicates(inplace=False)
print('\r\ndrop_duplicates rows after:')
print(delrow_df)

# subset 考虑重复发生在哪一列, keep 包含三个参数 first(默认) last False
delrow_df = df.drop_duplicates(subset=['Math'],keep='first',inplace=False)
print('\r\ndrop_duplicates rows after:')
print(delrow_df)
before:
          English  Math  Physical
ZhangSan       55    55        10
LiSi           23    88        44
WangWu         23    88        44
ZhaoLiu        34    55        55

drop_duplicates rows after:
          English  Math  Physical
ZhangSan       55    55        10
LiSi           23    88        44
ZhaoLiu        34    55        55

drop_duplicates rows after:
          English  Math  Physical
ZhangSan       55    55        10
LiSi           23    88        44

更改数据格式

1.更改数据类型 - astype
import pandas as pd
import numpy as np
from pandas import Series, DataFrame

data = {'English': [55, 23, 23, 34], 'Math': [55, 88, 88, 55], 'Physical': [10, 44, 44, 55]}
df = DataFrame(data=data, index=['ZhangSan', 'LiSi', 'WangWu', 'ZhaoLiu'])

df['English'].astype('str') 
df['English'].astype(np.int64) 
ZhangSan    55
LiSi        23
WangWu      23
ZhaoLiu     34
Name: English, dtype: int64
2.去除空格,大小写处理 - strip,upper,lower,title (python中的字符串格式化函数)
import pandas as pd
from pandas import Series, DataFrame

data = {'English': ['55$', '23', '23', '34'], 'Math': ['5a', '88', '8t', '55'], 'Physical': ['$10', '4n', '44$', 'a5']}
df = DataFrame(data=data, index=['ZhangSan', 'LiSi', 'WangWu', 'ZhaoLiu'])

# Physical列去掉特殊符号
df['Physical']=df['Physical'].astype('str').str.strip('$')
print(df)
#全部大写
df.columns = df.columns.str.upper()
print(df)
#全部小写
df.columns = df.columns.str.lower()
print(df)
#首字母大写
df.columns = df.columns.str.title()
print(df)
         English Math Physical
ZhangSan     55$   5a       10
LiSi          23   88       4n
WangWu        23   8t       44
ZhaoLiu       34   55       a5
         ENGLISH MATH PHYSICAL
ZhangSan     55$   5a       10
LiSi          23   88       4n
WangWu        23   8t       44
ZhaoLiu       34   55       a5
         english math physical
ZhangSan     55$   5a       10
LiSi          23   88       4n
WangWu        23   8t       44
ZhaoLiu       34   55       a5
         English Math Physical
ZhangSan     55$   5a       10
LiSi          23   88       4n
WangWu        23   8t       44
ZhaoLiu       34   55       a5

想到一个不错的字符串处理办法,clean_strings的复用性更高

def remove_punctuation(value):
    return re.sub('[!#?]', '', value)

clean_ops = [str.strip, remove_punctuation, str.title, str.lower,str.upper]

def clean_strings(strings, ops):
    result = []
    for value in strings:
        for function in ops:
            value = function(value)
        result.append(value)
    return result
空值查找 - isnull
import pandas as pd
from pandas import Series, DataFrame

data = {'English': ['55$', '23', '23', '34'], 'Math': ['5a', '88', '8t',None ], 'Physical': ['$10', '4n', '44$', 'a5']}
df = DataFrame(data=data, index=['ZhangSan', 'LiSi', 'WangWu', 'ZhaoLiu'])

df.isnull()
df.isnull().any()
English     False
Math         True
Physical    False
dtype: bool

自由度高的清洗函数 apply (我们可以自定义函数,达到灵活处理字符串的目的)

import pandas as pd
from pandas import Series, DataFrame

data = {'English': [55, 23, 23, 34], 'Math': [55, 88, 88, 55], 'Physical': [10, 44, 44, 55]}
df = DataFrame(data=data, index=['ZhangSan', 'LiSi', 'WangWu', 'ZhaoLiu'])

def plus(df,n,m): 
    df['c1'] = (df['Math']+df['Physical']) - m 
    df['c2'] = (df['Math']+df['Physical']) - n 
    return df

df = df.apply(plus,axis=1,args=(2,3,))
print (df)
          English  Math  Physical   c1   c2
ZhangSan       55    55        10   62   63
LiSi           23    88        44  129  130
WangWu         23    88        44  129  130
ZhaoLiu        34    55        55  107  108

数据统计

import pandas as pd
import numpy as np
from pandas import Series, DataFrame

data = {'English': [55, 23, 23, 34], 'Math': [55, 88, 88, 55], 'Physical': [10, 44, 44, 55]}
df = DataFrame(data=data, index=['ZhangSan', 'LiSi', 'WangWu', 'ZhaoLiu'])

#count()统计个数,空值NaN不计算
df.count()
# describe()一次性输出多个统计结果 ,包括count,mean,std,min,max等
df.describe()
#min()最小值
df.min()
#max()最大值
df.max()
#sum() 求总和
df.sum()
#mean()平均值
df.mean()
#median()中位数
df.median()
#var()方差
df.var()
#std()标准差
df.std()
#argmin() 统计最小值的索引位置
np.argmin(df)
2

常用统计函数

数据表合并

import pandas as pd
from pandas import Series, DataFrame

df1 = DataFrame({'name':['ZhangSan', 'LiSi', 'a', 'b', 'c'], 'c1':range(5)})
df2 = DataFrame({'name':['ZhangSan', 'LiSi', 'A', 'B', 'C'], 'c2':range(5)})

#指定列连接
df3 = pd.merge(df1,df2,on='name')
print(df3)

#内连接
df4 = pd.merge(df1,df2,how='inner')
print(df4)

#外连接
df5 = pd.merge(df1,df2,how='outer')
print(df5)

#左连接
df6 = pd.merge(df1,df2,how='left')
print(df6)

#右连接
df7 = pd.merge(df1,df2,how='right')
print(df7)
       name  c1  c2
0  ZhangSan   0   0
1      LiSi   1   1
       name  c1  c2
0  ZhangSan   0   0
1      LiSi   1   1
       name   c1   c2
0  ZhangSan  0.0  0.0
1      LiSi  1.0  1.0
2         a  2.0  NaN
3         b  3.0  NaN
4         c  4.0  NaN
5         A  NaN  2.0
6         B  NaN  3.0
7         C  NaN  4.0
       name  c1   c2
0  ZhangSan   0  0.0
1      LiSi   1  1.0
2         a   2  NaN
3         b   3  NaN
4         c   4  NaN
       name   c1  c2
0  ZhangSan  0.0   0
1      LiSi  1.0   1
2         A  NaN   2
3         B  NaN   3
4         C  NaN   4

用SQL方式操作Pandas

可以使用pandasql 来使用我们熟悉的sql语句来操作pandas,sqldf是主要函数,两个参数,其一是sql语句,其二是globals() 或 locals(),表示是否使用全局变量

import pandas as pd
from pandas import DataFrame
from pandasql import sqldf, load_meat, load_births

df1 = DataFrame({'name':['ZhangSan', 'LiSi', 'a', 'b', 'c'], 'c1':range(5)})
pysqldf = lambda sql: sqldf(sql, globals()) #定义一个匿名函数
sql = "select * from df1 where name ='ZhangSan'"
print (pysqldf(sql))
       name  c1
0  ZhangSan   0

推荐阅读