python-重点工具(Pandas计算)

本文最后更新于:July 14, 2022 pm

数值计算和统计基础

1
2
3
4
5
6
7
8
9
10
11
12
13
#  基本参数: axis\\skipna

import numpy as np
import pandas as pd

df = pd.DataFrame({'key1':[4,5,6,np.nan,2],
'key2':[1,2,np.nan,4,5],
'key3':['a','b','c','d','e']})
print(df)

print(df.mean(),type(df.mean()))
print(df.mean(axis = 1))
print(df.mean(skipna=False)) #不过滤空值
1
2
3
4
5
6
7
8
df = pd.DataFrame({'key1':np.arange(10),
'key2':np.random.rand(10)*10})
print(df)
print('-----')
print(df.quantile(q=0.75)) #统计分位数,q值默认为0.5,也就是中值
print('----')
print(df.skew()) #样本的偏度
print(df.kurt()) #样本的峰度
1
2
3
4
5
6
7
8
9
10
df['key1_s'] = df['key1'].cumsum()   #累计求和
df['key2_s'] = df['key2'].cumsum()
print(df)
print('----')
df['key1_p'] = df['key1'].cumprod() #累计求积
df['key2_p'] = df['key2'].cumprod()
print(df)
print('---')

print(df.cummax(),'\n',df.cummin()) #分别求累积最大值和最小值
1
2
3
4
5
6
7
8
9
10
11
12
13
#唯一值:  .unique()

s = pd.Series(list('assadsadfkj'))
sq = s.unique()
print(s)
print(sq,type(sq))
print(pd.Series(sq))

sq.sort()
print(sq)

sc = s.value_counts(sort = False) #值出现频率的计数
print(sc)

文本数据

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
#  通过str访问,且自动排除丢失和Na值

s = pd.Series(['A','B','C','HHUHDD','123',np.nan,'ju'])
df = pd.DataFrame({'key1':list('abcdef'),
'key2':['hee','fv','w','hije','123',np.nan]})
print(s)
print(df)
print('----')

print(s.str.count('B'))
print(df['key2'].str.upper())
print('---')

df.columns.str.upper()
print(df)
1
2
3
4
5
6
7
8
9
#字符串常用方法

s = pd.Series(['A','b','jjaj','123',np.nan])

print(s.str.lower(),'\n')
print(s.str.upper(),'\n')
print(s.str.len(),'\n')
print(s.str.startswith('b'),'\n')
print(s.str.endswith('3'),'\n')
1
2
3
4
5
6
7
8
#  replace()

df = pd.DataFrame(np.random.rand(3,2),columns=[' columns A',' column B'],index = range(3))
df.columns = df.columns.str.replace(' ','-')
print(df)

df.columns = df.columns.str.replace('-','hha',n=1)
print(df)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
#   -split\\rsplit
import pandas as pd
import numpy as np

s = pd.Series(['a','b','c','1,2,3',['a','b'],np.nan])
print(s.str.split(','))
print('----')

print(s.str.split(',')[0])
print('---')
#直接索引得到一个list

print(s.str.split(',').str[0])
print(s.str.split(',').str.get(1)) # 访问被拆分的元素
print('---')

print(s.str.split(',',expand=True))
print(s.str.split(',',expand=True,n = 1)) #限制分割数
print(s.str.rsplit(',',expand=True,n =1)) # 字符串末尾到字符串开头
print('----')

df = pd.DataFrame({'key1':['a,b,c','1,2,3',[':,.,']],
'key2':['a-b-c','1-2-3',[':-.-']]})
print(df['key2'].str.split('-'))
#DataFrame使用split
1
2
3
4
5
6
7
8
9
#  字符串索引

s = pd.Series(['A','b','C','dsdscd','123',np.nan,'hj'])
df = pd.DataFrame({'key1':list('abcdef'),
'key2':['hee','fv','w','jiij','123',np.nan]})

print(s.str[0]) #取第一个字符串
print(s.str[:2]) #取前两个字符串
print(df['key2'].str[0])

合并

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
# merge合并

df1 = pd.DataFrame({'key':['KO','K1','K2','K3'],
'A':['AD','A1','A2','A3'],
'B':['B0','B1','B2','B3']})
df2 = pd.DataFrame({'key':['KO','K1','K2','K3'],
'C':['CD','C1','C2','C3'],
'D':['D0','D1','D2','D3']})
print(df1)
print(df2)
print(pd.merge(df1,df2,on='key')) #key按照参考键
print('----')

df3 = pd.DataFrame({'key1':['KO','K0','K1','K2'],
'key2':['KO','K1','K0','K1'],
'A':['AD','A1','A2','A3'],
'B':['B0','B1','B2','B3']})
df4 = pd.DataFrame({'key1':['KO','K1','K1','K2'],
'key2':['KO','K0','K0','K0'],
'C':['CD','C1','C2','C3'],
'D':['D0','D1','D2','D3']})
print(df3)
print(df4)
print(pd.merge(df3,df4,on=['key1','key2']))
1
2
3
4
print(pd.merge(df3,df4,on = ['key1','key2'],how = 'inner'))   #交集
print(pd.merge(df3,df4,on = ['key1','key2'],how = 'outer')) #并集
print(pd.merge(df3,df4,on = ['key1','key2'],how = 'left')) #以df3合并为主
print(pd.merge(df3,df4,on = ['key1','key2'],how = 'right'))

连接与补修

1
2
import pandas  as pd
import numpy as np
1
2
3
4
5
6
7
8
9
10
11
12
13
14
#  连接:concat

s1 = pd.Series([1,2,3])
s2 = pd.Series([2,3,4])
s3= pd.Series([1,2,3],index = ['a','b','c'])
s4= pd.Series([2,3,4],index = ['d','e','f'])
print(pd.concat([s1,s2]))
print(pd.concat([s3,s4]).sort_index())
print('----')
#默认axis=0,行 + 行。

print(pd.concat([s3,s4],axis=1))
print('----')
#axis=1,列+列,成为一个DataFrame
1
2
3
4
5
6
7
# 连接方式:  join,join_axes

s5= pd.Series([1,2,3],index = ['a','b','c'])
s6= pd.Series([2,3,4],index = ['b','c','d'])
print(pd.concat([s5,s6],axis=1)) #默认是并集
print(pd.concat([s5,s6],axis=1,join='inner')) #inner为交集
print(pd.concat([s5,s6],axis=1,join_axes=[['a','b','d']])) #join_axes为指定联合的index
1
2
print(pd.concat([s5,s6],keys=['one','two']))
print(pd.concat([s5,s6],axis=1,keys=['one','two']))
1
2
3
4
5
6
7
8
9
10
11
12
13
14
# 修补pd.combine_first()

df1 = pd.DataFrame([[np.nan,3,5],[-4,6,np.nan,np.nan],[np.nan,7,np.nan]])
df2 = pd.DataFrame([[-42.6,np.nan,-8,2],[-5,1,6,4]],index = [1,2])
print(df1)
print(df2)
print(df1.combine_first(df2))
print('----')
#根据index,df1的空值被df2代替
#如果df2的index多于df1,则更新到df1上

df1.updata(df2)
print(df1)
#updata,直接df2覆盖df1,相同index的位置