python-重点工具(Pandas数据结构)

本文最后更新于:July 14, 2022 pm

Pandas数据结构Series:索引

位置下标 / 标签索引 / 切片索引 / 布尔型索引
1
2
import pandas  as pd
import numpy as np
1
2
3
4
5
6
# Subscript access on a Series that uses the default integer index (0..4).
s = pd.Series(np.random.rand(5))
print(s)
_first = s[0]
print(_first, type(_first), _first.dtype)
_first_as_float = float(_first)
print(_first_as_float, type(_first_as_float))
# print(s[-1]) raises an error here: -1 is not one of the index labels.
1
2
3
4
5
# Subscript indexing on a 10-element Series with the default integer index.
s = pd.Series(np.random.rand(10))
print(s)
_head_value = s[0]
print(_head_value, type(_head_value), _head_value.dtype)
1
2
3
4
5
6
7
# Label indexing: look values up by their index label.
_labels = ['a', 'b', 'c', 'd', 'e']
s = pd.Series(np.random.rand(5), index=_labels)
print(s)
_value_a = s['a']
print(_value_a, type(_value_a), _value_a.dtype)

# Passing a list of labels selects several entries and returns a new Series.
print(s[['a', 'c']])
1
2
3
4
5
6
7
8
9
10
11
# Slice indexing on an integer-indexed Series (s1) and a label-indexed Series (s2).
s1 = pd.Series(np.random.rand(5))
s2 = pd.Series(np.random.rand(5), index=list('abcde'))
for _series in (s1, s2):
    print(_series)
# Integer slice: the stop position is excluded.
print(s1[1:4], s1[4])
# Label slice: the end label 'd' is included in the result.
print(s2['a':'d'], s2['c'])

# Integer slices also work positionally on a label-indexed Series.
print(s2[:-1])
print(s2[::2])
1
2
3
4
5
6
7
8
9
10
11
12
13
14
# Boolean indexing: build a boolean mask Series and use it to filter.
s = pd.Series(np.random.rand(3) * 100)
# Assigning to a label that does not exist yet appends a new (missing) entry.
s[4] = None
print(s)
bs1 = s > 50
bs2 = s.isnull()
bs3 = s.notnull()
for _mask in (bs1, bs2, bs3):
    print(_mask, type(_mask), _mask.dtype)

# An inline comparison and a pre-computed mask select the same rows.
print(s[s > 50])
print(s[bs1])

Pandas数据结构Series

数据查看 / 重新索引 / 对齐 / 添加、修改、删除值
1
2
3
4
5
6
7
8
# Inspecting data.
s = pd.Series(np.random.rand(50))
_first_ten = s.head(10)
print(_first_ten)
_last_rows = s.tail()
print(_last_rows)
# .head() shows rows from the start of the Series.
# .tail() shows rows from the end of the Series.
# Both return 5 rows when no count is given.
1
2
3
4
5
6
7
8
9
10
# Re-indexing with reindex(): returns a new Series arranged by the given labels.
s = pd.Series(np.random.rand(5), index=list('abcde'))
s1 = s.reindex(['a', 'd', 'a', 'f'])
print(s)
print(s1)
_reordered = s.reindex(['c', 'd', 'a', 'f'])
print(_reordered)

# fill_value replaces the missing entries created for unknown labels with 0.
_wanted = ['c', 'd', 'a', 'f', 'ss']
s2 = s.reindex(_wanted, fill_value=0)
print(s2)
1
2
3
4
5
6
7
# Series alignment: arithmetic matches entries by index label, not by position.
s1 = pd.Series(np.random.rand(3), index=['Jack', 'Mary', 'Tom'])
s2 = pd.Series(np.random.rand(3), index=['wang', 'Mary', 'Mary'])
for _series in (s1, s2):
    print(_series)
# Labels present on only one side produce NaN in the sum.
_total = s1 + s2
print(_total)
1
2
3
4
5
6
7
8
9
# Deleting entries with drop(): returns a new Series, the original is untouched.
s = pd.Series(np.random.rand(5), index=['n', 'k', 's', 'w', 'm'])
print(s)
s1 = s.drop(labels='n')
s2 = s.drop(labels=['w', 's'])
for _result in (s1, s2):
    print(_result)
# Printing s again shows it still has all five entries.
print(s)
1
2
3
4
5
6
# Appending (the original Series is left unchanged).
# Series.append() was removed in pandas 2.0; pd.concat() is the replacement
# and likewise returns a new object instead of modifying its inputs.

s = pd.Series(np.random.rand(5), index=list('nkswm'))
s3 = pd.concat([s, s])
print(s3)
print(s)

数据结构DataFrame

1
2
3
4
5
6
7
8
# Build a DataFrame from a dict of equal-length lists; each key becomes a column.
data = dict(
    name=['Jack', 'Tome', 'Mary'],
    age=[18, 19, 20],
    gender=['m', 'n', 'v'],
)
frame = pd.DataFrame(data)
print(frame)
print(type(frame))

# .values exposes the table contents as a NumPy array.
print(frame.values)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
# Creating a DataFrame from a dict of lists / arrays.

from pandas import Series
import pandas as pd
import numpy as np

data1 = dict(a=[1, 2, 3], b=[3, 4, 5], c=[5, 6, 7])
# All values in the dict must have the same length.
data2 = dict(one=np.random.rand(3), two=np.random.rand(3))
for _raw in (data1, data2):
    print(_raw)
d1 = pd.DataFrame(data1)
d2 = pd.DataFrame(data2)
for _frame in (d1, d2):
    print(_frame)

# columns= chooses which columns appear and in what order; a name that is not
# a key of the dict ('d') still becomes a column, filled with NaN.
df1 = pd.DataFrame(data1, columns=['a', 'b', 'a', 'd'])
print(df1)
df1 = pd.DataFrame(data1, columns=['b', 'c'])
print(df1)

# index= assigns the row labels.
df2 = pd.DataFrame(data1, index=['f1', 'f2', 'f3'])
print(df2)
1
2
3
4
5
6
7
8
9
10
11
12
# Creating a DataFrame from a dict of Series.
# Unlike plain lists, the Series may have different lengths: rows are aligned
# on the index and gaps are filled with NaN.
data1 = dict(
    one=pd.Series(np.random.rand(2)),
    two=pd.Series(np.random.rand(3)),
)
data2 = dict(
    one=pd.Series(np.random.rand(2), index=['a', 'b']),
    two=pd.Series(np.random.rand(3), index=['a', 'b', 'c']),
)
for _raw in (data1, data2):
    print(_raw)

d1 = pd.DataFrame(data1)
d2 = pd.DataFrame(data2)
print(d1)

print(d2)
1
2
3
4
5
6
7
8
# 3. Creating a DataFrame directly from a 2-D array.

ar = np.random.rand(9).reshape(3, 3)
print(ar)
# Without labels, rows and columns get default integer labels.
df1 = pd.DataFrame(ar)
# Build df2 from the array itself (the original passed an unrelated dict).
# len(index) must equal the number of rows and len(columns) the number of
# columns of the array.
df2 = pd.DataFrame(ar, index=['a', 'b', 'a'], columns=['one', 'two', 'three'])
print(df1)
print(df2)
1
2
3
4
5
6
7
8
9
# 4. Creating a DataFrame from a list of dicts: every dict becomes one row.
data = [dict(one=1, two=2), dict(one=5, two=8, three=10)]
print(data)
df1 = pd.DataFrame(data)
df2 = pd.DataFrame(data, index=['a', 'b'])      # custom row labels
df3 = pd.DataFrame(data, columns=['one', 'two'])  # keep only these columns
for _frame in (df1, df2, df3):
    print(_frame)
1
2
3
4
5
6
# 5. Creating a DataFrame from a dict of dicts:
# outer keys become the columns, inner keys become the row labels.
_subjects = ('math', 'english', 'art')
data = {
    'Jack': dict(zip(_subjects, (99, 90, 93))),
    'Mary': dict(zip(_subjects, (98, 97, 95))),
    'Tome': dict(zip(_subjects, (66, 78, 90))),
}
df1 = pd.DataFrame(data)
print(df1)

数据结构DataFrame:索引

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
# Selecting rows and columns.

df = pd.DataFrame(
    np.random.rand(3, 4) * 100,
    index=['one', 'two', 'three'],
    columns=list('abcd'),
)
print(df)

# df[name] selects one column (a Series); df[[names]] selects several (a DataFrame).
data1 = df['a']
data2 = df[['a', 'c']]
for _picked in (data1, data2):
    print(_picked, type(_picked))


# df.loc[label] selects one row (a Series); a list of labels returns those rows
# as a DataFrame.
data3 = df.loc['one']
data4 = df.loc[['one', 'two']]
for _picked in (data3, data4):
    print(_picked, type(_picked))

# A slice inside df[...] is applied to the rows.
data5 = df[:2]
print(data5)
1
2
3
4
5
6
7
8
9
10
11
12
# .iloc[] selects rows (and optionally columns) by integer position.

df = pd.DataFrame(
    np.random.rand(4, 4) * 100,
    index=['one', 'two', 'three', 'four'],
    columns=list('abcd'),
)
print(df)

for _row_pos in (0, -1):
    print(df.iloc[_row_pos])
for _row_pos, _col_pos in ((0, 2), (3, 2)):
    print(df.iloc[_row_pos, _col_pos])
# print(df.iloc[4]) fails: positions cannot exceed the length; the largest
# valid position is (length - 1).
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
# Boolean indexing on a DataFrame.

df = pd.DataFrame(
    np.random.rand(4, 4) * 100,
    index=['one', 'two', 'three', 'four'],
    columns=list('abcd'),
)

print(df)

# Comparing the whole frame yields a boolean frame of the same shape.
b1 = df < 20
print(b1, type(b1))
# Cells where the mask is True keep their value; the rest become NaN.
print(df[b1])

# A mask built from only some rows is aligned back to df by label.
_subset = df.loc[['one', 'three']]
b2 = _subset < 50
print(b2, type(b2))
print(df[b2])
1
2
3
4
5
6
7
8
9
10
11
12
13
# Selecting rows and columns together by chaining selections.

df = pd.DataFrame(
    np.random.rand(4, 4) * 100,
    index=['one', 'two', 'three', 'four'],
    columns=list('abcd'),
)
print(df)

# Column first, then rows by label.
_col_a = df['a']
print(_col_a.loc[['three', 'four']])
print('----')

# Several columns, then every second row by position.
_cols_acd = df[['a', 'c', 'd']]
print(_cols_acd.iloc[::2])
print('-----')
# Boolean filter first, then rows by label.
_below_50 = df[df < 50]
print(_below_50.loc[['one', 'two']])

数据结构DataFrame: 基本技巧

1
2
3
4
5
6
7
8
# Viewing data and transposing.

df = pd.DataFrame(np.random.rand(8, 2) * 100)
print(df)
print(df.head(3))  # first 3 rows
print(df.tail(1))  # last row

_transposed = df.T  # rows and columns swapped
print(_transposed)
1
2
3
4
5
6
7
8
9
10
11
12
13
# Adding and modifying data (these assignments change df in place).

df = pd.DataFrame(np.random.rand(4, 4) * 100, columns=list('abcd'))
print(df)

# Assigning to a new column name / a new row label adds that column / row.
df['e'] = 10
df.loc[4] = 20
print(df)

# Assigning to existing columns overwrites every value in them.
df['e'] = 20
df[['a', 'c']] = 100
print(df)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
# Deleting with `del` and with drop().

df = pd.DataFrame(np.random.rand(4, 4) * 100, columns=list('abcd'))
print(df)

# `del` removes the column from df itself.
del df['a']
print(df)
print('-----')

# drop() with row labels returns a new frame; df keeps all its rows.
print(df.drop(index=0))
print(df.drop(index=[1, 2]))
print(df)
print('-----')

# Dropping a column via drop() likewise leaves df unchanged.
print(df.drop(columns=['d']))
print(df)
1
2
3
4
5
6
7
8
# Alignment: arithmetic between frames matches on both row and column labels.
df1 = pd.DataFrame(np.random.rand(10, 4), columns=list('ABCD'))
df2 = pd.DataFrame(np.random.rand(7, 3), columns=list('ABC'))

for _frame in (df1, df2):
    print(_frame)
print('-----')
# Cells that exist in only one of the frames come out as NaN.
_total = df1 + df2
print(_total)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
# Sorting 1 - by value, with .sort_values().

df1 = pd.DataFrame(np.random.rand(4, 4) * 100, columns=list('abcd'))
print(df1)
print(df1.sort_values(by=['a'], ascending=True))   # ascending
print(df1.sort_values(by=['a'], ascending=False))  # descending
print('----')

# With several keys, rows are ordered by 'a' first and ties are broken by 'c'.
df2 = pd.DataFrame({
    'a': [1, 1, 1, 1, 1, 3, 3, 5],
    'b': list(range(8)),
    'c': list(range(8, 0, -1)),
})
print(df2)
print(df2.sort_values(by=['a', 'c']))
1
2
3
4
5
6
7
8
9
10
11
# Sorting 2 - by index label, with .sort_index().
_columns = list('abcd')
df1 = pd.DataFrame(np.random.rand(4, 4) * 100, index=[5, 4, 3, 7], columns=_columns)
df2 = pd.DataFrame(np.random.rand(4, 4) * 100, index=list('hsxg'), columns=_columns)
# Print each frame followed by its index-sorted version.
for _frame in (df1, df2):
    print(_frame)
    print(_frame.sort_index())

时间模块

1
2
3
4
5
6
7
8
9
10
# datetime.date: calendar date objects.

import datetime

today = datetime.date.today()  # today's date
print(today, type(today))
_today_text = str(today)
print(_today_text, type(_today_text))

t = datetime.date(year=2016, month=6, day=1)  # build a specific date
print(t)
1
2
3
4
5
# datetime.datetime: date-and-time objects.

now = datetime.datetime.now()
print(now, type(now))
_now_text = str(now)  # string form of the timestamp
print(_now_text, type(_now_text))
1
2
3
4
5
# datetime.timedelta: a time difference that can be added to a datetime.

t1 = datetime.datetime(2000, 10, 1)
tx = datetime.timedelta(days=100, seconds=3600)  # 100 days plus 3600 seconds
_shifted = t1 + tx
print(_shifted)
1
2
3
4
5
6
7
8
# Converting strings to datetimes with dateutil's parser.parse.

from dateutil.parser import parse

data = '2/27/2021'
data1 = '27/2/2021'
_parsed = parse(data)
print(_parsed, type(_parsed))
# The day-first spelling is parsed too.
print(parse(data1))

时刻数据

1
2
3
4
5
6
7
8
9
# pd.Timestamp(): converts a single value into a pandas timestamp.

date1 = datetime.datetime(2021, 2, 27, 12, 45, 30)
date2 = '2021-2-27'  # a date given as a string
t1, t2 = pd.Timestamp(date1), pd.Timestamp(date2)
print(t1, type(t1))
print(t2)
# A string that also carries a time of day is accepted as well.
_with_time = pd.Timestamp('2021-2-27 12:45:30')
print(_with_time)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
# pd.to_datetime(): converts one value or many date strings at once.

from datetime import datetime

date1 = datetime(2021, 2, 27, 12, 45, 30)
date2 = '2021-2-27'
t1 = pd.to_datetime(date1)
# Convert the string this time (the original converted date1 twice and never
# used date2).
t2 = pd.to_datetime(date2)

print(t1)
print(t2)

lst_date = ['2021-2-12', '2021-4-6', '2021-6-6']  # a list gives a DatetimeIndex
t3 = pd.to_datetime(lst_date)
print(t3, type(t3))

date3 = ['2021-2-12', '2021-4-6', '2021-6-6', 'hahahah']
# errors='ignore' (return the input untouched when parsing fails) is deprecated,
# so that fallback is written out explicitly here.
try:
    _kept_as_is = pd.to_datetime(date3)
except ValueError:
    _kept_as_is = pd.Index(date3)
print(_kept_as_is, type(_kept_as_is))
# errors='coerce' turns entries that cannot be parsed into NaT.
print(pd.to_datetime(date3, errors='coerce'))

时间戳索引

1
2
3
4
5
6
7
8
9
# pd.DatetimeIndex() and time series (a Series indexed by timestamps).

_day_strings = ['12/1/2017', '12/2/2017', '12/3/2017', '12/4/2017', '12/5/2017']
rng = pd.DatetimeIndex(_day_strings)
print(rng, type(rng))
_first_stamp = rng[0]
print(_first_stamp, type(_first_stamp))

# Using the DatetimeIndex as the index of a Series gives a time series.
st = pd.Series(np.random.rand(len(rng)), index=rng)
print(st, type(st))
print(st.index)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
# pd.date_range(): generate a range of dates.
# Two ways to specify it: 1. start + end   2. start or end + periods
# The default frequency is one calendar day.

# normalize=True resets the time of day to midnight.
rng1 = pd.date_range('1/1/2017', '1/10/2017', normalize=True)
rng2 = pd.date_range(start='1/2/2017', periods=10)
rng3 = pd.date_range(end='1/30/2017 15:00:00', periods=10)
print(rng1, type(rng1))
print(rng2, type(rng2))
print(rng3, type(rng3))
print('-----')

# name= labels the resulting index.
rng4 = pd.date_range(start='1/1/2017 15:30', periods=10, name='hello world', normalize=True)
print(rng4)
print('----')

# Both end points are included by default. The old `closed=` keyword was
# removed in pandas 2.0; `inclusive=` (pandas >= 1.4) replaces it.
print(pd.date_range('1/1/2017', '1/10/2017'))
print(pd.date_range('1/1/2017', '1/10/2017', inclusive='right'))  # drop the start
print(pd.date_range('1/1/2017', '1/10/2017', inclusive='left'))   # drop the end
print('----')

print(pd.bdate_range('20210101', '20210107'))  # default frequency is business days

# Convert the range to a plain list of Timestamps (the original announced
# this conversion but never called list()).
print(list(pd.date_range(start='1/10/2017', periods=10)))
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
# pd.date_range(): frequency aliases.
# The lowercase aliases 'h', 'min', 's', 'ms', 'us' are used because the older
# 'H', 'T', 'S', 'L', 'U' spellings are deprecated in recent pandas.
import pandas as pd
import numpy as np

print(pd.date_range('1/1/2017', '1/10/2017'))  # calendar day (default)
print(pd.date_range('1/1/2017', '1/10/2017', freq='B'))  # business day
print(pd.date_range('1/1/2017', '1/10/2017', freq='h'))  # hourly
print(pd.date_range('1/1/2017 12:00', '1/1/2017 12:10', freq='min'))  # every minute
print(pd.date_range('1/1/2017 12:00:00', '1/1/2017 12:10:10', freq='s'))  # every second
print(pd.date_range('1/1/2017 12:00:00', '1/1/2017 12:10:10', freq='ms'))  # every millisecond
# Microsecond steps: the window is kept to one millisecond on purpose. Over the
# ten-minute window used above this would create roughly 610 million
# timestamps and exhaust memory.
print(pd.date_range('1/1/2017 12:00:00', '1/1/2017 12:00:00.001', freq='us'))  # every microsecond

print(pd.date_range('1/1/2017 12:00', '1/10/2017 12:10', freq='W-MON'))  # weekly, anchored on Monday

print(pd.date_range('1/1/2017 12:00', '1/10/2017 12:10', freq='WOM-2MON'))  # 2nd Monday of each month
1
2
3
4
5
# pd.date_range(): compound frequencies (a multiple and/or a combination of units).

print(pd.date_range('1/1/2017', '2/1/2017', freq='7D'))  # every 7 days
print(pd.date_range('1/1/2017', '1/2/2017', freq='2h30min'))  # every 2 hours 30 minutes
# Every second month end. An offset object is used because the 'M' string
# alias is deprecated for date_range in recent pandas.
print(pd.date_range('2017', '2018', freq=pd.offsets.MonthEnd(2)))
1
2
3
4
5
6
# Changing the frequency of a time series with asfreq().

ts = pd.Series(np.random.rand(4),
               index=pd.date_range('20170101', '20170104'))
print(ts)
# Re-sample the daily series every 4 hours; method='ffill' fills the new rows
# with the last known value. ('4h' replaces the deprecated '4H' spelling.)
print(ts.asfreq('4h', method='ffill'))
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
# Leading and lagging with shift().

ts = pd.Series(np.random.rand(4),
               index=pd.date_range('20170101', '20170104'))
print(ts)

print(ts.shift(2))   # values move 2 rows later; the index stays put
print(ts.shift(-2))  # values move 2 rows earlier

# Relative change between each value and the previous one.
per = ts / ts.shift(1) - 1
print(per)
print('---')

# With freq=, the index is shifted instead and the values stay attached to
# their rows. ('min' replaces the deprecated 'T' alias for minutes.)
print(ts.shift(2, freq='D'))
print(ts.shift(2, freq='min'))

时期

1
2
3
4
5
6
7
8
# pd.Period(): create a period (a span of time rather than an instant).

p = pd.Period('2017', freq='M')  # the month 2017-01
print(p, type(p))

# Adding / subtracting an integer moves by that many periods of the same frequency.
print(p + 1)
print(p - 2)
# Yearly period; 'Y' replaces the deprecated 'A-DEC' alias.
print(pd.Period('2012', freq='Y') - 1)
1
2
3
4
5
6
7
8
9
# Compare a PeriodIndex (monthly periods) with a DatetimeIndex (month-end timestamps).
p = pd.period_range('1/1/2021', '1/1/2022', freq='M')
# Month-end timestamps; an offset object is used because the 'M' string alias
# is deprecated for date_range in recent pandas.
q = pd.date_range('1/1/2021', '1/1/2022', freq=pd.offsets.MonthEnd())

ts1 = pd.Series(np.random.rand(len(p)),
                index=p)
ts2 = pd.Series(np.random.rand(len(q)),
                index=q)
print(ts1.index)
print(ts2.index)
1
2
3
4
5
6
# Period.asfreq(): convert a period to another frequency.

# Yearly period for 2020; 'Y' replaces the deprecated 'A-DEC' alias.
p = pd.Period('2020', 'Y')
print(p)
print(p.asfreq('M', how='start'))  # first month of the year
print(p.asfreq('D', how='end'))    # last day of the year
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
# Converting between timestamps and periods with the .to_period() and
# .to_timestamp() methods.

# Month-end timestamps; an offset object is used because the 'M' string alias
# is deprecated for date_range in recent pandas.
p = pd.date_range('1/1/2021', periods=10, freq=pd.offsets.MonthEnd())
q = pd.period_range('2021', '2022', freq='M')

ts1 = pd.Series(np.random.rand(len(p)),
                index=p)
print(ts1.head())
print(ts1.to_period().head())
# Each month-end timestamp becomes the monthly period that contains it.
ts2 = pd.Series(np.random.rand(len(q)),
                index=q)
print(ts2.head())
print(ts2.to_timestamp().head())
# Each monthly period becomes the timestamp of that month's first day.

时间序列 - 索引及切片

1
2
3
4
5
6
7
8
9
10
# Indexing a time series.
from datetime import datetime
p = pd.date_range('1/2021', '3/2021')
ts = pd.Series(np.random.rand(len(p)), index=p)
print(ts.head())

# Positional access goes through .iloc: integer keys in ts[...] on a
# non-integer index are deprecated in recent pandas.
print(ts.iloc[0])
print(ts.iloc[:2])
print('----')
# A date string is matched against the index labels.
print(ts['2021/2/28'])
1
2
3
4
5
6
7
8
# Slicing a time series (uses the `ts` series built in the indexing example).

print(ts.head())
# print(ts[::2]) would take every second row.
# Slice between two date strings.
_date_window = ts.loc['20210101':'20210111']
print(_date_window)
print('---')

# A year/month string selects every row of that month.
_january = ts.loc['2021/01']
print(_january)
1
2
3
4
5
6
7
8
9
10
# Time series whose index contains repeated timestamps.

dates = pd.DatetimeIndex(['1/2/2015', '1/3/2015', '1/2/2015', '2/4/2015', '1/3/2015'])
ts = pd.Series(np.random.rand(5), index=dates)

print(ts)
# index.is_unique reports whether the labels are unique; ts.is_unique reports
# whether the values are. (The original printed the value check twice.)
print(ts.index.is_unique, ts.is_unique)
print('---')

# Group rows that share a timestamp and average them.
print(ts.groupby(level=0).mean())

时间序列 - 重采样

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
# Resampling with resample().

p = pd.date_range('1/2021', periods=12)
ts = pd.Series(np.arange(12), index=p)
print(ts)

# resample() alone returns a Resampler object; an aggregation such as sum()
# must be applied to get values back.
ts_re = ts.resample('5D')
ts_re2 = ts.resample('5D').sum()
for _obj in (ts_re, ts_re2):
    print(_obj, type(_obj))
print('----')

# Other aggregations over the same 5-day bins; 'ohlc' gives open/high/low/close.
for _agg in ('mean', 'max', 'min', 'median', 'first', 'last', 'ohlc'):
    print(getattr(ts.resample('5D'), _agg)())
1
2
3
4
5
6
7
8
9
10
11
12
13
# Downsampling: which bin edge is closed and which edge labels the bin.

p = pd.date_range('1/21/2021', periods=12)
ts = pd.Series(np.arange(1, 13), index=p)
print(ts)

# closed= selects which side of each 5-day interval is inclusive.
print(ts.resample('5D').sum(), '→ 默认\n')
for _side in ('left', 'right'):
    print(ts.resample('5D', closed=_side).sum(), f'→ {_side}\n')
print('----')

# label= selects which edge of the interval is used as the bin's label.
for _side in ('left', 'right'):
    print(ts.resample('5D', label=_side).sum(), f'→ {_side}label\n')
1
2
3
4
5
6
7
8
9
10
11
# Upsampling and filling the gaps.
# 'h' and '15min' replace the deprecated 'H' and '15T' frequency aliases.

import pandas as pd
import numpy as np
p = pd.date_range('1/1/2021 0:0:0', periods=5, freq='h')
ts = pd.DataFrame(np.arange(15).reshape(5, 3), index=p, columns=['a', 'b', 'c'])
print(ts)

print(ts.resample('15min').asfreq())  # new rows are left as NaN
print(ts.resample('15min').ffill())   # new rows take the previous value
print(ts.resample('15min').bfill())   # new rows take the next value
1
2
3
4
5
6
7
8
# Resampling a monthly series (index of month-end timestamps).
# Offset objects are used because the 'M' / '3M' string aliases are deprecated
# for date_range and resample in recent pandas.

p = pd.date_range('2021', '2022', freq=pd.offsets.MonthEnd())
ts = pd.Series(np.arange(len(p)), index=p)
print(ts)

print(ts.resample(pd.offsets.MonthEnd(3)).sum())  # downsample to 3-month bins
print(ts.resample('15D').ffill())                 # upsample to 15-day steps, forward-filled