2021 National College Student Computer Application Ability and Information Literacy Competition. Note: I wrote this code shortly after I started learning, so it has many shortcomings and is offered for reference only. Since the problems and related materials were no longer available after the competition ended, I am sharing my solution here.
The code is as follows:

```python
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the undergraduate-track competition dataset
data = pd.read_csv('C:/Users/86155/Desktop/大数据竞赛数据集_本科组.csv')
data.head()
```
```
         X0        X1         X2        X3  X4    X5     X6      X7    X8  X9  X10  X11  X12   X13       X14
0  1/1/2015  Quarter1     sweing  Thursday   8  0.80  26.16  1108.0  7080  98  0.0    0    0  59.0  0.940725
1  1/1/2015  Quarter1  finishing  Thursday   1  0.75   3.94     NaN   960   0  0.0    0    0   8.0  0.886500
2  1/1/2015  Quarter1     sweing  Thursday  11  0.80  11.41   968.0  3660  50  0.0    0    0  30.5  0.800570
3  1/1/2015  Quarter1     sweing  Thursday  12  0.80  11.41   968.0  3660  50  0.0    0    0  30.5  0.800570
4  1/1/2015  Quarter1     sweing  Thursday   6  0.80  25.90  1170.0  1920  50  0.0    0    0  56.0  0.800382
```
A second dataset, xinguan.xlsx (COVID-19 case counts by region), is loaded the same way. Note that this cell rebinds `data`; the rest of the walkthrough works on the competition dataset, so that frame needs to be reloaded before running the cells below.

```python
import pandas as pd

data = pd.read_excel('C:/Users/86155/Desktop/xinguan.xlsx')
data.head()
```
The columns are city, cumulative confirmed, current suspected, cumulative deaths, longitude, and latitude:

```
    城市  累计确诊  现有疑似  累计死亡      经度     纬度
0   北京      1651        0        9  116.46  39.92
1   天津      1372        0        3  117.20  39.12
2   河北      1671        0        7  114.52  38.05
3   山西       295        0        0  112.55  37.87
4  内蒙古      1666        0        1  111.73  40.83
```
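Since the table carries 经度/纬度 (longitude/latitude) columns, a quick spatial scatter is a natural sanity check. A minimal sketch of my own, not part of the original notebook, assuming `data` still holds the xinguan frame:

```python
import matplotlib.pyplot as plt

# Bubble size scaled by cumulative confirmed cases (累计确诊)
plt.figure(figsize=(8, 6))
plt.scatter(data['经度'], data['纬度'], s=data['累计确诊'] / 20, alpha=0.5)
plt.xlabel('经度 (longitude)')
plt.ylabel('纬度 (latitude)')
plt.show()
```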
Back to the competition dataset. `data.info()` shows that only X7 has missing values (691 of 1197 non-null):

```
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1197 entries, 0 to 1196
Data columns (total 15 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   X0      1197 non-null   datetime64[ns]
 1   X1      1197 non-null   object
 2   X2      1197 non-null   object
 3   X3      1197 non-null   object
 4   X4      1197 non-null   int64
 5   X5      1197 non-null   float64
 6   X6      1197 non-null   float64
 7   X7      691 non-null    float64
 8   X8      1197 non-null   int64
 9   X9      1197 non-null   int64
 10  X10     1197 non-null   float64
 11  X11     1197 non-null   int64
 12  X12     1197 non-null   int64
 13  X13     1197 non-null   float64
 14  X14     1197 non-null   float64
dtypes: datetime64[ns](1), float64(6), int64(5), object(3)
memory usage: 140.4+ KB
```
`data.isnull()` returns a boolean frame of the same shape; all displayed values are False:

```
         X0     X1     X2     X3  ...    X12    X13    X14
0     False  False  False  False  ...  False  False  False
1     False  False  False  False  ...  False  False  False
2     False  False  False  False  ...  False  False  False
3     False  False  False  False  ...  False  False  False
4     False  False  False  False  ...  False  False  False
...     ...    ...    ...    ...  ...    ...    ...    ...
1192  False  False  False  False  ...  False  False  False
1193  False  False  False  False  ...  False  False  False
1194  False  False  False  False  ...  False  False  False
1195  False  False  False  False  ...  False  False  False
1196  False  False  False  False  ...  False  False  False

[1197 rows × 15 columns]
```
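Scanning a 1197×15 boolean frame by eye is error-prone; summing per column gives the missing counts directly. A small sketch of my own, assuming `data` is the competition DataFrame:

```python
# Missing values per column; per data.info() above, only X7 should be
# non-zero (1197 - 691 = 506 NaNs).
data.isnull().sum()
```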
And `data.dtypes` confirms the per-column types:

```
X0     datetime64[ns]
X1             object
X2             object
X3             object
X4              int64
X5            float64
X6            float64
X7            float64
X8              int64
X9              int64
X10           float64
X11             int64
X12             int64
X13           float64
X14           float64
dtype: object
```
The 506 NaNs in X7 are filled with the column median:

```python
# Fill the missing X7 values with the median (1039.0, per describe() below)
data['X7'] = data['X7'].fillna(data['X7'].median())
```
`data.describe()` summarizes the numeric columns:

```
                X4           X5           X6            X7            X8           X9          X10          X11          X12          X13          X14
count  1197.000000  1197.000000  1197.000000   1197.000000   1197.000000  1197.000000  1197.000000  1197.000000  1197.000000  1197.000000  1197.000000
mean      6.426901     0.729632    15.062172   1126.437761   4567.460317    38.210526     0.730159     0.369256     0.150376    34.609858     0.735091
std       3.463963     0.097891    10.943219   1397.653191   3348.823563   160.182643    12.709757     3.268987     0.427848    22.197687     0.174488
min       1.000000     0.070000     2.900000      7.000000      0.000000     0.000000     0.000000     0.000000     0.000000     2.000000     0.233705
25%       3.000000     0.700000     3.940000    970.000000   1440.000000     0.000000     0.000000     0.000000     0.000000     9.000000     0.650307
50%       6.000000     0.750000    15.260000   1039.000000   3960.000000     0.000000     0.000000     0.000000     0.000000    34.000000     0.773333
75%       9.000000     0.800000    24.260000   1083.000000   6960.000000    50.000000     0.000000     0.000000     0.000000    57.000000     0.850253
max      12.000000     0.800000    54.560000  23122.000000  25920.000000  3600.000000   300.000000    45.000000     2.000000    89.000000     1.120437
```
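The summary also hints at outliers: X7's max (23122) sits far above its 75th percentile (1083). A rule-of-thumb IQR check, added by me as a sketch rather than part of the original workflow:

```python
# Flag X7 values outside the 1.5×IQR whiskers
q1, q3 = data['X7'].quantile([0.25, 0.75])
iqr = q3 - q1
mask = (data['X7'] < q1 - 1.5 * iqr) | (data['X7'] > q3 + 1.5 * iqr)
print(mask.sum(), 'potential outliers in X7')
```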
Check for duplicated rows:

```python
dIndex = data.duplicated()
dIndex
```

```
0       False
1       False
2       False
3       False
4       False
        ...
1192    False
1193    False
1194    False
1195    False
1196    False
Length: 1197, dtype: bool
```
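Every row comes back False, so there is nothing to drop; had any been True, one line would handle it. A sketch under that assumption:

```python
# Keep the first occurrence of each duplicated row (a no-op here,
# since duplicated() returned all False above)
data = data.drop_duplicates().reset_index(drop=True)
```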
`data.head()` now shows the filled value in row 1 (X7 = 1039.0):

```
         X0        X1         X2        X3  X4    X5     X6      X7    X8  X9  X10  X11  X12   X13       X14
0  1/1/2015  Quarter1     sweing  Thursday   8  0.80  26.16  1108.0  7080  98  0.0    0    0  59.0  0.940725
1  1/1/2015  Quarter1  finishing  Thursday   1  0.75   3.94  1039.0   960   0  0.0    0    0   8.0  0.886500
2  1/1/2015  Quarter1     sweing  Thursday  11  0.80  11.41   968.0  3660  50  0.0    0    0  30.5  0.800570
3  1/1/2015  Quarter1     sweing  Thursday  12  0.80  11.41   968.0  3660  50  0.0    0    0  30.5  0.800570
4  1/1/2015  Quarter1     sweing  Thursday   6  0.80  25.90  1170.0  1920  50  0.0    0    0  56.0  0.800382
```
Correlation heatmap of the numeric features. The original passed X0–X3 into `corr()` as well; older pandas silently dropped those non-numeric columns, while recent versions raise, so only the numeric columns are selected here:

```python
import seaborn as sns

plt.rcParams['font.sans-serif'] = 'SimHei'  # enable Chinese glyphs in plots
# X0–X3 (date/quarter/department/weekday) are non-numeric and excluded
corr = data[['X4','X5','X6','X7','X8','X9','X10','X11','X12','X13','X14']].corr()
plt.figure(figsize=(10, 10))
sns.heatmap(corr, vmax=0.8, annot=True)
plt.show()
```
Line plots of X5 against the date. The original rebound the name `data` to a plain dict here, clobbering the DataFrame for every later cell; a separate name avoids that:

```python
x = data['X0']
y = data['X5']

plt.plot(x, y)
plt.show()

# Use a new name instead of rebinding `data` (a bug in the original)
plot_data = {'x': x, 'y': y}
df = pd.DataFrame(plot_data)
sns.lineplot(x='x', y='y', data=df)
plt.show()
```
The same pair of plots for X14:

```python
x = data['X0']
y = data['X14']

plt.plot(x, y)
plt.show()

plot_data = {'x': x, 'y': y}  # again, avoid rebinding `data`
df = pd.DataFrame(plot_data)
sns.lineplot(x='x', y='y', data=df)
plt.show()
```
Another look at `data.head()`:

```
         X0        X1         X2        X3  X4    X5     X6      X7    X8  X9  X10  X11  X12   X13       X14
0  1/1/2015  Quarter1     sweing  Thursday   8  0.80  26.16  1108.0  7080  98  0.0    0    0  59.0  0.940725
1  1/1/2015  Quarter1  finishing  Thursday   1  0.75   3.94     NaN   960   0  0.0    0    0   8.0  0.886500
2  1/1/2015  Quarter1     sweing  Thursday  11  0.80  11.41   968.0  3660  50  0.0    0    0  30.5  0.800570
3  1/1/2015  Quarter1     sweing  Thursday  12  0.80  11.41   968.0  3660  50  0.0    0    0  30.5  0.800570
4  1/1/2015  Quarter1     sweing  Thursday   6  0.80  25.90  1170.0  1920  50  0.0    0    0  56.0  0.800382
```
The distinct values of X4 (presumably `data['X4'].unique()`, the team numbers):

```
array([ 8,  1, 11, 12,  6,  7,  2,  3,  9, 10,  5,  4], dtype=int64)
```
Drop the non-numeric columns before clustering:

```python
data1 = data.drop(['X0','X1','X2','X3'], axis=1)
data1.head()
```
```
   X4    X5     X6      X7    X8  X9  X10  X11  X12   X13       X14
0   8  0.80  26.16  1108.0  7080  98  0.0    0    0  59.0  0.940725
1   1  0.75   3.94  1039.0   960   0  0.0    0    0   8.0  0.886500
2  11  0.80  11.41   968.0  3660  50  0.0    0    0  30.5  0.800570
3  12  0.80  11.41   968.0  3660  50  0.0    0    0  30.5  0.800570
4   6  0.80  25.90  1170.0  1920  50  0.0    0    0  56.0  0.800382
```
Standardize the features and compute the K-means SSE (inertia) for k = 1…19:

```python
from sklearn.cluster import KMeans
from sklearn.preprocessing import scale

x = pd.DataFrame(scale(data1))  # z-score standardization
SSE = []
k_range = range(1, 20)
for k in k_range:
    km = KMeans(n_clusters=k)
    km.fit(x)
    SSE.append(km.inertia_)  # within-cluster sum of squares
SSE
```

```
[13167.000000000002,
 10368.001747788787,
 8985.612362104612,
 7956.326143721504,
 6934.1999209667265,
 6364.89500698171,
 5600.08786819714,
 4740.428352871961,
 4360.332443867376,
 3918.830271323268,
 3613.3367590027497,
 3302.354785621584,
 3095.576217636003,
 2963.791234943202,
 2686.863451514291,
 2584.874003399748,
 2427.534059702044,
 2337.8239778345887,
 2250.377455323783]
```
Plot the elbow curve. Note that the loop below refits K-means on just X4/X14 but discards those fits; the plotted SSE values come from the previous cell:

```python
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

df_features = data1
for k in range(1, 20):
    estimator = KMeans(n_clusters=k)
    estimator.fit(df_features[['X4', 'X14']])  # these fits are never used

X = range(1, 20)
plt.xlabel('k')
plt.ylabel('sse')
plt.plot(X, SSE, 'o-')  # SSE from the previous cell
plt.show()
```
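The elbow plot is one heuristic for choosing k; the silhouette score is a useful cross-check. A minimal sketch of my own (silhouette is only defined for k ≥ 2):

```python
from sklearn.metrics import silhouette_score

# Higher silhouette = tighter, better-separated clusters
for k in range(2, 8):
    km = KMeans(n_clusters=k).fit(x)
    print(k, round(silhouette_score(x, km.labels_), 4))
```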
Cluster on X4 and X14 with k = 5:

```python
data2 = data1[['X4', 'X14']]

clf = KMeans(n_clusters=5)
clf = clf.fit(data2)
clf.labels_
```

```
array([4, 0, 3, ..., 4, 1, 4])
```
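To see how the 1197 rows divide among the five clusters, a quick tally (my addition, not in the original):

```python
# Rows per cluster label
pd.Series(clf.labels_).value_counts().sort_index()
```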
```python
centroids = clf.cluster_centers_
centroids
```

```
array([[ 1.50934579,  0.7954856 ],
       [ 9.49019608,  0.72724343],
       [ 3.99317406,  0.75813807],
       [11.52941176,  0.73337501],
       [ 7.05016722,  0.67570864]])
```
Put the centroids into a DataFrame:

```python
columns = ['X4', 'X14']
result = pd.DataFrame(centroids, columns=columns)
result
```

```
          X4       X14
0   1.509346  0.795486
1   9.490196  0.727243
2   3.993174  0.758138
3  11.529412  0.733375
4   7.050167  0.675709
```
```python
# z-score the centroids so both features are on a comparable scale
result = (result - result.mean(axis=0)) / (result.std(axis=0))
result
```

```
         X4       X14
0 -1.286294  1.308114
1  0.685944 -0.244505
2 -0.672487  0.458397
3  1.189877 -0.105002
4  0.082961 -1.417004
```
Bar chart of the standardized centroids per team cluster (团队分类 = team category). The original passed `legend='beast'`, a typo; `DataFrame.plot` expects a boolean here:

```python
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False  # render minus signs with SimHei
result['团队分类'] = ['团队1', '团队2', '团队3', '团队4', '团队5']
p = result.plot.bar(figsize=(20, 16), x='团队分类', title='团队分类', legend=True)
p.grid(linestyle='--')
p.legend(fontsize=11)
```

```
<matplotlib.legend.Legend at 0x290ef9f0940>
```
The same elbow procedure, re-run for the all-feature clustering (K-means initialization is random, so the SSE values differ slightly from the first run):

```python
from sklearn.preprocessing import scale

x = pd.DataFrame(scale(data1))
SSE = []
k_range = range(1, 20)
for k in k_range:
    km = KMeans(n_clusters=k)
    km.fit(x)
    SSE.append(km.inertia_)
SSE
```

```
[13167.000000000002,
 10368.001747788787,
 9059.320705761347,
 7910.63025239119,
 6956.651138200236,
 6077.878903205976,
 5511.898303743294,
 4795.324291351495,
 4429.152799214559,
 3906.8041968464554,
 3603.2482162246365,
 3316.165899966484,
 3147.436870420017,
 2878.2859554852253,
 2684.5121690958435,
 2563.069034412572,
 2421.7083978310657,
 2315.6588424369897,
 2228.865852777383]
```
And the corresponding elbow plot (again, the inner loop's fits are discarded; the plot uses SSE from the cell above):

```python
import pandas as pd
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

df_features = data1
for k in range(1, 20):
    estimator = KMeans(n_clusters=k)
    estimator.fit(df_features[['X4','X5','X6','X7','X8','X9',
                               'X10','X11','X12','X13','X14']])  # unused fits

X = range(1, 20)
plt.xlabel('k')
plt.ylabel('sse')
plt.plot(X, SSE, 'o-')
plt.show()
```
Cluster on all eleven features with k = 5:

```python
clf = KMeans(n_clusters=5)
clf = clf.fit(data1)
clf.labels_
```

```
array([1, 4, 0, ..., 4, 4, 4])
```
```python
centroids = clf.cluster_centers_
centroids
```

```
array([[ 7.64179104e+00,  7.33582090e-01,  1.41852239e+01,
         9.67542289e+02,  3.75223881e+03,  2.67114428e+01,
         2.23880597e-02,  1.49253731e-01,  2.43781095e-01,
         3.34527363e+01,  7.22864179e-01],
       [ 5.97450425e+00,  7.15212465e-01,  2.35174788e+01,
         1.06944476e+03,  6.65855524e+03,  4.08356941e+01,
         2.37960340e+00,  7.13881020e-01,  3.03116147e-01,
         5.24617564e+01,  7.15187532e-01],
       [ 5.93333333e+00,  7.23939394e-01,  2.39216364e+01,
         1.08087273e+03,  1.05087273e+04,  4.88242424e+01,
         7.77156117e-16,  7.21644966e-16, -1.66533454e-16,
         5.49757576e+01,  7.34676496e-01],
       [ 5.33333333e+00,  8.00000000e-01,  2.13800000e+01,
         1.94093333e+04,  6.19000000e+03,  7.86666667e+01,
         1.11022302e-16,  0.00000000e+00,  0.00000000e+00,
         5.35833333e+01,  8.83637858e-01],
       [ 6.43432203e+00,  7.39830508e-01,  5.93468220e+00,
         1.02024576e+03,  1.25317797e+03,  3.69194915e+01,
         6.25000000e-02,  3.38983051e-01,  5.08474576e-02,
         1.43908898e+01,  7.53440033e-01]])
```
```python
columns = ['X4','X5','X6','X7','X8','X9','X10','X11','X12','X13','X14']
result = pd.DataFrame(centroids, columns=columns)
result
```

```
         X4        X5         X6            X7            X8         X9           X10           X11           X12        X13       X14
0  7.641791  0.733582  14.185224    967.542289   3752.238806  26.711443  2.238806e-02  1.492537e-01  2.437811e-01  33.452736  0.722864
1  5.974504  0.715212  23.517479   1069.444759   6658.555241  40.835694  2.379603e+00  7.138810e-01  3.031161e-01  52.461756  0.715188
2  5.933333  0.723939  23.921636   1080.872727  10508.727273  48.824242  7.771561e-16  7.216450e-16 -1.665335e-16  54.975758  0.734676
3  5.333333  0.800000  21.380000  19409.333333   6190.000000  78.666667  1.110223e-16  0.000000e+00  0.000000e+00  53.583333  0.883638
4  6.434322  0.739831   5.934682   1020.245763   1253.177966  36.919492  6.250000e-02  3.389831e-01  5.084746e-02  14.390890  0.753440
```
```python
result = (result - result.mean(axis=0)) / (result.std(axis=0))
result
```

```
         X4        X5        X6        X7        X8        X9       X10       X11       X12       X13       X14
0  1.595128 -0.266799 -0.468191 -0.455358 -0.555229 -0.997981 -0.445977 -0.304834  0.865386 -0.471215 -0.562291
1 -0.334401 -0.815573  0.744629 -0.442958  0.285093 -0.281737  1.788331  1.583047  1.278707  0.605367 -0.672696
2 -0.382048 -0.554865  0.797153 -0.441567  1.398317  0.123365 -0.467198 -0.803878 -0.832763  0.747749 -0.392407
3 -1.076420  1.717371  0.466842  1.788828  0.149616  1.636681 -0.467198 -0.803878 -0.832763  0.668888  1.749945
4  0.197740 -0.080134 -1.540432 -0.448945 -1.277798 -0.480328 -0.407957  0.329542 -0.478566 -1.550789 -0.122551
```
The same bar chart for the all-feature clustering (again with the `legend='beast'` typo corrected):

```python
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
result['团队分类'] = ['团队1', '团队2', '团队3', '团队4', '团队5']
p = result.plot.bar(figsize=(20, 16), x='团队分类', title='团队分类', legend=True)
p.grid(linestyle='--')
p.legend(fontsize=11)
```

```
<matplotlib.legend.Legend at 0x290efa33780>
```
Parse X0 as a datetime and keep the window 2015-01-01 through 2015-03-01:

```python
data['X0'] = pd.to_datetime(data['X0'], format='%m/%d/%Y')

import datetime
start = datetime.datetime(2015, 1, 1)
end = datetime.datetime(2015, 3, 1)

subset = data[data['X0'] >= start]
data3 = subset[subset['X0'] <= end]
data3.shape
```

```
(1005, 15)
```
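As an aside, the two-step mask can be collapsed with `Series.between`, which is inclusive on both ends by default, matching the chained comparisons above. A sketch:

```python
# Equivalent to the subset/data3 construction above
data3 = data[data['X0'].between(start, end)]
data3.shape  # (1005, 15)
```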
```
             X0        X1         X2        X3  X4    X5     X6      X7    X8  X9  X10  X11  X12   X13       X14
0    2015-01-01  Quarter1     sweing  Thursday   8  0.80  26.16  1108.0  7080  98  0.0    0    0  59.0  0.940725
1    2015-01-01  Quarter1  finishing  Thursday   1  0.75   3.94  1039.0   960   0  0.0    0    0   8.0  0.886500
2    2015-01-01  Quarter1     sweing  Thursday  11  0.80  11.41   968.0  3660  50  0.0    0    0  30.5  0.800570
3    2015-01-01  Quarter1     sweing  Thursday  12  0.80  11.41   968.0  3660  50  0.0    0    0  30.5  0.800570
4    2015-01-01  Quarter1     sweing  Thursday   6  0.80  25.90  1170.0  1920  50  0.0    0    0  56.0  0.800382
...         ...       ...        ...       ...  ..   ...    ...     ...   ...  ..  ...  ...  ...   ...       ...
1000 2015-03-01  Quarter1  finishing    Sunday   2  0.70   3.90  1039.0   960   0  0.0    0    0   8.0  0.585000
1001 2015-03-01  Quarter1     sweing    Sunday   7  0.80  30.10   934.0  6960   0  3.5   15    0  58.0  0.579511
1002 2015-03-01  Quarter1  finishing    Sunday   1  0.60   3.94  1039.0  3360   0  0.0    0    0   8.0  0.448722
1003 2015-03-01  Quarter1  finishing    Sunday   9  0.75   2.90  1039.0   960   0  0.0    0    0   8.0  0.447083
1004 2015-03-01  Quarter1  finishing    Sunday   7  0.80   4.60  1039.0  3360   0  0.0    0    0   8.0  0.350417

[1005 rows × 15 columns]
```
The categories in X2 (likely `data3['X2'].unique()`):

```
array(['sweing', 'finishing ', 'finishing'], dtype=object)
```
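Note the trailing space in 'finishing ': it splits one department into two categories. X2 is dropped before modelling here, but if it were kept, a cleanup sketch (my addition) would be:

```python
# Normalize the labels so 'finishing ' and 'finishing' merge;
# work on a copy to avoid SettingWithCopyWarning on the slice
data3 = data3.copy()
data3['X2'] = data3['X2'].str.strip()
data3['X2'].unique()
```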
```python
data4 = data3.drop(['X0','X1','X2','X3'], axis=1)
data4.head()
```

```
   X4    X5     X6      X7    X8  X9  X10  X11  X12   X13       X14
0   8  0.80  26.16  1108.0  7080  98  0.0    0    0  59.0  0.940725
1   1  0.75   3.94  1039.0   960   0  0.0    0    0   8.0  0.886500
2  11  0.80  11.41   968.0  3660  50  0.0    0    0  30.5  0.800570
3  12  0.80  11.41   968.0  3660  50  0.0    0    0  30.5  0.800570
4   6  0.80  25.90  1170.0  1920  50  0.0    0    0  56.0  0.800382
```
Use X14 as the target and the remaining ten numeric columns as features:

```python
y = data4.X14.values
X = data4.drop(['X14'], axis=1)

X.shape
```

```
(1005, 10)
```
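X14 is continuous (roughly 0.23–1.12 per `describe()` above), so the `astype('int')` casts used with the classifiers below effectively binarize it: values under 1 truncate to 0, values of 1 or more to 1. An explicit equivalent, as a sketch of my own:

```python
# 0/1 label: did actual productivity reach 1.0?
# For values in [0, 2) this matches y.astype('int') as used below.
y_binary = (y >= 1).astype(int)
```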
Split, standardize, and fit a logistic regression:

```python
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=6)
```

```python
from sklearn.preprocessing import StandardScaler

standardScaler = StandardScaler()
standardScaler.fit(X_train)
X_train = standardScaler.transform(X_train)
X_test = standardScaler.transform(X_test)
```

```python
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression()
# y is continuous; astype('int') truncates it to the 0/1 label described above
log_reg.fit(X_train, y_train.astype('int'))
```
```
LogisticRegression()
```
```python
log_reg.score(X_train, y_train.astype('int'))
```

```
0.9867197875166003
```

```python
log_reg.score(X_test, y_test.astype('int'))
```

```
0.9761904761904762
```
```python
from sklearn.metrics import accuracy_score

y_predict_log = log_reg.predict(X_test)
accuracy_score(y_test.astype('int'), y_predict_log)
```

```
0.9761904761904762
```
Grid-search the regularization settings:

```python
param_grid = [
    {
        'C': [0.01, 0.1, 1, 10, 100],
        # note: penalty='l1' needs solver='liblinear' or 'saga';
        # with the default lbfgs solver those fits fail in recent sklearn
        'penalty': ['l2', 'l1'],
        'class_weight': ['balanced', None]
    }
]
```
```python
from sklearn.model_selection import GridSearchCV

grid_search = GridSearchCV(log_reg, param_grid, cv=10, n_jobs=-1)
```

```python
%%time
grid_search.fit(X_train, y_train.astype('int'))
```

```
Wall time: 1.69 s

GridSearchCV(cv=10, estimator=LogisticRegression(), n_jobs=-1,
             param_grid=[{'C': [0.01, 0.1, 1, 10, 100],
                          'class_weight': ['balanced', None],
                          'penalty': ['l2', 'l1']}])
```
```python
grid_search.best_estimator_
```

```
LogisticRegression(C=1)
```
```python
grid_search.best_score_
```

```
0.9853859649122807
```

```python
grid_search.best_params_
```

```
{'C': 1, 'class_weight': None, 'penalty': 'l2'}
```
```python
log_reg = grid_search.best_estimator_
log_reg.score(X_train, y_train.astype('int'))
```

```
0.9867197875166003
```

```python
log_reg.score(X_test, y_test.astype('int'))
```

```
0.9761904761904762
```
```python
from sklearn.metrics import f1_score

f1_score(y_test.astype('int'), y_predict_log)
```

```
0.5
```
```python
from sklearn.metrics import classification_report

print(classification_report(y_test.astype('int'), y_predict_log))
```

```
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       244
           1       0.75      0.38      0.50         8

    accuracy                           0.98       252
   macro avg       0.86      0.69      0.74       252
weighted avg       0.97      0.98      0.97       252
```
```python
from sklearn.metrics import confusion_matrix

cnf_matrix = confusion_matrix(y_test.astype('int'), y_predict_log)
cnf_matrix
```

```
array([[243,   1],
       [  5,   3]], dtype=int64)
```
```python
def plot_cnf_matirx(cnf_matrix, description):
    """Draw a labelled confusion-matrix heatmap."""
    class_names = [0, 1]
    fig, ax = plt.subplots()
    tick_marks = np.arange(len(class_names))
    plt.xticks(tick_marks, class_names)
    plt.yticks(tick_marks, class_names)
    sns.heatmap(pd.DataFrame(cnf_matrix), annot=True, cmap='OrRd', fmt='g')
    ax.xaxis.set_label_position('top')
    plt.tight_layout()
    plt.title(description, y=1.1, fontsize=16)
    plt.ylabel('实际值0/1', fontsize=12)   # actual 0/1
    plt.xlabel('预测值0/1', fontsize=12)   # predicted 0/1
    plt.show()

plot_cnf_matirx(cnf_matrix, 'Confusion matrix -- Logistic Regression')
```
Precision and recall as the decision threshold varies:

```python
decision_scores = log_reg.decision_function(X_test)

from sklearn.metrics import precision_recall_curve

precisions, recalls, thresholds = precision_recall_curve(
    y_test.astype('int'), decision_scores)
```

```python
plt.plot(thresholds, precisions[:-1])
plt.plot(thresholds, recalls[:-1])
plt.grid()
plt.show()
```
```python
from sklearn.metrics import roc_curve

fprs, tprs, thresholds = roc_curve(y_test.astype('int'), decision_scores)
```

```python
def plot_roc_curve(fprs, tprs):
    plt.figure(figsize=(8, 6), dpi=80)
    plt.plot(fprs, tprs)
    plt.plot([0, 1], linestyle='--')   # chance diagonal
    plt.xticks(fontsize=13)
    plt.yticks(fontsize=13)
    plt.ylabel('TP rate', fontsize=15)
    plt.xlabel('FP rate', fontsize=15)
    plt.title('ROC', fontsize=17)
    plt.show()

plot_roc_curve(fprs, tprs)
```
```python
from sklearn.metrics import roc_auc_score

roc_auc_score(y_test.astype('int'), decision_scores)
```

```
0.825563524590164
```
The logistic regression model scores an AUC of 0.82556.

Next, KNN, grid-searching both weighting schemes:

```python
param_grid = [
    {
        'weights': ['uniform'],
        'n_neighbors': [i for i in range(1, 31)]
    },
    {
        'weights': ['distance'],
        'n_neighbors': [i for i in range(1, 31)],
        'p': [i for i in range(1, 6)]   # Minkowski power parameter
    }
]
```
```python
%%time
from sklearn.neighbors import KNeighborsClassifier

knn_clf = KNeighborsClassifier()
grid_search = GridSearchCV(knn_clf, param_grid)
grid_search.fit(X_train, y_train.astype('int'))
```

```
Wall time: 14.3 s

GridSearchCV(estimator=KNeighborsClassifier(),
             param_grid=[{'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                          13, 14, 15, 16, 17, 18, 19, 20, 21,
                                          22, 23, 24, 25, 26, 27, 28, 29, 30],
                          'weights': ['uniform']},
                         {'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                          13, 14, 15, 16, 17, 18, 19, 20, 21,
                                          22, 23, 24, 25, 26, 27, 28, 29, 30],
                          'p': [1, 2, 3, 4, 5], 'weights': ['distance']}])
```
```python
grid_search.best_estimator_
```

```
KNeighborsClassifier(n_neighbors=3)
```
`grid_search.best_score_` (presumably; the call itself is missing from the original post):

```
0.9840618101545253
```
```python
grid_search.best_params_
```

```
{'n_neighbors': 3, 'weights': 'uniform'}
```
```python
knn_clf = grid_search.best_estimator_
knn_clf.score(X_train, y_train.astype('int'))
```

```
0.9853917662682603
```

```python
knn_clf.score(X_test, y_test.astype('int'))
```

```
0.9801587301587301
```
```python
y_predict_knn = knn_clf.predict(X_test)

f1_score(y_test.astype('int'), y_predict_knn)
```

```
0.5454545454545454
```
```python
print(classification_report(y_test.astype('int'), y_predict_knn))
```

```
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       244
           1       1.00      0.38      0.55         8

    accuracy                           0.98       252
   macro avg       0.99      0.69      0.77       252
weighted avg       0.98      0.98      0.98       252
```
```python
cnf_matrix = confusion_matrix(y_test.astype('int'), y_predict_knn)
cnf_matrix
```

```
array([[244,   0],
       [  5,   3]], dtype=int64)
```

```python
plot_cnf_matirx(cnf_matrix, 'Confusion matrix -- KNN')
```
```python
y_probabilities = knn_clf.predict_proba(X_test)[:, 1]

from sklearn.metrics import precision_recall_curve

precisions, recalls, thresholds = precision_recall_curve(
    y_test.astype('int'), y_probabilities)

plt.plot(thresholds, precisions[:-1])
plt.plot(thresholds, recalls[:-1])
plt.grid()
plt.show()
```
ROC curve for KNN. The original passed `decision_scores` (the logistic-regression scores) here by mistake; the KNN probabilities are what's wanted:

```python
from sklearn.metrics import roc_curve

# Fixed: use the KNN probabilities, not the logistic-regression decision_scores
fprs2, tprs2, thresholds = roc_curve(y_test.astype('int'), y_probabilities)
plot_roc_curve(fprs2, tprs2)
```
```python
from sklearn.metrics import roc_auc_score

roc_auc_score(y_test.astype('int'), y_probabilities)
```

```
0.7984118852459017
```
The KNN model scores an AUC of 0.79841.

Next, a decision tree:

```python
from sklearn.tree import DecisionTreeClassifier

dt_clf = DecisionTreeClassifier(random_state=6)
```
```python
from sklearn.model_selection import GridSearchCV

param_grid = [
    {
        # 'auto' meant 'sqrt' for classifiers and was removed in scikit-learn 1.3
        'max_features': ['auto', 'sqrt', 'log2'],
        'min_samples_split': list(range(2, 19)),
        'min_samples_leaf': list(range(1, 12))
    }
]
grid_search = GridSearchCV(dt_clf, param_grid)
grid_search.fit(X_train, y_train.astype('int'))
```

```
GridSearchCV(estimator=DecisionTreeClassifier(random_state=6),
             param_grid=[{'max_features': ['auto', 'sqrt', 'log2'],
                          'min_samples_leaf': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
                                               11],
                          'min_samples_split': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                                                12, 13, 14, 15, 16, 17, 18]}])
```
```python
grid_search.best_estimator_
```

```
DecisionTreeClassifier(max_features='auto', min_samples_split=9, random_state=6)
```
`grid_search.best_score_` (the call itself is missing from the original post):

```
0.9814128035320089
```
```python
grid_search.best_params_
```

```
{'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 9}
```
```python
dt_clf = grid_search.best_estimator_
dt_clf.score(X_train, y_train.astype('int'))
```

```
0.9814077025232404
```

```python
dt_clf.score(X_test, y_test.astype('int'))
```

```
0.9761904761904762
```
```python
y_predict_dt = dt_clf.predict(X_test)

f1_score(y_test.astype('int'), y_predict_dt)
```

```
0.4
```
```python
print(classification_report(y_test.astype('int'), y_predict_dt))
```

```
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       244
           1       1.00      0.25      0.40         8

    accuracy                           0.98       252
   macro avg       0.99      0.62      0.69       252
weighted avg       0.98      0.98      0.97       252
```
```python
cnf_matrix = confusion_matrix(y_test.astype('int'), y_predict_dt)
cnf_matrix
```

```
array([[244,   0],
       [  6,   2]], dtype=int64)
```

```python
plot_cnf_matirx(cnf_matrix, 'Confusion matrix -- DecisionTree')
```
```python
y_probabilities = dt_clf.predict_proba(X_test)[:, 1]

from sklearn.metrics import precision_recall_curve

precisions, recalls, thresholds = precision_recall_curve(
    y_test.astype('int'), y_probabilities)

plt.plot(thresholds, precisions[:-1])
plt.plot(thresholds, recalls[:-1])
plt.grid()
plt.show()
```

```python
from sklearn.metrics import roc_curve

fprs3, tprs3, thresholds3 = roc_curve(y_test.astype('int'), y_probabilities)
plot_roc_curve(fprs3, tprs3)
```
```python
from sklearn.metrics import roc_auc_score

roc_auc_score(y_test.astype('int'), y_probabilities)
```

```
0.8539959016393442
```
The decision tree model scores an AUC of 0.85399.
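Pulling the three AUC scores reported above into one place for comparison (a recap sketch; the numbers are copied from the outputs above):

```python
# Test-set AUC of each model, from the roc_auc_score outputs above
auc_scores = {'LogisticRegression': 0.82556, 'KNN': 0.79841, 'DecisionTree': 0.85399}
for name, auc in sorted(auc_scores.items(), key=lambda kv: -kv[1]):
    print(f'{name}: {auc:.5f}')
```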
For the XGBoost experiment, keep only X4, X5, X6, X11, X12, and the target X14:

```python
data5 = data3.drop(['X0','X1','X2','X3','X7','X8','X9','X10','X13'], axis=1)
data5.head()
```
```
   X4    X5     X6  X11  X12       X14
0   8  0.80  26.16    0    0  0.940725
1   1  0.75   3.94    0    0  0.886500
2  11  0.80  11.41    0    0  0.800570
3  12  0.80  11.41    0    0  0.800570
4   6  0.80  25.90    0    0  0.800382
```
Split features and target. Note that X14 is used directly here, without the `astype('int')` binarization applied to the earlier models, so `XGBClassifier` will treat every distinct X14 value as its own class; binarizing first, as above, is likely what was intended:

```python
from sklearn.model_selection import train_test_split

data_target_part = data5['X14']
data_features_part = data5[[x for x in data5.columns if x != 'X14']]

# X14 is continuous; consider (data5['X14'] >= 1).astype(int) before classifying
x_train, x_test, y_train, y_test = train_test_split(
    data_features_part, data_target_part, test_size=0.2, random_state=2020)
```
```python
from xgboost.sklearn import XGBClassifier

clf = XGBClassifier()
clf.fit(x_train, y_train)
```
```python
train_predict = clf.predict(x_train)
test_predict = clf.predict(x_test)

from sklearn import metrics

# The original print labels said "Logistic Regression"; this is the XGBoost model
print('The accuracy of the XGBoost classifier is:',
      metrics.accuracy_score(y_train, train_predict))
print('The accuracy of the XGBoost classifier is:',
      metrics.accuracy_score(y_test, test_predict))

confusion_matrix_result = metrics.confusion_matrix(test_predict, y_test)
print('The confusion matrix result:\n', confusion_matrix_result)

plt.figure(figsize=(8, 6))
sns.heatmap(confusion_matrix_result, annot=True, cmap='Blues')
plt.xlabel('Predicted labels')
plt.ylabel('True labels')
plt.show()
```
```python
# Feature importances from the fitted model
sns.barplot(y=data_features_part.columns, x=clf.feature_importances_)
```
Grid-search the main XGBoost hyperparameters:

```python
from sklearn.model_selection import GridSearchCV

learning_rate = [0.1, 0.3, 0.6]
subsample = [0.8, 0.9]
colsample_bytree = [0.6, 0.8]
max_depth = [3, 5, 8]

parameters = {'learning_rate': learning_rate,
              'subsample': subsample,
              'colsample_bytree': colsample_bytree,
              'max_depth': max_depth}
model = XGBClassifier(n_estimators=50)

clf = GridSearchCV(model, parameters, cv=3, scoring='accuracy', verbose=1, n_jobs=-1)
clf = clf.fit(x_train, y_train)
clf.best_params_
```
Refit with the best parameters and evaluate:

```python
clf = XGBClassifier(colsample_bytree=0.6, learning_rate=0.3,
                    max_depth=8, subsample=0.9)
clf.fit(x_train, y_train)

train_predict = clf.predict(x_train)
test_predict = clf.predict(x_test)

print('The accuracy of the XGBoost classifier is:',
      metrics.accuracy_score(y_train, train_predict))
print('The accuracy of the XGBoost classifier is:',
      metrics.accuracy_score(y_test, test_predict))

confusion_matrix_result = metrics.confusion_matrix(test_predict, y_test)
print('The confusion matrix result:\n', confusion_matrix_result)

plt.figure(figsize=(8, 6))
sns.heatmap(confusion_matrix_result, annot=True, cmap='Blues')
plt.xlabel('Predicted labels')
plt.ylabel('True labels')
plt.show()
```
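If the tuned model were to be reused outside the notebook, persisting it is straightforward; a sketch of my own using joblib (the filename is hypothetical):

```python
import joblib

joblib.dump(clf, 'xgb_productivity.joblib')   # save the fitted model
clf_loaded = joblib.load('xgb_productivity.joblib')
```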
You can download the campus-round problems for the Information Literacy Competition from the link below; they are much the same from year to year. When I took the 2021 campus round, I practiced on the 2020 problems and they were still applicable. The link is permanent. Link: https://pan.baidu.com/s/1svu1k3AfeZfvsHoTbAs_eg  Extraction code: 8888