如下所示:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
|
#-*- coding: utf-8 -*- import pandas as pd import numpy as np from patsy.highlevel import dmatrices #2.7里面是from patsy import dmatrices from statsmodels.stats.outliers_influence import variance_inflation_factor import statsmodels.api as sm import scipy.stats as stats from sklearn.metrics import mean_squared_error import seaborn as sns import matplotlib.pyplot as plt import matplotlib.mlab as mlab import matplotlib #数据获取 ccpp = pd.read_excel( 'CCPP.xlsx' ) ccpp.describe() #绘制各变量之间的散点图 sns.pairplot(ccpp) plt.show() #发电量(PE)与自变量之间的相关系数 a = ccpp.corrwith(ccpp.PE) print (a) #将因变量PE,自变量AT,V,AP和截距项(值为1的1维数值)以数据框的形式组合起来 y,x = dmatrices( 'PE~AT+V+AP' ,data = ccpp,return_type = 'dataframe' ) #构造空的数据框 vif = pd.DataFrame() vif[" "VIF Factor" "] = [variance_inflation_factor(x.values,i) for i in range (x.shape[ 1 ])] vif[" "features" "] = x.columns print (vif) #构建PE与AT,V和AP之间的线性模型 fit = sm.formula.ols( 'PE~AT+V+AP' ,data = ccpp).fit() b = fit.summary() # print(b) #计算模型的RMSE值 pred = fit.predict() c = np.sqrt(mean_squared_error(ccpp.PE,pred)) print (c) #离群点检验 outliers = fit.get_influence() #高杠杆值点(帽子矩阵) leverage = outliers.hat_matrix_diag #dffits值 dffits = outliers.dffits[ 0 ] #学生化残差 resid_stu = outliers.resid_studentized_external #cook距离 cook = outliers.cooks_distance[ 0 ] #covratio值 covratio = outliers.cov_ratio #将上面的几种异常值检验统计量与原始数据集合并 contat1 = pd.concat([pd.Series(leverage,name = 'leverage' ),pd.Series(dffits,name = 'dffits' ), pd.Series(resid_stu,name = 'resid_stu' ),pd.Series(cook,name = 'cook' ), pd.Series(covratio,name = 'covratio' ),],axis = 1 ) ccpp_outliers = pd.concat([ccpp,contat1],axis = 1 ) d = ccpp_outliers.head() print (d) #计算异常值数量的比例 outliers_ratio = sum (np.where((np. abs (ccpp_outliers.resid_stu)> 2 ), 1 , 0 )) / ccpp_outliers.shape[ 0 ] e = outliers_ratio print (e) #删除异常值 ccpp_outliers = ccpp_outliers.loc[np. abs (ccpp_outliers.resid_stu)< = 2 ,] #重新建模 fit2 = sm.formula.ols( 'PE~AT+V+AP' ,data = ccpp_outliers).fit() f = fit2.summary() # print(f) pred2 = fit2.predict() g = np.sqrt(mean_squared_error(ccpp_outliers.PE,pred2)) print (g) # #残差的正态性检验(直方图法) resid = fit2.resid #中文和负号的正常显示 # plt.rcParams['font.sans=serif'] = ['Microsoft YaHei'] plt.rcParams[ 'font.sans-serif' ] = [ 'SimHei' ] # plt.rcParams['font.sans=serif'] = 'sans-serif' plt.rcParams[ 'axes.unicode_minus' ] = False plt.hist(resid,bins = 100 ,normed = True ,color = 'steelblue' ,edgecolor = 'k' ) #设置坐标轴标签和标题 plt.title( '残差直方图' ) plt.ylabel( '密度值' ) #生成正态曲线的数据 x1 = np.linspace(resid. min (),resid. max (), 1000 ) normal = mlab.normpdf(x1,resid.mean(),resid.std()) #绘制正态分布曲线 plt.plot(x1,normal, 'r-' ,linewidth = 2 ,label = '正态分布曲线' ) #生成核密度曲线的数据 kde = mlab.GaussianKDE(resid) x2 = np.linspace(resid. min (),resid. max (), 1000 ) #绘制核密度曲线 plt.plot(x2,kde(x2), 'k-' ,linewidth = 2 ,label = '核密度曲线' ) #去除图形顶部边界和右边界的刻度 plt.tick_params(top = 'off' ,right = 'off' ) #显示图例 plt.legend(loc = 'best' ) #显示图形 plt.show() #生成的正态曲线的数据 pp_qq_plot = sm.ProbPlot(resid) pp_qq_plot.ppplot(line = '45' ) plt.title( 'P-P图' ) pp_qq_plot.qqplot(line = 'q' ) plt.title( 'Q-Q图' ) plt.show() #残差的正态性检验(非参数法) standard_resid = (resid - np.mean(resid)) / np.std(resid) g = stats.kstest(standard_resid, 'norm' ) print (g) # 总结:由于shapiro正态性检验对样本量的需求是5000以内,而本次数据集样本量有9000多,故选择k-s来完成正态性检验。 # 从k-s检验的p值来看,拒绝了残差服从正态分布的假设,即认为残差并不满足正态性假设这个前提。 # 如果残差不服从正态分布的话,建议对Y变量进行box-cox变换处理。 # 由于fit2模型的残差并没有特别明显的偏态(偏度为0.058,接近于0),故这里就不对Y进行变换。 # # import scipy.stats as stats # #找到box-cox变换的Lambda系数 # lamd = stats.boxcox_normmax(vif.y,method = 'mle') # #对y进行变换 # vif['trans_y'] = stats.boxcox(vif.y,lamd) # #建模 # fit3 = sm.formula.ols('y~x1+x2...',data = vif).fit() # fit3.summary() |
以上这篇python3 线性回归验证方法就是小编分享给大家的全部内容了,希望能给大家一个参考,也希望大家多多支持服务器之家。
原文链接:https://blog.csdn.net/SunWuKong_Hadoop/article/details/80254848