
Machine Learning A-Z, Part 3: Multiple Linear Regression

Multiple Linear Regression

Theory:

Assumptions of linear regression (linearity, homoscedasticity, multivariate normality, independence of errors, no multicollinearity)

Dummy variables: categorical features are encoded as 0/1 indicator columns

Dummy variable trap: always drop one dummy column to remove the linear dependence (see the model equation below)

Five ways to build a model: all-in, backward elimination, forward selection, bidirectional elimination, and score comparison (all possible models)

Backward elimination is the most commonly used
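In symbols, the fitted model is one linear combination of all the predictors; a minimal sketch in standard notation (not spelled out in the original notes):

y = b_0 + b_1 x_1 + b_2 x_2 + ... + b_n x_n

The trap is visible right here: if a three-level category is one-hot encoded as D_1, D_2, D_3, then D_1 + D_2 + D_3 = 1 on every row, which exactly duplicates the constant column behind b_0 -- perfect multicollinearity. Dropping one dummy removes the dependence.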

Python implementation

Importing the libraries and the data
# Multiple Linear Regression

# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Importing the dataset
dataset = pd.read_csv('50_Startups.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 4].values
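For orientation (column names as in the course's 50_Startups.csv -- worth verifying against your copy): the five columns are R&D Spend, Administration, Marketing Spend, State, Profit, so column 4 (Profit) is the dependent variable and column 3 (State) is the only categorical feature. A quick check:

print(dataset.head())    # R&D Spend, Administration, Marketing Spend, State, Profit
print(dataset.dtypes)    # State should be the only non-numeric column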
Encoding categorical data
#Encoding categorical data: the State column (index 3).
#OneHotEncoder's categorical_features argument was removed from scikit-learn,
#so we use a ColumnTransformer instead; like the old code, it places the
#encoded dummy columns first.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer([('state', OneHotEncoder(), [3])], remainder = 'passthrough')
X = np.array(ct.fit_transform(X), dtype = float)

#Dummy variable trap: drop one dummy column
X = X[:,1:]
Splitting into training and test sets
# Splitting the dataset into the training set and test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
Feature scaling

Feature scaling is not needed here: the LinearRegression class handles everything for us (ordinary least squares is insensitive to feature scale), just as in the simple linear regression lesson, so we don't have to write it ourselves

Importing the linear regression class and creating the regressor
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)
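Once fitted, the learned parameters can be read directly off the regressor; intercept_ and coef_ are standard scikit-learn attributes:

print(regressor.intercept_)  # b0, the constant term
print(regressor.coef_)       # one coefficient per column of X_train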
Predicting the test set
#Predicting the dependent variable on the test set
y_pred = regressor.predict(X_test)
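To see how good the fit is, one simple check (not in the original notes) is to put the predictions next to the true values and compute R^2:

from sklearn.metrics import r2_score
print(np.column_stack((y_pred, y_test)))  # predicted vs. actual profit, side by side
print(r2_score(y_test, y_pred))           # share of variance explained on the test set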
Backward elimination
#Backward elimination:
#1. Choose a significance level as the threshold (e.g. SL = 0.05)
#2. The higher a variable's p-value, the lower its statistical significance
#3. If a variable's p-value exceeds the threshold, eliminate it, refit, and repeat

# statsmodels' OLS does not add a constant term itself,
# so prepend a column of ones to serve as the intercept
import statsmodels.api as sm
X_train = np.append(arr = np.ones((X_train.shape[0], 1)), values = X_train, axis = 1)

X_opt = X_train[:, [0, 1, 2, 3, 4, 5]]  # start with all columns
regressor_OLS = sm.OLS(endog = y_train, exog = X_opt).fit()
regressor_OLS.summary()
#Remove the variable with the highest p-value: x2, the New York dummy

X_opt = X_train[:, [0, 1, 3, 4, 5]]  # x2 removed
regressor_OLS = sm.OLS(endog = y_train, exog = X_opt).fit()
regressor_OLS.summary()
#Now remove x1, the variable with the highest p-value

X_opt = X_train[:, [0, 3, 4, 5]]  # x1 removed
regressor_OLS = sm.OLS(endog = y_train, exog = X_opt).fit()
regressor_OLS.summary()
#x2 (in the new numbering) again has the highest p-value

X_opt = X_train[:, [0, 3, 5]]  # a p-value of 0.71 -- remove it too? Yes, the significance threshold is 0.05
regressor_OLS = sm.OLS(endog = y_train, exog = X_opt).fit()
regressor_OLS.summary()

X_opt = X_train[:, [0, 3]]  # only the constant and column 3 remain
regressor_OLS = sm.OLS(endog = y_train, exog = X_opt).fit()
regressor_OLS.summary()
#What is column 3? The R&D spend
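The five manual rounds above can also be wrapped in a loop. A minimal sketch under the same setup (the helper name backward_elimination is mine, not from the course); note it treats the intercept column like any other candidate:

import numpy as np
import statsmodels.api as sm

def backward_elimination(X, y, sl = 0.05):
    # Refit OLS, drop the column with the highest p-value,
    # and repeat until every remaining p-value is <= sl.
    cols = list(range(X.shape[1]))
    while True:
        model = sm.OLS(endog = y, exog = X[:, cols]).fit()
        worst = int(np.argmax(model.pvalues))
        if model.pvalues[worst] > sl and len(cols) > 1:
            del cols[worst]   # indices in cols refer to columns of the original X
        else:
            return cols, model

#kept, final_model = backward_elimination(X_train, y_train)
#print(kept)                    # surviving column indices, e.g. [0, 3]
#print(final_model.summary())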

(Screenshots of the regressor_OLS.summary() output at each elimination step.)

Full code
# Multiple Linear Regression

# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Importing the dataset
dataset = pd.read_csv('50_Startups.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 4].values

#Encoding categorical data: the State column (index 3).
#categorical_features was removed from OneHotEncoder, so use a ColumnTransformer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer([('state', OneHotEncoder(), [3])], remainder = 'passthrough')
X = np.array(ct.fit_transform(X), dtype = float)

#Dummy variable trap: drop one dummy column
X = X[:,1:]


# Splitting the dataset into the training set and test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

#Feature scaling is not needed; the regression library handles it
# Data preprocessing is done

#Importing the linear regression class and creating the regressor
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)

#Predicting the test set results
y_pred = regressor.predict(X_test)

#Feature selection via backward elimination
#Preparation: OLS in statsmodels does not include a constant term,
#so append a column of ones to serve as the intercept
import statsmodels.api as sm
X_train = np.append(arr = np.ones((X_train.shape[0], 1)), values = X_train, axis = 1)
#Backward elimination:
#1. Choose a significance level as the threshold (e.g. SL = 0.05)
#2. The higher a variable's p-value, the lower its statistical significance
#3. If a variable's p-value exceeds the threshold, eliminate it, refit, and repeat

X_opt = X_train[:, [0, 1, 2, 3, 4, 5]]  # start with all columns
regressor_OLS = sm.OLS(endog = y_train, exog = X_opt).fit()
regressor_OLS.summary()
#Remove the variable with the highest p-value: x2, the New York dummy

X_opt = X_train[:, [0, 1, 3, 4, 5]]  # x2 removed
regressor_OLS = sm.OLS(endog = y_train, exog = X_opt).fit()
regressor_OLS.summary()
#Now remove x1, the variable with the highest p-value

X_opt = X_train[:, [0, 3, 4, 5]]  # x1 removed
regressor_OLS = sm.OLS(endog = y_train, exog = X_opt).fit()
regressor_OLS.summary()
#x2 (in the new numbering) again has the highest p-value

X_opt = X_train[:, [0, 3, 5]]  # a p-value of 0.71 -- remove it too? Yes, the significance threshold is 0.05
regressor_OLS = sm.OLS(endog = y_train, exog = X_opt).fit()
regressor_OLS.summary()

X_opt = X_train[:, [0, 3]]  # only the constant and column 3 remain
regressor_OLS = sm.OLS(endog = y_train, exog = X_opt).fit()
regressor_OLS.summary()
#What is column 3? The R&D spend
Author: Chen
Link: https://vccyb.gitee.io/myblog/2020/03/31/ML/ML-3/
License: Unless otherwise stated, all posts on this blog are licensed under CC BY-NC-SA 4.0. Please credit 东川 when republishing.