Hi everybody,
I have written the following stepwise-MLR code in Python for selecting informative independent variables:
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
import numpy as np
from scipy import stats  # imports hoisted to module level instead of sitting inside the function/loop

def Stepwise_MLR(Xtrain, Ytrain):
    Rows_train, Columns_train = np.shape(Xtrain)
    Ones_train = np.ones((Rows_train, 1))
    A = np.hstack((Ytrain, Xtrain))
    B = np.corrcoef(A, rowvar=False)
    CC = B[1:Columns_train + 1, 0].reshape(Columns_train, 1)
    index_R_max = np.argmax(np.abs(CC))  # index of the variable with the largest |r| (absolute value, so strong negative correlations also qualify)
    var_sel = np.array(index_R_max).reshape(1, 1)  # index set of variables selected at present
    K_selected = 1  # number of variables selected at present
    var_available = np.setdiff1d(np.arange(Columns_train), index_R_max).reshape(Columns_train - 1, 1)  # index set of variables still available for selection
    K_available = Columns_train - 1  # number of variables still available for selection
    # Main routine with entry and exit tests.
    # flag_continue is set to zero once no variable can be either added to or removed from the model.
    flag_continue = 1
    yc = Ytrain - np.mean(Ytrain) * Ones_train
    SS_about_mean = yc.T @ yc  # total sum of squares about the mean
    while flag_continue == 1:
        # Entry phase
        # 1 - Model before adding a candidate variable
        Xbefore = np.hstack((Ones_train, Xtrain[:, var_sel].reshape(Rows_train, var_sel.size)))
        b = np.linalg.solve(Xbefore.T @ Xbefore, Xbefore.T @ Ytrain)  # normal equations; solve() is more stable than an explicit inverse
        yhat = Xbefore @ b
        # Sum of squares due to regression
        yhat_c = yhat - np.mean(Ytrain) * Ones_train
        SSreg_before = yhat_c.T @ yhat_c
        # 2 - Test the inclusion of each candidate variable
        F = np.zeros((1, K_available))
        # Degrees of freedom for estimating the pure error variance in the augmented models
        dof = Rows_train - (K_selected + 1) - 1
        for index in range(K_available):
            # Index of the candidate variable to be included
            var_included = var_available[[index]]
            Xafter = np.hstack((Xbefore, Xtrain[:, var_included].reshape(Rows_train, var_included.size)))
            b = np.linalg.solve(Xafter.T @ Xafter, Xafter.T @ Ytrain)
            yhat = Xafter @ b
            yhat_c = yhat - np.mean(Ytrain) * Ones_train
            SSreg_after = yhat_c.T @ yhat_c
            # Estimate of the pure error variance
            RSS = SS_about_mean - SSreg_after
            variance_hat = RSS / dof
            F[0, index] = (SSreg_after - SSreg_before) / variance_hat
        # Guard: if no candidate variables remain, skip the entry test
        Fmax = np.max(F) if K_available > 0 else -np.inf
        index_Fmax = np.argmax(F) if K_available > 0 else 0
        Fcrit = stats.f.ppf(1 - 0.05, 1, dof)  # critical F at an assumed alpha-to-enter of 0.05
        if Fmax > Fcrit:  # the variable with the largest F-value is added
            var_sel = np.hstack((var_sel, var_available[index_Fmax].reshape(1, 1)))
            var_available = np.setdiff1d(np.arange(Columns_train), var_sel).reshape(-1, 1)
            K_selected = K_selected + 1
            K_available = K_available - 1
            flag_entry = 1  # indicates that a variable has been added to the model
        else:
            flag_entry = 0  # indicates that no variable has been added to the model
        # End of entry phase
        # Exit phase
        # 1 - Model before removing a candidate variable
        Xbefore = np.hstack((Ones_train, Xtrain[:, var_sel].reshape(Rows_train, var_sel.size)))
        b = np.linalg.solve(Xbefore.T @ Xbefore, Xbefore.T @ Ytrain)
        yhat = Xbefore @ b
        yhat_c = yhat - np.mean(Ytrain) * Ones_train
        SSreg_before = yhat_c.T @ yhat_c
        # Estimate of the pure error variance
        RSS = SS_about_mean - SSreg_before
        dof = Rows_train - K_selected - 1
        variance_hat = RSS / dof
        # 2 - Test the removal of each selected variable
        F = np.zeros((1, K_selected))
        for index in range(K_selected):
            var_excluded = var_sel[0, index]  # index of the candidate variable to be excluded
            # Index set of the predictor variables after excluding the variable under test
            var_reduced = np.setdiff1d(var_sel, var_excluded)
            Xreduced = np.hstack((Ones_train, Xtrain[:, var_reduced].reshape(Rows_train, var_reduced.size)))
            b = np.linalg.solve(Xreduced.T @ Xreduced, Xreduced.T @ Ytrain)  # fit the reduced model, not the augmented one
            yhat = Xreduced @ b
            yhat_c = yhat - np.mean(Ytrain) * Ones_train
            SSreg_after = yhat_c.T @ yhat_c
            F[0, index] = (SSreg_before - SSreg_after) / variance_hat
        Fmin = np.min(F)
        index_Fmin = np.argmin(F)
        Fcrit = stats.f.ppf(1 - 0.05, 1, dof)  # critical F at an assumed alpha-to-remove of 0.05
        if Fmin < Fcrit and K_selected > 1:  # the variable with the smallest F-value is removed (keep at least one variable)
            var_sel = np.setdiff1d(var_sel, var_sel[0, index_Fmin]).reshape(1, -1)
            var_available = np.setdiff1d(np.arange(Columns_train), var_sel).reshape(-1, 1)
            K_selected = K_selected - 1
            K_available = K_available + 1
            flag_exit = 1  # indicates that a variable has been removed from the model
        else:
            flag_exit = 0  # indicates that no variable has been removed from the model
        # End of exit phase: stop once neither phase changed the model
        flag_continue = flag_entry or flag_exit
    return var_sel  # indices of the selected independent variables
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
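To check that the function behaves sensibly, here is a minimal usage sketch on synthetic data (the data set, coefficients, noise level, and seed are all made up for illustration; note that Ytrain must be a column vector so that np.hstack((Ytrain, Xtrain)) lines up):
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
import numpy as np

rng = np.random.default_rng(0)
n, p = 100, 8
Xtrain = rng.normal(size=(n, p))
# Only columns 0 and 3 carry signal; the other six are pure noise
Ytrain = 2.0 * Xtrain[:, [0]] - 3.0 * Xtrain[:, [3]] + 0.5 * rng.normal(size=(n, 1))

selected = Stepwise_MLR(Xtrain, Ytrain)
print("Selected variable indices:", np.sort(selected.ravel()))
# Ideally [0 3]; spurious entries are possible since each step tests at alpha = 0.05
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%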