Hi everybody

I have written the following stepwise MLR code in Python for selecting informative independent variables:
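For context on what the code tests: at each entry step it computes, for every candidate variable, the partial F-statistic

    F = (SSreg_after - SSreg_before) / variance_hat,   with variance_hat = RSS / dof

i.e. the increase in the regression sum of squares when the candidate is added, divided by the pure-error variance estimate of the augmented model, and it admits the best candidate only if the largest F exceeds the critical value of an F(1, dof) distribution. The exit step applies the same test in reverse to each variable already in the model.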

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

def Stepwise_MLR(Xtrain, Ytrain):

    import numpy as np
    from scipy import stats

    Rows_train, Columns_train = np.shape(Xtrain)

    Ones_train = np.ones((Rows_train, 1))

    # Correlation of each predictor with the response
    # (column 0 of A holds Ytrain, so B[1:, 0] contains the r-values)
    A = np.hstack((Ytrain, Xtrain))
    B = np.corrcoef(A, rowvar=False)
    CC = B[1:Columns_train + 1, 0].reshape(Columns_train, 1)

    # Index of the variable with the largest correlation with the response
    # (in absolute value, so strong negative correlations are not missed)
    index_R_max = np.argmax(np.abs(CC))

    var_sel = np.array(index_R_max).reshape(1, 1)  # index set of variables selected at present
    K_selected = 1  # number of variables selected at present

    # Index set of variables still available for selection
    var_available = np.setdiff1d(np.arange(Columns_train), index_R_max)
    K_available = Columns_train - 1  # number of variables still available for selection

    alpha = 0.05  # significance level for the partial F-tests

    # Main routine with entry and exit tests.
    # flag_continue is set to zero when no more variables can be either
    # added to or removed from the model.
    flag_continue = 1

    yc = Ytrain - np.mean(Ytrain) * Ones_train
    SS_about_mean = yc.T @ yc  # total sum of squares about the mean

    while flag_continue == 1:

        # ----- Entry phase -----

        # 1 - Model before adding a candidate variable
        Xbefore = np.hstack((Ones_train,
                             Xtrain[:, var_sel].reshape(Rows_train, var_sel.size)))
        b = np.linalg.inv(Xbefore.T @ Xbefore) @ Xbefore.T @ Ytrain
        yhat = Xbefore @ b

        # Evaluation of the sum of squares due to regression
        yhat_c = yhat - np.mean(Ytrain) * Ones_train
        SSreg_before = yhat_c.T @ yhat_c

        # 2 - Test the inclusion of each candidate variable
        # Degrees of freedom for estimating the pure error variance in the augmented models
        dof = Rows_train - (K_selected + 1) - 1

        flag_entry = 0  # set to 1 below if a variable is added to the model
        if K_available > 0:
            F = np.zeros((1, K_available))
            for index in range(0, K_available):
                var_included = var_available[[index]]  # candidate variable to be included
                Xafter = np.hstack((Xbefore,
                                    Xtrain[:, var_included].reshape(Rows_train, var_included.size)))
                b = np.linalg.inv(Xafter.T @ Xafter) @ Xafter.T @ Ytrain
                yhat = Xafter @ b
                yhat_c = yhat - np.mean(Ytrain) * Ones_train
                SSreg_after = yhat_c.T @ yhat_c

                # Estimate of the pure error variance
                RSS = SS_about_mean - SSreg_after
                variance_hat = RSS / dof

                F[0, index] = (SSreg_after - SSreg_before) / variance_hat

            Fmax = np.max(F)
            index_Fmax = np.argmax(F)
            Fcrit = stats.f.ppf(1 - alpha, 1, dof)

            if Fmax > Fcrit:
                # The variable with the largest F-value is added
                var_sel = np.hstack((var_sel, var_available[[index_Fmax]].reshape(1, 1)))
                var_available = np.setdiff1d(np.arange(Columns_train), var_sel)
                K_selected = K_selected + 1
                K_available = K_available - 1
                flag_entry = 1

        # ----- End of entry phase -----

        # ----- Exit phase -----

        # 1 - Full model before removing a candidate variable
        Xfull = np.hstack((Ones_train,
                           Xtrain[:, var_sel].reshape(Rows_train, var_sel.size)))
        b = np.linalg.inv(Xfull.T @ Xfull) @ Xfull.T @ Ytrain
        yhat = Xfull @ b
        yhat_c = yhat - np.mean(Ytrain) * Ones_train
        SSreg_full = yhat_c.T @ yhat_c

        # Estimate of the pure error variance
        RSS = SS_about_mean - SSreg_full
        dof = Rows_train - K_selected - 1
        variance_hat = RSS / dof

        # 2 - Test the removal of each selected variable
        F = np.zeros((1, K_selected))

        for index in range(0, K_selected):
            var_excluded = var_sel[0, index]  # candidate variable to be excluded
            # Index set of the predictor variables after excluding the variable under test
            var_reduced = np.setdiff1d(var_sel, var_excluded)
            Xreduced = np.hstack((Ones_train,
                                  Xtrain[:, var_reduced].reshape(Rows_train, var_reduced.size)))
            b = np.linalg.inv(Xreduced.T @ Xreduced) @ Xreduced.T @ Ytrain
            yhat = Xreduced @ b
            yhat_c = yhat - np.mean(Ytrain) * Ones_train
            SSreg_reduced = yhat_c.T @ yhat_c

            F[0, index] = (SSreg_full - SSreg_reduced) / variance_hat

        Fmin = np.min(F)
        index_Fmin = np.argmin(F)

        Fcrit = stats.f.ppf(1 - alpha, 1, dof)

        flag_exit = 0  # set to 1 below if a variable is removed from the model
        if Fmin < Fcrit and K_selected > 1:
            # The variable with the smallest F-value is removed
            # (at least one variable is always kept in the model)
            var_sel = np.delete(var_sel, index_Fmin, axis=1)
            var_available = np.setdiff1d(np.arange(Columns_train), var_sel)
            K_selected = K_selected - 1
            K_available = K_available + 1
            flag_exit = 1

        # ----- End of exit phase -----

        # Stop when neither phase changed the model
        if flag_entry == 0 and flag_exit == 0:
            flag_continue = 0

    return var_sel  # indices of the selected variables, shape (1, K_selected)

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
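If it helps, here is a minimal usage sketch on synthetic data (the synthetic setup below is just my own illustration; the function assumes Ytrain is a column vector of shape (n, 1)):

import numpy as np

# Illustrative synthetic data: 50 samples, 10 candidate predictors,
# with only columns 0, 3 and 7 actually driving the response
rng = np.random.default_rng(0)
Xtrain = rng.normal(size=(50, 10))
Ytrain = (2.0 * Xtrain[:, [0]]
          - 1.5 * Xtrain[:, [3]]
          + 0.8 * Xtrain[:, [7]]
          + 0.1 * rng.normal(size=(50, 1)))

var_sel = Stepwise_MLR(Xtrain, Ytrain)
print("Selected variable indices:", var_sel.ravel())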

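As a rough sanity check you can compare against scikit-learn's greedy forward selector on the same data. Note this is a different criterion (it ranks candidates by cross-validated score, not by partial F-tests), so the two selections will not always coincide; n_features_to_select=3 is chosen here only because the synthetic example above has three true predictors:

from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LinearRegression

sfs = SequentialFeatureSelector(LinearRegression(), n_features_to_select=3,
                                direction='forward')
sfs.fit(Xtrain, Ytrain.ravel())
print("sklearn forward selection:", sfs.get_support(indices=True))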