I am working on developing an ID3 implementation. The goal of this program is stated in the outline. There are three steps.
Step 1: Calculate the message conveyed (MC) for the given data set.
Step 2: Repeat for every attribute in the dataset.
Given the two datasets
Test Set
```
Venue,color,Model,Category,Location,weight,Veriety,Material,Volume
1,6,4,4,4,1,1,1,6
2,5,4,4,4,2,6,1,1
1,6,2,1,4,1,4,2,4
1,6,2,1,4,1,2,1,2
2,6,5,5,5,2,2,1,2
1,5,4,4,4,1,6,2,2
1,3,3,3,3,1,6,2,2
```
Training Set
```
Venue,color,Model,Category,Location,weight,Veriety,Material,Volume
2,6,4,4,4,2,2,1,1
1,2,4,4,4,1,6,2,6
1,5,4,4,4,1,2,1,6
2,4,4,4,4,2,6,1,4
1,4,4,4,4,1,2,2,2
2,4,3,3,3,2,1,1,1
1,5,2,1,4,1,6,2,6
1,2,3,3,3,1,2,1,6
2,6,4,4,4,2,3,1,1
```
I'd like to loop over the rows and columns of each dataset to calculate the probability for an attribute.
```
from numpy.core.defchararray import count
import pandas as pd
import numpy as np
import numpy as np
from math import ceil, floor, log2
from sklearn.decomposition import PCA
from numpy import linalg as LA
from sklearn.tree import DecisionTreeClassifier
def calculate_metrics(tp, tn, fn, p, n, fp):
    """Compute the evaluation metrics of a classifier and display them.

    Calculates the accuracy, error rate, sensitivity, precision and
    specificity for the selected classifier in reference to the
    corresponding test set, then passes them to ``display_metrics``.

    Args:
        tp: Number of true positives.
        tn: Number of true negatives.
        fn: Number of false negatives.
        p: Number of positive records.
        n: Number of negative records.
        fp: Number of false positives.

    Raises:
        ZeroDivisionError: With plain Python numbers, when ``p``, ``n``,
            ``p + n`` or ``tp + fp`` is zero.
    """
    total = p + n
    # The numerators must be parenthesized: `tp + tn / total` would divide
    # only `tn` by the total because `/` binds tighter than `+`.
    accuracy = (tp + tn) / total
    error_rate = (fp + fn) / total
    sensitivity = tp / p
    precision = tp / (tp + fp)
    specificity = tn / n
    display_metrics(accuracy, error_rate, sensitivity, precision, specificity)
def display_metrics(accuracy, error_rate, sensitivity, precision, specificity):
    """Print the five evaluation metrics on one line of standard output.

    Args:
        accuracy: Value shown after ``Accuracy:``.
        error_rate: Value shown after ``Error_rate:``.
        sensitivity: Value shown after ``Sensitivity:``.
        precision: Value shown after ``Precision:``.
        specificity: Value shown after ``specificity:``.
    """
    report = f'Accuracy: {accuracy}, Error_rate:{error_rate}, Sensitivity:{sensitivity}, Precision:{precision}, specificity:{specificity}'
    print(report)
def mc(columnName,training_set):
    """Return the message conveyed (entropy in bits) by one column.

    Args:
        columnName: Label of the column to measure.
        training_set: Table indexed by column label whose columns offer
            ``value_counts`` (a pandas DataFrame in this file).

    Returns:
        The negated sum of ``p * log2(p)``, where ``p`` ranges over the
        relative frequencies of the distinct values in the column.
    """
    relative_freq = training_set[columnName].value_counts(normalize=True)
    weighted_logs = np.log2(relative_freq) * relative_freq
    return -weighted_logs.sum()
def isNotUnique(s):
a = s.to_numpy() # s.values (pandas= k]
# NOTE(review): this is the visible tail of a function body whose header,
# enclosing loop(s) and the definitions of `training_set`, `columnName`, `k`
# and `df1` are not shown here (the preceding line is truncated). Indentation
# has also been lost, so the extent of each loop body below must be confirmed.
# Rows of the training set whose value in this column is below the split point k.
df2 = training_set[training_set[columnName] < k]
print("**********")
print("splitting ")
print(f'df1 {df1}')
print(f'df2 {df2}')
print("**********")
# iii) calculate MC for new splits
# calculate MC for each attribute of Venue
print("*****************************************")
print("************* iii ***********************")
print(f"calculate MC for new splits")
print(f"calculate MC for each attribute of {columnName}")
# MC of the column over the whole (unsplit) training set.
messageConveyed = mc(columnName,training_set)
print(f"MC for {columnName} is {messageConveyed}")
# iv) calculate the weight for each split
# start with venue
print("*****************************************")
print("************* iv ***********************")
print(f"calculculate the weight for each split ({columnName})")
# Loop
# For each unique value calculate unique_value/total
# Distinct values and non-null row counts of the column in each split.
uniques1 = df1[columnName].unique()
uniques2 = df2[columnName].unique()
total1 = df1[columnName].count()
total2 = df2[columnName].count()
print("*****************************************")
print("************* v ***********************")
print(f"calculate the weighted MC (WMC) for the attribute ({columnName})")
print("*****************************************")
print("************* weights for df1 ***********")
print(f"WMC({columnName})")
for unique_value in uniques1:
# NOTE(review): the numerator is the attribute value itself, not the number
# of rows holding that value -- confirm this is the intended weight.
weight = unique_value/total1
# NOTE(review): plain assignment; if this line sits inside the loop, each
# pass replaces wmc instead of accumulating it -- confirm.
wmc = weight*mc(columnName,df1)
print(f"+= {wmc}")
# v) calculate the weighted MC (WMC) for the attribute
# WMC(venue) = W(1)*MC(1) + W(2)*MC(2)
print("*****************************************")
print("************* v ***********************")
print(f"calculate the weighted MC (WMC) for the attribute ({columnName})")
print("*****************************************")
print("************* weights for df2 ***********")
print(f"WMC({columnName})")
for unique_value in uniques2:
# NOTE(review): same value-over-count weight as in the df1 loop -- confirm.
weight = unique_value/total2
print(f"{weight} = {unique_value}/{total2}")
# NOTE(review): rebinds messageConveyed, which previously held the MC of the
# whole training set, to the MC of df2; later uses see this newer value.
messageConveyed = mc(columnName,df2)
wmc += weight*messageConveyed
print(f"{wmc} += {weight}*{messageConveyed}")
# vi) Calculate Gain for the attribute [MC-WMC(venue)]
# Gain(venue) = MC-WMC(venue)
print("*****************************************")
print("************* vi **********************")
print(f"Calculate Gain for the {columnName} [{messageConveyed-wmc}]")
# NOTE(review): if the df2 loop ran, messageConveyed here is MC(df2) rather
# than the MC of the full training set computed in step iii -- confirm.
gain = messageConveyed-wmc
print(f"gain for {columnName} = {gain}")
# Step 3- Repeat for each split produced by the root
# if all records have the same class then break.
# NOTE(review): `break` requires an enclosing loop, which is not visible here;
# the body of isNotUnique is also cut off, so what it tests cannot be verified.
if(isNotUnique(df1[columnName])):
break
if(isNotUnique(df2[columnName])):
break
# Step 4- If every split is free of a mixture of class values, then stop
# expansion of the tree
# # How do we apply prepruning to the data?
# # For post-pruning use the criteria below
# if (N-M)/Q < g*k:
# # remove subtree
# Step 5- Extract rules in form of if-then-else from the tree
# # true positive
# tp = 0
# # true negative
# tn = 0
# # positive
# p = 0
# # negative
# n = 0
# # false positive
# fp = 0
# calculate_metrics(tp, tn, p, n, fp)
def BayesClassifier():
    """Load the Bayes test and training CSV files from the working directory.

    Intended use: the assignment 2 training set for Bayes serves as the
    training set to classify the records of the assignment 2 test set for
    Bayes. Only the loading is implemented; the frames are not used further
    and nothing is returned.
    """
    test_path = "Assignment 2--Test set for Bayes.csv"
    training_path = "Assignment 2--Training set for Bayes.csv"
    # The test file is read first, then the training file.
    test_records = pd.read_csv(test_path)
    training_records = pd.read_csv(training_path)
# Classifier selection and its parameters. The values are hard-coded for now;
# each one stands in for the interactive prompt noted beside it.
#   selection <- input("Please enter your selection for either ID3 or Bayes classification: ")
#   threshold <- input("Please enter a threshold: ")
#   g         <- input("Please enter a value for g: ")
selection = "ID3"
threshold = 0.9
g = 0.05

# Run the classifier that matches the selection; any other value runs nothing.
if selection == "ID3":
    ID3(threshold, g)
elif selection == "Bayes":
    BayesClassifier()
```