18 November 2021 3 5K Report

I am working on developing an ID3 implementation. The goal of this program is stated in the outline. There are three steps.

Step 1 Calculate the Message conveyed for the given data set.

Step 2 Repeat for every attribute in the dataset.

Given the two datasets

Test Set

```

Venue,color,Model,Category,Location,weight,Veriety,Material,Volume

1,6,4,4,4,1,1,1,6

2,5,4,4,4,2,6,1,1

1,6,2,1,4,1,4,2,4

1,6,2,1,4,1,2,1,2

2,6,5,5,5,2,2,1,2

1,5,4,4,4,1,6,2,2

1,3,3,3,3,1,6,2,2

```

Training Set

```

Venue,color,Model,Category,Location,weight,Veriety,Material,Volume

2,6,4,4,4,2,2,1,1

1,2,4,4,4,1,6,2,6

1,5,4,4,4,1,2,1,6

2,4,4,4,4,2,6,1,4

1,4,4,4,4,1,2,2,2

2,4,3,3,3,2,1,1,1

1,5,2,1,4,1,6,2,6

1,2,3,3,3,1,2,1,6

2,6,4,4,4,2,3,1,1

```

I'd like to loop over the rows and columns of each dataset to calculate the probability for an attribute.

```

from numpy.core.defchararray import count

import pandas as pd

import numpy as np

import numpy as np

from math import ceil, floor, log2

from sklearn.decomposition import PCA

from numpy import linalg as LA

from sklearn.tree import DecisionTreeClassifier

def calculate_metrics(tp, tn, fn, p, n, fp):
    """Compute and display the evaluation metrics of a classifier.

    Calculates the accuracy, error rate, sensitivity, specificity, and
    precision for the selected classifier in reference to the corresponding
    test set, then hands them to ``display_metrics`` for printing.

    Args:
        tp: Number of true positives.
        tn: Number of true negatives.
        fn: Number of false negatives.
        p: Number of positive records (divisor for sensitivity).
        n: Number of negative records (divisor for specificity).
        fp: Number of false positives.

    Raises:
        ZeroDivisionError: With plain Python numbers, when ``p``, ``n``,
            ``p + n`` or ``tp + fp`` is zero.
    """
    total = p + n

    # The sums must be parenthesised: division binds tighter than addition,
    # so ``tp + tn / total`` would only divide ``tn``.
    accuracy = (tp + tn) / total
    error_rate = (fp + fn) / total

    sensitivity = tp / p
    precision = tp / (tp + fp)
    specificity = tn / n

    display_metrics(accuracy, error_rate, sensitivity, precision, specificity)

def display_metrics(accuracy, error_rate, sensitivity, precision, specificity):
    """Print the five classifier metrics on a single comma-separated line."""
    labelled_values = (
        f'Accuracy: {accuracy}',
        f'Error_rate:{error_rate}',
        f'Sensitivity:{sensitivity}',
        f'Precision:{precision}',
        f'specificity:{specificity}',
    )
    print(', '.join(labelled_values))

def mc(columnName, training_set):
    """Return the message conveyed (entropy, in bits) by one column.

    The relative frequency of every distinct value in the column is used as
    its probability ``p``; the result is ``-sum(p * log2(p))``.

    Args:
        columnName: Label of the column to measure.
        training_set: Object indexable by ``columnName`` that yields a pandas
            Series (e.g. a DataFrame).

    Returns:
        The entropy of the column's value distribution; zero when every
        record holds the same value.
    """
    column = training_set[columnName]
    probs = column.value_counts(normalize=True)

    # Categorical columns list unused categories with probability 0, and
    # 0 * log2(0) evaluates to NaN, which would turn the whole sum into NaN.
    # Such values contribute nothing to the entropy, so drop them first.
    probs = probs[probs > 0]

    messageConveyed = -1 * np.sum(np.log2(probs) * probs)
    return messageConveyed

def isNotUnique(s):

a = s.to_numpy() # s.values (pandas= k]

df2 = training_set[training_set[columnName] < k]

print("**********")

print("splitting ")

print(f'df1 {df1}')

print(f'df2 {df2}')

print("**********")

# iii) calculate MC for new splits

# calculate MC for each  attribute of Venue

print("*****************************************")

print("************* iii ***********************")

print(f"calculate MC for new splits")

print(f"calculate MC for each  attribute of {columnName}")

messageConveyed = mc(columnName,training_set)

print(f"MC for {columnName} is {messageConveyed}")

# iv calculculate the weight for each split

# start with venue

print("*****************************************")

print("************* iv  ***********************")

print(f"calculculate the weight for each split ({columnName})")

# Loop

# For each unique value calculate unique_value/total

uniques1 = df1[columnName].unique()

uniques2 = df2[columnName].unique()

total1 = df1[columnName].count()

total2 = df2[columnName].count()

print("*****************************************")

print("*************  v  ***********************")

print(f"calculate the weighted MC (WMC) for the attribute ({columnName})")

print("*****************************************")

print("************* weights for df1  ***********")

print(f"WMC({columnName})")

for unique_value in uniques1:

weight = unique_value/total1

wmc = weight*mc(columnName,df1)

print(f"+= {wmc}")

# v) calculate the weighted MC (WMC) for the attribute

# WMC(venue) = W(1)*MC(1) + W(2)*MC(2)

print("*****************************************")

print("*************  v  ***********************")

print(f"calculate the weighted MC (WMC) for the attribute ({columnName})")

print("*****************************************")

print("************* weights for df2  ***********")

print(f"WMC({columnName})")

for unique_value in uniques2:

weight = unique_value/total2

print(f"{weight} = {unique_value}/{total2}")

messageConveyed = mc(columnName,df2)

wmc += weight*messageConveyed

print(f"{wmc} += {weight}*{messageConveyed}")

# vi) Calculate Gain for the attribute [MC-WMC(venue)]

# Gain(venue) = MC-WMC(venue)

print("*****************************************")

print("*************  vi  **********************")

print(f"Calculate Gain for the {columnName} [{messageConveyed-wmc}]")

gain = messageConveyed-wmc

print(f"gain for {columnName} = {gain}")

# Step 3- Repeat for each split produced by the root

# if all records have the same class then break.

if(isNotUnique(df1[columnName])):

break

if(isNotUnique(df2[columnName])):

break

# Step 4- If every split is free of a mixture of class values, then stop

# expansion of the tree

# # How do we apply prepruning to the data?

# # For post-pruning use the criteria below

# if (N-M)/Q < g*k:

#     # remove subtree

# Step 5- Extract rules in form of if-then-else from the tree

# # true positive

# tp = 0

# # true negative

# tn = 0

# # postive

# p  = 0

# #  negative

# n  = 0

# # false positive

# fp = 0

# calculate_metrics(tp, tn, p, n, fp)

def BayesClassifier():
    """Load the Bayes test and training sets from CSV files.

    The assignment 2 training set for Bayes is meant to be used to classify
    the records of the assignment 2 test set for Bayes. Only the loading
    step exists so far: both files are read from the current working
    directory and nothing is returned.
    """
    test_path = "Assignment 2--Test set for Bayes.csv"
    training_path = "Assignment 2--Training set for Bayes.csv"

    # The test set is read first, then the training set.
    test_set = pd.read_csv(test_path)
    training_set = pd.read_csv(training_path)

# Classifier choice and its parameters. The interactive prompts are disabled
# for now, so fixed values are used instead.
selection = "ID3"  # = input("Please enter your selection for either ID3 or Bayes classification: ")
threshold = 0.9  # = input("Please enter a threshold: ")
g = 0.05  # = input("Please enter a value for g: ")

# Run the selected classifier; any other selection runs nothing.
if selection == "ID3":
    ID3(threshold, g)
if selection == "Bayes":
    BayesClassifier()

```

More Evan Gertis's questions See All
Similar questions and discussions