# Shantanu's Blog

Corporate Consultant

## Gini index calculation

Here is a function that will calculate the weighted gini index for a given feature.

import pandas as pd

url = "https://raw.githubusercontent.com/bharat-patidar/Decision-trees/master/data/films.csv"

# gini_calculate() below reads a module-level DataFrame named `films`,
# so the CSV has to be loaded before the function is called.
films = pd.read_csv(url)

def gini_calculate(node='gender', data=None):
    """Return the weighted gini score of ``watching`` after splitting on ``node``.

    For every distinct value of ``node`` the share ``p`` of rows labelled
    ``'yes'`` in the ``watching`` column is turned into
    ``p**2 + (1 - p)**2``. The per-value scores are then summed, each one
    weighted by the fraction of all rows that carry that value.

    Args:
        node: Name of the column to split on.
        data: DataFrame containing a ``watching`` column and the ``node``
            column. When omitted, the module-level ``films`` frame is used.

    Returns:
        The weighted score as a float.

    Raises:
        KeyError: If a required column is missing or ``watching`` never
            takes the value ``'yes'``.
    """
    frame = films if data is None else data

    # Rows: watching label, columns: values of `node`, cells: row counts.
    # fill_value=0 matters: without it a node value that never occurs with
    # one of the labels becomes NaN and the whole result turns into NaN.
    counts = frame.groupby(['watching', node])[node].count().unstack(fill_value=0)

    group_sizes = counts.sum(axis=0)
    yes_share = counts.loc['yes'] / group_sizes
    scores = yes_share ** 2 + (1 - yes_share) ** 2
    weights = group_sizes / group_sizes.sum()

    # Built-in sum keeps the left-to-right accumulation of the original loop.
    return sum(weights * scores)

# Print the weighted gini score for each candidate split feature.
for feature in ('employment_status', 'gender'):
    print(gini_calculate(node=feature))

>>> 0.5033062330623306
>>> 0.522077922077922

# Here "gini" is the purity score p^2 + (1 - p)^2, so a higher value means a purer split. Since weighted gini(gender) > weighted gini(employment_status), the node is split on gender.

_____

Function to calculate entropy:

from math import log, e

import numpy as np
import scipy.stats
def entropy3(labels, base=None):
    """Return the Shannon entropy of the label distribution in ``labels``.

    Args:
        labels: Sequence of class labels; it is wrapped in a pandas Series
            and reduced to relative frequencies.
        base: Logarithm base. ``None`` selects the natural logarithm.

    Returns:
        The entropy as a float, expressed in units of ``base``.
    """
    probabilities = pd.Series(labels).value_counts(normalize=True)
    log_base = e if base is None else base
    entropy = -(probabilities * np.log(probabilities) / np.log(log_base)).sum()
    # When every label is identical the sum is 0 and the negation yields
    # -0.0; adding 0.0 normalises that to a plain 0.0.
    return entropy + 0.0

def ent(data):
    """Print the relative frequency of each value in ``data`` and return its entropy.

    Args:
        data: Object exposing ``value_counts()`` and ``len()``
            (presumably a pandas Series).

    Returns:
        The result of ``scipy.stats.entropy`` applied to the relative
        frequencies (natural-log base, since no base is passed).
    """
    proportions = data.value_counts().div(len(data))
    print(proportions)
    return scipy.stats.entropy(proportions)

Labels: