import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import shap
import eli5
from sklearn.model_selection import train_test_split
#from sklearn.ensemble import RandomForestClassifier
#from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from typing import List, Union, Dict
# Warnings will be used to silence various model warnings for tidier output
import warnings
# NOTE(review): this filter hides every warning category for the whole session,
# including pandas/numpy deprecation and FutureWarning messages.
warnings.filterwarnings('ignore')
# IPython magic (not plain Python): draw matplotlib figures inline in the notebook.
%matplotlib inline
from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"
# Seed NumPy's global random number generator.
np.random.seed(0)
Source:
https://archive.ics.uci.edu/ml/datasets/Statlog+%28German+Credit+Data%29
Professor Dr. Hans Hofmann, Institut für Statistik und Ökonometrie, Universität Hamburg, FB Wirtschaftswissenschaften, Von-Melle-Park 5, 2000 Hamburg 13
This file has been edited and several indicator variables added to make it suitable for algorithms which cannot cope with categorical variables. Several attributes that are ordered categorical (such as attribute 17) have been coded as integer.
# Column names applied to the raw file, in file order; 'CreditStatus' comes last.
feature_list = [
    'CurrentAcc', 'NumMonths', 'CreditHistory', 'Purpose', 'CreditAmount',
    'Savings', 'EmployDuration', 'PayBackPercent', 'Gender', 'Debtors',
    'ResidenceDuration', 'Collateral', 'Age', 'OtherPayBackPlan', 'Property',
    'ExistingCredit', 'Job', 'Dependents', 'Telephone', 'Foreignworker',
    'CreditStatus',
]
# Space-separated file; because `names` is given, the first line is read as data.
german_xai = pd.read_csv(
    'C:/Users/krish/Downloads/german.data.txt',
    sep=' ',
    names=feature_list,
)
german_xai.head()
german_xai.shape
The dataset has 1000 entries with 21 fields.
# Quick look at the loaded object: its type, the first ten rows and the column labels.
type(german_xai)
german_xai.head(10)
german_xai.columns
The fields of the source dataset are listed above.
# Show the dtype pandas inferred for each column.
german_xai.dtypes
The data type of each field is displayed above.
# klib is imported at first use rather than with the imports at the top of the file.
import klib
# Presumably visualises missing values per column of the raw frame.
klib.missingval_plot(german_xai)
First, we map the coded category values of each field to readable labels, following the attribute descriptions provided in the UCI Machine Learning Repository.
Gender field desc:
# Distribution of the raw codes before recoding.
german_xai['Gender'].value_counts()
# Collapse the four raw codes into a binary flag: A91/A93/A94 -> '1', A92 -> '0'.
# This is a DataFrame-level replace, so inplace=True really updates german_xai.
german_xai.replace({'Gender': {'A91': '1', 'A93': '1', 'A94': '1', 'A92': '0'}}, inplace=True)
german_xai['Gender'].value_counts()
# Binarise age: 1 when age >= 26, otherwise 0.
# The original used np.int, an alias removed in NumPy 1.24 (AttributeError);
# a vectorised comparison cast to int gives the same 0/1 values.
german_xai['Age'] = (german_xai['Age'] >= 26).astype(int)
german_xai['Age'].value_counts()
Entries with age greater than or equal to 26 years are encoded as 1; all others are encoded as 0.
# Encode the target field: raw value 1 stays 1, raw value 2 becomes 0.
german_xai.CreditStatus.value_counts()
# Assign the result back. Calling replace(inplace=True) on a column selection
# is chained assignment on a temporary and does not update german_xai under
# pandas Copy-on-Write.
german_xai['CreditStatus'] = german_xai['CreditStatus'].replace({1: 1, 2: 0})
german_xai.CreditStatus.value_counts()
The target field CreditStatus is encoded as 1 = Good, 0 = Bad (positive class); in the raw data 1 = Good, 2 = Bad. Reference: https://aif360.readthedocs.io/en/latest/modules/generated/aif360.datasets.GermanDataset.html#aif360.datasets.GermanDataset
Status of checking account desc:
# Map the raw checking-account codes to labels (A11 and A12 share 'LT200').
# The result is assigned back: replace(inplace=True) on a column selection is
# chained assignment and leaves german_xai unchanged under pandas Copy-on-Write.
german_xai['CurrentAcc'] = german_xai['CurrentAcc'].replace(
    {'A11': 'LT200', 'A12': 'LT200', 'A13': 'GE200', 'A14': 'None'}
)
german_xai.CurrentAcc.value_counts()
Employment duration desc:
# Map the raw employment-duration codes to labels.
# The result is assigned back: replace(inplace=True) on a column selection is
# chained assignment and leaves german_xai unchanged under pandas Copy-on-Write.
german_xai['EmployDuration'] = german_xai['EmployDuration'].replace(
    {'A71': 'unemployed', 'A72': 'LT1', 'A73': '1-4', 'A74': '4-7', 'A75': 'GE7'}
)
german_xai.EmployDuration.value_counts()
Credit History desc:
# Map the raw credit-history codes to labels (A30, A31 and A32 share 'none/paid').
# The result is assigned back: replace(inplace=True) on a column selection is
# chained assignment and leaves german_xai unchanged under pandas Copy-on-Write.
german_xai['CreditHistory'] = german_xai['CreditHistory'].replace(
    {'A30': 'none/paid', 'A31': 'none/paid', 'A32': 'none/paid', 'A33': 'Delay', 'A34': 'other'}
)
german_xai['CreditHistory'].value_counts()
Savings Desc:
# Map the raw savings codes to labels (A61/A62 -> 'LT500', A63/A64 -> 'GT500').
# The result is assigned back: replace(inplace=True) on a column selection is
# chained assignment and leaves german_xai unchanged under pandas Copy-on-Write.
german_xai['Savings'] = german_xai['Savings'].replace(
    {'A61': 'LT500', 'A62': 'LT500', 'A63': 'GT500', 'A64': 'GT500', 'A65': 'none'}
)
german_xai['Savings'].value_counts()
Debtors desc: Other debtors / guarantors
# Map the raw other-debtors/guarantors codes to labels.
# The result is assigned back: replace(inplace=True) on a column selection is
# chained assignment and leaves german_xai unchanged under pandas Copy-on-Write.
german_xai['Debtors'] = german_xai['Debtors'].replace(
    {'A101': 'none', 'A102': 'co-applicant', 'A103': 'guarantor'}
)
german_xai['Debtors'].value_counts()
Collateral desc:
# Map the raw collateral codes to labels.
# The result is assigned back: replace(inplace=True) on a column selection is
# chained assignment and leaves german_xai unchanged under pandas Copy-on-Write.
german_xai['Collateral'] = german_xai['Collateral'].replace(
    {'A121': 'real_estate', 'A122': 'savings/life_insurance', 'A123': 'car/other', 'A124': 'unknown/none'}
)
german_xai['Collateral'].value_counts()
Property: Housing
# Map the raw housing codes to labels.
# The result is assigned back: replace(inplace=True) on a column selection is
# chained assignment and leaves german_xai unchanged under pandas Copy-on-Write.
german_xai['Property'] = german_xai['Property'].replace(
    {'A151': 'rent', 'A152': 'own', 'A153': 'free'}
)
german_xai['Property'].value_counts()
Telephone desc:
Foreign worker
# Recode the two binary fields to 0/1 flags.
# The results are assigned back: replace(inplace=True) on a column selection is
# chained assignment and leaves german_xai unchanged under pandas Copy-on-Write.
german_xai['Foreignworker'] = german_xai['Foreignworker'].replace({'A201': 1, 'A202': 0})
german_xai['Telephone'] = german_xai['Telephone'].replace({'A191': 0, 'A192': 1})
german_xai['Telephone'].value_counts()
german_xai['Foreignworker'].value_counts()
Purpose desc:
# Map the raw loan-purpose codes to labels.
# The result is assigned back: replace(inplace=True) on a column selection is
# chained assignment and leaves german_xai unchanged under pandas Copy-on-Write.
german_xai['Purpose'] = german_xai['Purpose'].replace({
    'A40': 'CarNew',
    'A41': 'CarUsed',
    'A42': 'furniture/equip',
    'A43': 'radio/tv',
    'A44': 'domestic app',
    'A45': 'repairs',
    'A46': 'education',
    'A47': 'vacation',
    'A48': 'retraining',
    'A49': 'biz',
    'A410': 'others',
})
german_xai['Purpose'].value_counts()
Job desc:
# Map the raw job codes to labels.
# The result is assigned back: replace(inplace=True) on a column selection is
# chained assignment and leaves german_xai unchanged under pandas Copy-on-Write.
german_xai['Job'] = german_xai['Job'].replace({
    'A171': 'unemp/unskilled-non_resident',
    'A172': 'unskilled-resident',
    'A173': 'skilled_employee',
    'A174': 'management/self-emp/officer/highly_qualif_emp',
})
german_xai['Job'].value_counts()
Other installment plans desc
# Map the raw other-installment-plan codes to labels.
# The result is assigned back: replace(inplace=True) on a column selection is
# chained assignment and leaves german_xai unchanged under pandas Copy-on-Write.
german_xai['OtherPayBackPlan'] = german_xai['OtherPayBackPlan'].replace(
    {'A141': 'bank', 'A142': 'stores', 'A143': 'none'}
)
german_xai['OtherPayBackPlan'].value_counts()
german_xai.head()
# Fix the column order explicitly, then persist the recoded frame without the index.
column_order = [
    'CurrentAcc', 'NumMonths', 'CreditHistory', 'Purpose', 'CreditAmount',
    'Savings', 'EmployDuration', 'PayBackPercent', 'Gender', 'Debtors',
    'ResidenceDuration', 'Collateral', 'Age', 'OtherPayBackPlan', 'Property',
    'ExistingCredit', 'Job', 'Dependents', 'Telephone', 'Foreignworker',
    'CreditStatus',
]
german_xai = german_xai.reindex(columns=column_order)
german_xai.to_csv('C:/Users/krish/Downloads/German-mapped_upd.csv', index=False)
# Reload the recoded data from disk; the analysis below works on this frame.
German_df = pd.read_csv('C:/Users/krish/Downloads/German-mapped_upd.csv')
print(German_df.shape)
print(German_df.columns)
# Pearson correlation over the numeric columns only. German_df still holds
# string-valued categorical columns, and DataFrame.corr() raises on those in
# pandas >= 2.0 (older versions dropped them silently), so the frame is
# restricted to numeric dtypes first.
corrMatrix = German_df.select_dtypes(include='number').corr().round(1)
corrMatrix
plt.figure(figsize=(15, 15))
sns.heatmap(corrMatrix, annot=True, cmap="Blues")
plt.show()
# klib overview plots on the full frame.
klib.corr_plot(German_df, annot=False)
klib.corr_plot(German_df, target='CreditStatus')
klib.corr_mat(German_df)
klib.cat_plot(German_df)
klib.dist_plot(German_df)
# Bar chart of the binarised Age field (1: age >= 26, 0: age < 26).
# matplotlib.pyplot and numpy are already imported at the top of the file,
# so the redundant re-imports were dropped.
age_count = German_df.Age.value_counts(sort=True)
print(age_count)
plt.figure(figsize=(10, 5))
age_count.plot(kind='bar', color='skyblue', rot=0)
plt.ylabel('Frequency', fontsize=12, color='green')
plt.xlabel('Age', fontsize=12, color='green')
plt.suptitle('Distribution of Age field', fontsize=15, color='orange', fontweight='bold')
# Bars are drawn in value_counts order, so annotations and tick labels are
# derived from that order instead of assuming code 1 is always the first bar.
# The label uses ">=" to match how the field was encoded.
age_labels = {1: "1:Age>=26", 0: "0:Age<26"}
for bar_pos, (code, count) in enumerate(age_count.items()):
    # Put the count in the middle of its bar.
    plt.annotate(str(count), xy=(bar_pos, count / 2),
                 verticalalignment="center", horizontalalignment="center")
LABELS = [age_labels.get(code, str(code)) for code in age_count.index]
plt.xticks(range(len(LABELS)), LABELS)
# Histogram of the credit amount.
plt.figure(figsize=(10, 5))
plt.hist(German_df['CreditAmount'], color='tomato')
plt.xlabel('Credit Amount')
plt.ylabel('Frequency')
plt.suptitle(
    'Distribution of Credit Amount field',
    fontsize=15, color='slategrey', fontweight='bold',
)
# Histogram of the loan duration in months.
plt.figure(figsize=(10, 5))
plt.hist(German_df['NumMonths'], color='tan')
plt.xlabel('Number of Months')
plt.ylabel('Frequency')
plt.suptitle(
    'Distribution of NumMonths field',
    fontsize=15, color='teal', fontweight='bold',
)
# Bar chart of the target field.
target_count = German_df['CreditStatus'].value_counts(sort=True)
print(target_count)
plt.figure(figsize=(10, 5))
target_count.plot(kind='bar', color='gold', rot=0)
axis_label_style = {'fontsize': 12, 'color': 'green'}
plt.xlabel('Credit Status', **axis_label_style)
plt.ylabel('Frequency', **axis_label_style)
plt.suptitle('Distribution of Credit Status field', fontsize=15, color='red', fontweight='bold')
# Counts are written at fixed positions; this assumes the bar for code 1 is
# drawn first (i.e. code 1 is the more frequent value).
annotation_style = {'verticalalignment': "top", 'horizontalalignment': "center"}
plt.annotate(target_count[1], xy=(0, 300), **annotation_style)
plt.annotate(target_count[0], xy=(1, 200), **annotation_style)
LABELS = ["1:Good credit score", "0:Bad credit score"]
plt.xticks(range(2), LABELS)
German_df.Age.describe()
German_df['Gender'].unique()
# Bar chart of the Gender flag.
Gender_count = German_df['Gender'].value_counts()
print(Gender_count)
plt.figure(figsize=(10, 5))
Gender_count.plot(kind='bar', color='pink', rot=0)
axis_label_style = {'fontsize': 12, 'color': 'blue'}
plt.xlabel('Gender', **axis_label_style)
plt.ylabel('Frequency', **axis_label_style)
plt.suptitle('Distribution of Gender field', fontsize=15, color='Green', fontweight='bold')
# Counts are written at fixed positions; this assumes the bar for code 1 is
# drawn first (i.e. code 1 is the more frequent value).
annotation_style = {'verticalalignment': "top", 'horizontalalignment': "center"}
plt.annotate(Gender_count[1], xy=(0, 300), **annotation_style)
plt.annotate(Gender_count[0], xy=(1, 200), **annotation_style)
LABELS = ["1:Male", "0:Female"]
plt.xticks(range(2), LABELS)
# One value-count bar chart per column of German_df.
colour = ['blue', 'pink', 'orange', 'green', 'tan', 'violet', 'olive', 'gold', 'tomato', 'skyblue']
# The palette is cycled with a modulo index. The original zip(columns, colour)
# stopped after len(colour) == 10 columns, silently skipping the remaining fields.
for n, i in enumerate(German_df.columns):
    j = colour[n % len(colour)]
    field_count = German_df[i].value_counts()
    plt.figure(figsize=(10, 5))
    field_count.plot(kind='bar', color=j, rot=0)
    plt.ylabel('Frequency', fontsize=12, color='black')
    plt.xlabel(i, fontsize=12, color='black')
    plt.suptitle('Distribution of ' + i, fontsize=15, color='Green', fontweight='bold')
# One-hot encode the categorical fields.
# dtype=int pins the indicators to 0/1: since pandas 2.0 get_dummies returns
# bool columns by default, which would be written to CSV as True/False.
german_xai = pd.get_dummies(
    German_df,
    columns=['CurrentAcc', 'CreditHistory', 'Purpose', 'Savings', 'EmployDuration',
             'Debtors', 'Collateral', 'OtherPayBackPlan', 'Property', 'Job'],
    dtype=int,
)
german_xai.head()
german_xai.columns
# Fix the column order, with the target 'CreditStatus' last.
# NOTE: reindex drops any column not listed here and creates an all-NaN column
# for any listed name that get_dummies did not produce.
encoded_columns = [
    'NumMonths', 'CreditAmount', 'PayBackPercent', 'Gender',
    'ResidenceDuration', 'Age', 'ExistingCredit', 'Dependents', 'Telephone',
    'Foreignworker', 'CurrentAcc_GE200',
    'CurrentAcc_LT200', 'CurrentAcc_None', 'CreditHistory_Delay',
    'CreditHistory_none/paid', 'CreditHistory_other', 'Purpose_CarNew',
    'Purpose_CarUsed', 'Purpose_biz', 'Purpose_domestic app',
    'Purpose_education', 'Purpose_furniture/equip', 'Purpose_others',
    'Purpose_radio/tv', 'Purpose_repairs', 'Purpose_retraining',
    'Savings_GT500', 'Savings_LT500', 'Savings_none', 'EmployDuration_1-4',
    'EmployDuration_4-7', 'EmployDuration_GE7', 'EmployDuration_LT1',
    'EmployDuration_unemployed', 'Debtors_co-applicant',
    'Debtors_guarantor', 'Debtors_none', 'Collateral_car/other',
    'Collateral_real_estate', 'Collateral_savings/life_insurance',
    'Collateral_unknown/none', 'OtherPayBackPlan_bank',
    'OtherPayBackPlan_none', 'OtherPayBackPlan_stores', 'Property_free',
    'Property_own', 'Property_rent',
    'Job_management/self-emp/officer/highly_qualif_emp',
    'Job_skilled_employee', 'Job_unemp/unskilled-non_resident',
    'Job_unskilled-resident', 'CreditStatus',
]
german_xai = german_xai.reindex(columns=encoded_columns)
german_xai.head()
# Min-max scaling is used since the field is not normally distributed.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
# NOTE(review): the scaler is fitted on every row, i.e. before the train/test
# split further down; confirm that this is intended.
credit_amount_scaled = scaler.fit_transform(german_xai[['CreditAmount']])
german_xai[['CreditAmount']] = credit_amount_scaled
german_xai.head()
# Persist the encoded and scaled frame without the index.
german_xai.to_csv('C:/Users/krish/Downloads/German-encoded_upd.csv', index=False)
# Features are every column except the last one; the target is 'CreditStatus'.
X = german_xai.iloc[:, :-1]
y = german_xai.CreditStatus
X.head()
y.head()
# 80/20 split, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=40,
    stratify=y,
)
german_xai.dtypes
german_xai.shape
import klib
# Missing-value plots for the features and the target.
klib.missingval_plot(X)
klib.missingval_plot(y)
from sklearn.feature_selection import mutual_info_classif
# Mutual-information score of each training feature against the target, with a fixed seed.
mutual_info = mutual_info_classif(X=X_train, y=y_train, random_state=40)
mutual_info
Estimate mutual information for a discrete target variable.
Mutual information (MI) [1] between two random variables is a non-negative value, which measures the dependency between the variables. It is equal to zero if and only if two random variables are independent, and higher values mean higher dependency.
X.columns
# Label each score with its feature name, then show and plot them in descending order.
mutual_info = pd.Series(mutual_info, index=X_train.columns)
mutual_info.sort_values(ascending=False)
mutual_info.sort_values(ascending=False).plot(kind='bar', figsize=(15, 5))
We select the ten features with the highest dependency on the target variable CreditStatus, together with the protected variables under consideration (Age and Gender; marital status is not kept as a separate column in this dataset).
# The ten highest mutual-information scores.
mutual_info.sort_values(ascending=False).head(10)
# Hand-picked feature subset, plus the protected fields and the target.
selected_columns = [
    'CurrentAcc_None',
    'NumMonths',
    'CurrentAcc_LT200',
    'CreditHistory_Delay',
    'CreditHistory_none/paid',
    'Collateral_savings/life_insurance',
    'CurrentAcc_GE200',
    'Purpose_repairs',
    'CreditAmount',
    'Purpose_radio/tv',
    'Gender', 'Age', 'CreditStatus',
]
german_xai_imp = german_xai[selected_columns]
german_xai_imp.head()
german_xai_imp.dtypes
# Correlation of the selected features, rounded to one decimal.
corrMatrix = german_xai_imp.corr().round(1)
corrMatrix
klib.corr_plot(german_xai_imp, annot=False)
# Same matrix again under a second name, drawn as an annotated heatmap.
corrMatrix1 = german_xai_imp.corr().round(1)
corrMatrix1
plt.figure(figsize=(15, 15))
sns.heatmap(corrMatrix1, cmap="Blues", annot=True)
plt.show()
# Remove two of the selected features, then re-check the correlations.
german_upd = german_xai_imp.drop(columns=['CurrentAcc_LT200', 'CreditAmount'])
german_upd
corrMatrix2 = german_upd.corr().round(1)
corrMatrix2
plt.figure(figsize=(15, 15))
sns.heatmap(corrMatrix2, cmap="Blues", annot=True)
plt.show()
No high correlation is observed among the input variables (except between gender and marital status (0.7) and between credit amount and number of months (0.6)), or between the target variable and the input variables. Since we are trying to understand the impact of the protected variables, we retain them rather than dropping them.
# Persist the reduced feature set without the index.
german_upd.to_csv('C:/Users/krish/Downloads/German-reduced_upd.csv', index=False)
from IPython.display import Image
# Display a local image file in the notebook; the file name says it lists protected variables.
Image(filename='C:/Users/krish/Desktop/MAIN PJT/list of protected variables.png',width=500,height=30)
From the above, we have 2 protected fields in our dataset:
1. Gender
2. Age
Now, let us identify the privileged class in each protected attribute.
# Group sizes and mean CreditStatus per Gender value.
print(german_upd.Gender.value_counts())
gender_groups = german_upd.groupby(['Gender'])
gender_groups['CreditStatus'].mean()
#https://arxiv.org/pdf/1810.01943.pdf, https://arxiv.org/pdf/2005.12379.pdf
Males (1) outnumber females (0), and the mean of the target variable CreditStatus is higher for the male group than for the female group, i.e. the outcome is more favorable for males. Hence male (1) is the privileged class.
# Group sizes and mean CreditStatus per Age flag.
print(german_upd.Age.value_counts())
age_groups = german_upd.groupby(['Age'])
age_groups['CreditStatus'].mean()
Age is encoded as 1 for age >= 26 and 0 otherwise. People aged 26 or above are more numerous, and their group mean of CreditStatus is higher than that of the under-26 group, so Age = 1 is the privileged group.
# Column labels of the reduced frame.
german_upd.columns