import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
import numpy as np
import seaborn
from Oaxaca import Oaxaca
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_validate
from sklearn.pipeline import Pipeline
df_big=pd.read_csv("../data/Big_Data.csv")
df_big['income']=df_big['income']*10000
df_big['card']=df_big['card'].map({
    "yes":1,
    "no":0
})
display(df_big.head())
df_big.dtypes

card             int64
reports          int64
age            float64
income         float64
share          float64
expenditure    float64
owner           object
selfemp         object
dependents       int64
months           int64
majorcards       int64
active           int64
dtype: object


plt.hist(df_big['reports'],bins=5)
plt.ylabel('Frequency')
plt.xlabel("Number of Defaults in History")
plt.title('Default Rate Distribution')

Text(0.5, 1.0, 'Default Rate Distribution')


plt.bar(df_big.groupby('reports')['card'].mean().index,df_big.groupby('reports')['card'].mean().values)
plt.ylabel("Avg Approval Rate")
plt.xlabel("Number of Defaults")
plt.title("Approval Rate Based on Number of Defaults")

Text(0.5, 1.0, 'Approval Rate Based on Number of Defaults')


plt.hist(df_big['income'],bins=5)
plt.ylabel('Frequency')
plt.xlabel("Income")
plt.title('Income Distribution')

Text(0.5, 1.0, 'Income Distribution')


plt.bar(['Not Approved','Approved'],df_big.groupby('card')['income'].mean().values)
plt.ylabel("Avg Income")
plt.title("Avg Income for Approved and Not Approved Applicants")

Text(0.5, 1.0, 'Avg Income for Approved and Not Approved Applicants')


plt.hist(df_big['age'],bins=5)
plt.ylabel('Frequency')
plt.xlabel("Age")
plt.title('Age Distribution')

Text(0.5, 1.0, 'Age Distribution')


plt.bar(['Not Approved','Approved'],df_big.groupby('card')['age'].mean().values)
plt.ylabel("Avg Age")
plt.title("Avg Age for Approved and Not Approved Applicants")

Text(0.5, 1.0, 'Avg Age for Approved and Not Approved Applicants')


df_small=pd.read_csv("../data/Small_Data.csv")
display(df_small.head())
df_small.dtypes

Gender              int64
Age               float64
Debt              float64
Married             int64
BankCustomer        int64
Industry           object
Ethnicity          object
YearsEmployed     float64
PriorDefault        int64
Employed            int64
CreditScore         int64
DriversLicense      int64
Citizen            object
ZipCode             int64
Income              int64
Approved            int64
dtype: object


def func(pct, allvalues):
    absolute = int(pct / 100.*np.sum(allvalues))
    return "{:.1f}%\n({:d})".format(pct, absolute)
plt.figure(figsize =(10, 7))
plt.title("Job Distribution of Applicants")
a=plt.pie(x=df_small['Industry'].value_counts(),
        labels=df_small['Industry'].value_counts().index,
        autopct=lambda pct: func(pct, df_small['Industry'].value_counts()),
         labeldistance=1.1,
         pctdistance=0.9,)


plt.bar(df_small.groupby('Industry')['Approved'].mean().sort_values().index,df_small.groupby('Industry')['Approved'].mean().sort_values().values)
plt.ylabel("Avg Approval Rate")
plt.xticks(rotation=90)
plt.title("Avg Approval Rate By Industry")

Text(0.5, 1.0, 'Avg Approval Rate By Industry')


plt.figure(figsize =(10, 7))
plt.title("Ethnicity Distribution of Applicants")
a=plt.pie(x=df_small['Ethnicity'].value_counts(),
        labels=df_small['Ethnicity'].value_counts().index,
        autopct=lambda pct: func(pct, df_small['Ethnicity'].value_counts()),
         labeldistance=1.1,
         pctdistance=0.9)


df_small['is_black']=[1 if i=="Black" else 0 for i in df_small['Ethnicity']]

plt.bar(["Black","Non-Black"],df_small.groupby('is_black')['Approved'].mean().values)
plt.ylabel("Avg Approval Rate")
plt.title("Approval Rate Based on Ethnicity")

Text(0.5, 1.0, 'Approval Rate Based on Ethnicity')


df_small['Gender_label']=["Male" if i==1 else "Female" for i in df_small['Gender']]
plt.hist(df_small['Gender_label'])
plt.ylabel('Frequency')
plt.title("Gender Distribution of Applicants")

Text(0.5, 1.0, 'Gender Distribution of Applicants')


plt.bar(["Male","Female"],df_small.groupby('Gender')['Approved'].mean().values)
plt.title("Approval Rate Based on Gender")
plt.ylabel("Avg Approval Rate")

Text(0, 0.5, 'Avg Approval Rate')


k_list=list(range(1,50))
accuracyscore=[]
for k in k_list:
    model = KNeighborsClassifier(n_neighbors=k)
    scaler=StandardScaler()
    vec = DictVectorizer(sparse=False)
    pipeline = Pipeline([
        ("vec",vec),
        ("scaler", scaler),
        ("model", model)
    ])
    features=['Gender', 'Age', 'Debt', 'Married', 'BankCustomer','is_black', 
              'YearsEmployed', 'Employed', 'CreditScore', 'ZipCode', 'Income']
    # define the training data
    X_train = df_small[features]
    y_train = df_small['Approved']
    X_train=X_train.to_dict(orient="records")
    
    accuracyscore.append(cross_validate(pipeline, X_train, y_train, cv=10,scoring="accuracy",return_train_score=True)['test_score'].mean())
plt.plot(k_list,accuracyscore)
plt.title("Accuracy vs k (full set of features)")
plt.grid()
plt.xlabel("K neighbors")
plt.ylabel("Accuracy")

Text(0, 0.5, 'Accuracy')


k_list=list(range(1,50))
accuracyscore=[]
for k in k_list:
    model = KNeighborsClassifier(n_neighbors=k)
    scaler=StandardScaler()
    vec = DictVectorizer(sparse=False)
    pipeline = Pipeline([
        ("vec",vec),
        ("scaler", scaler),
        ("model", model)
    ])
    features_withoutgender=['Age', 'Debt', 'Married', 'BankCustomer','is_black', 
              'YearsEmployed', 'Employed', 'CreditScore', 'ZipCode', 'Income']
    # define the training data
    X_train = df_small[features_withoutgender]
    y_train = df_small['Approved']
    X_train=X_train.to_dict(orient="records")
    
    accuracyscore.append(cross_validate(pipeline, X_train, y_train, cv=10,scoring="accuracy",return_train_score=True)['test_score'].mean())
plt.plot(k_list,accuracyscore)
plt.title("Accuracy vs k(without gender)")
plt.grid()
plt.xlabel("K neighbors")
plt.ylabel("Accuracy")

Text(0, 0.5, 'Accuracy')


k_list=list(range(1,50))
accuracyscore=[]
for k in k_list:
    model = KNeighborsClassifier(n_neighbors=k)
    scaler=StandardScaler()
    vec = DictVectorizer(sparse=False)
    pipeline = Pipeline([
        ("vec",vec),
        ("scaler", scaler),
        ("model", model)
    ])
    features_withoutrace=['Gender', 'Age', 'Debt', 'Married', 'BankCustomer', 
              'YearsEmployed', 'Employed', 'CreditScore', 'ZipCode', 'Income']
    # define the training data
    X_train = df_small[features_withoutrace]
    y_train = df_small['Approved']
    X_train=X_train.to_dict(orient="records")
    
    accuracyscore.append(cross_validate(pipeline, X_train, y_train, cv=10,scoring="accuracy",return_train_score=True)['test_score'].mean())
plt.plot(k_list,accuracyscore)
plt.title("Accuracy vs k(without ethnicity)")
plt.grid()
plt.xlabel("K neighbors")
plt.ylabel("Accuracy")

Text(0, 0.5, 'Accuracy')


model = LogisticRegression(random_state=20)
scaler=StandardScaler()
vec = DictVectorizer(sparse=False)
pipeline = Pipeline([
    ("vec",vec),
    ("scaler", scaler),
    ("model", model)
])
features=['Gender', 'Age', 'Debt', 'Married', 'BankCustomer', 'is_black',
              'YearsEmployed', 'Employed', 'CreditScore', 'ZipCode', 'Income']
# define the training data
X_train = df_small[features]
y_train = df_small['Approved']
X_train=X_train.to_dict(orient="records")

print("Accuracy with full set of features: {}".format(cross_validate(pipeline, X_train, y_train, cv=10,scoring="accuracy",return_train_score=True)['test_score'].mean()))

Accuracy with full set of features: 0.7623188405797101


model = LogisticRegression(random_state=20)
scaler=StandardScaler()
vec = DictVectorizer(sparse=False)
pipeline = Pipeline([
    ("vec",vec),
    ("scaler", scaler),
    ("model", model)
])
features_withoutgender=['Age', 'Debt', 'Married', 'BankCustomer', 'is_black',
              'YearsEmployed', 'Employed', 'CreditScore', 'ZipCode', 'Income']
# define the training data
X_train = df_small[features_withoutgender]
y_train = df_small['Approved']
X_train=X_train.to_dict(orient="records")

print("Accuracy without gender: {}".format(cross_validate(pipeline, X_train, y_train, cv=10,scoring="accuracy",return_train_score=True)['test_score'].mean()))

Accuracy without gender: 0.763768115942029


model = LogisticRegression(random_state=20)
scaler=StandardScaler()
vec = DictVectorizer(sparse=False)
pipeline = Pipeline([
    ("vec",vec),
    ("scaler", scaler),
    ("model", model)
])
features_withoutrace=['Gender','Age', 'Debt', 'Married', 'BankCustomer', 
              'YearsEmployed', 'Employed', 'CreditScore', 'ZipCode', 'Income']
# define the training data
X_train = df_small[features_withoutrace]
y_train = df_small['Approved']
X_train=X_train.to_dict(orient="records")

print("Accuracy without ethnicity: {}".format(cross_validate(pipeline, X_train, y_train, cv=10,scoring="f1",return_train_score=True)['test_score'].mean()))

Accuracy without ethnicity: 0.7116827082522499


by="Gender"
endog="Approved"
features=['Debt',"Income",'is_black',"Employed","BankCustomer","CreditScore","PriorDefault"]
df_numarray=pd.get_dummies(df_small[[endog,by]+features], drop_first=True).values
ox = Oaxaca(df_numarray, by=1, endo=0, debug=True)
unexplained,explained,gap=ox.two_fold(round_val=10)
print("Potential Discrimination: {}%".format(unexplained/gap*100))

These are the attempted split values: Float64Index([1.0, 0.0], dtype='float64')
Unexplained Effect: 0.006121929
Explained Effect: -0.037371929
Gap: -0.03125
Potential Discrimination: -19.5901728%


by="is_black"
endog="Approved"
#'Debt','Age',"Income",'Industry','Ethnicity',"Employed","Citizen","PriorDefault","Married"
features=['Gender','Debt','Age',"Income","Employed","BankCustomer","CreditScore","PriorDefault"]
df_numarray=pd.get_dummies(df_small[[endog,by]+features], drop_first=True).values
ox = Oaxaca(df_numarray, by=1, endo=0, debug=True)
unexplained,explained,gap=ox.two_fold(round_val=10)
print("Potential Discrimination: {}%".format(unexplained/gap*100))

These are the attempted split values: Float64Index([0.0, 1.0], dtype='float64')
Unexplained Effect: -0.0734653685
Explained Effect: -0.1584186894
Gap: -0.231884058
Potential Discrimination: 31.681940161664755%

	Gender	Age	Debt	Married	BankCustomer	Industry	Ethnicity	YearsEmployed	PriorDefault	Employed	CreditScore	DriversLicense	Citizen	ZipCode	Income	Approved
0	1	30.83	0.000	1	1	Industrials	White	1.25	1	1	1	0	ByBirth	202	0	1
1	0	58.67	4.460	1	1	Materials	Black	3.04	1	1	6	0	ByBirth	43	560	1
2	0	24.50	0.500	1	1	Materials	Black	1.50	1	0	0	0	ByBirth	280	824	1
3	1	27.83	1.540	1	1	Industrials	White	3.75	1	1	5	1	ByBirth	100	3	1
4	1	20.17	5.625	1	1	Industrials	White	1.71	1	0	0	0	ByOtherMeans	120	0	1

Project: Credit Card Approval Analysis

Names: Jingzhi Yang

https://yship1002.github.io

Motivation:

Project Goal

Background

Oaxaca-Blinder-Kitagawa Decomposition

Datasets

Collaboration Plan

Exploratory Data Analysis

Dataset 1: Credit Card Data from book 'Econometric Analysis'

What is the default rate distribution of this dataset?

How does default rate affect credit card applications?

What is the income distribution of this dataset?

How does income affect credit card applications?

What is the age distribution of this dataset?

How does age affect credit card applications?

Dataset 2: Cleaned Credit approval dataset from UCI

What is the Job distribution of applicants?

How does job types affect credit card application?

What is the Ethnicity distribution of applicants?

What are the approval rates for black and non-black applicants?

What is the Gender distribution of applicants?

What are the approval rates for male and female applicants?

Summary of EDA

Building Models

Part1: k-Nearest Neighbors(KNN)

Part2: Logistic Regression

Summary of Model Building Section:

Oaxaca-Blinder-Kitagawa Decomposition

Is there gender discrimination in credit card application?

Is there racial discrimination on black folks in credit card application?

Conclusion

	card	age	income	share	expenditure	owner	selfemp	dependents	months	majorcards	active
0	1	37.66667	45200.0	0.033270	124.983300	yes	no	3	54	1	12
1	1	33.25000	24200.0	0.005217	9.854167	no	no	3	34	1	13
2	1	33.66667	45000.0	0.004156	15.000000	yes	no	4	58	1	5
3	1	30.50000	25400.0	0.065214	137.869200	no	no	0	25	1	7
4	1	32.16667	97867.0	0.067051	546.503300	yes	no	2	64	1	5