The objective of this machine learning project is to demonstrate methods for handling imbalanced data, using credit card fraud detection as a working example.
# Visualization libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# ML libraries
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from sklearn import metrics
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss
# Import the data
file_path = r'F:\Python\creditcard.csv'
dataset = pd.read_csv(file_path)
dataset.head()
| Time | V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | ... | V21 | V22 | V23 | V24 | V25 | V26 | V27 | V28 | Amount | Class
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.0 | -1.359807 | -0.072781 | 2.536347 | 1.378155 | -0.338321 | 0.462388 | 0.239599 | 0.098698 | 0.363787 | ... | -0.018307 | 0.277838 | -0.110474 | 0.066928 | 0.128539 | -0.189115 | 0.133558 | -0.021053 | 149.62 | 0 |
1 | 0.0 | 1.191857 | 0.266151 | 0.166480 | 0.448154 | 0.060018 | -0.082361 | -0.078803 | 0.085102 | -0.255425 | ... | -0.225775 | -0.638672 | 0.101288 | -0.339846 | 0.167170 | 0.125895 | -0.008983 | 0.014724 | 2.69 | 0 |
2 | 1.0 | -1.358354 | -1.340163 | 1.773209 | 0.379780 | -0.503198 | 1.800499 | 0.791461 | 0.247676 | -1.514654 | ... | 0.247998 | 0.771679 | 0.909412 | -0.689281 | -0.327642 | -0.139097 | -0.055353 | -0.059752 | 378.66 | 0 |
3 | 1.0 | -0.966272 | -0.185226 | 1.792993 | -0.863291 | -0.010309 | 1.247203 | 0.237609 | 0.377436 | -1.387024 | ... | -0.108300 | 0.005274 | -0.190321 | -1.175575 | 0.647376 | -0.221929 | 0.062723 | 0.061458 | 123.50 | 0 |
4 | 2.0 | -1.158233 | 0.877737 | 1.548718 | 0.403034 | -0.407193 | 0.095921 | 0.592941 | -0.270533 | 0.817739 | ... | -0.009431 | 0.798278 | -0.137458 | 0.141267 | -0.206010 | 0.502292 | 0.219422 | 0.215153 | 69.99 | 0 |
5 rows × 31 columns
dataset.describe()
| Time | V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | ... | V21 | V22 | V23 | V24 | V25 | V26 | V27 | V28 | Amount | Class
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 284807.000000 | 2.848070e+05 | 2.848070e+05 | 2.848070e+05 | 2.848070e+05 | 2.848070e+05 | 2.848070e+05 | 2.848070e+05 | 2.848070e+05 | 2.848070e+05 | ... | 2.848070e+05 | 2.848070e+05 | 2.848070e+05 | 2.848070e+05 | 2.848070e+05 | 2.848070e+05 | 2.848070e+05 | 2.848070e+05 | 284807.000000 | 284807.000000 |
mean | 94813.859575 | 3.919560e-15 | 5.688174e-16 | -8.769071e-15 | 2.782312e-15 | -1.552563e-15 | 2.010663e-15 | -1.694249e-15 | -1.927028e-16 | -3.137024e-15 | ... | 1.537294e-16 | 7.959909e-16 | 5.367590e-16 | 4.458112e-15 | 1.453003e-15 | 1.699104e-15 | -3.660161e-16 | -1.206049e-16 | 88.349619 | 0.001727 |
std | 47488.145955 | 1.958696e+00 | 1.651309e+00 | 1.516255e+00 | 1.415869e+00 | 1.380247e+00 | 1.332271e+00 | 1.237094e+00 | 1.194353e+00 | 1.098632e+00 | ... | 7.345240e-01 | 7.257016e-01 | 6.244603e-01 | 6.056471e-01 | 5.212781e-01 | 4.822270e-01 | 4.036325e-01 | 3.300833e-01 | 250.120109 | 0.041527 |
min | 0.000000 | -5.640751e+01 | -7.271573e+01 | -4.832559e+01 | -5.683171e+00 | -1.137433e+02 | -2.616051e+01 | -4.355724e+01 | -7.321672e+01 | -1.343407e+01 | ... | -3.483038e+01 | -1.093314e+01 | -4.480774e+01 | -2.836627e+00 | -1.029540e+01 | -2.604551e+00 | -2.256568e+01 | -1.543008e+01 | 0.000000 | 0.000000 |
25% | 54201.500000 | -9.203734e-01 | -5.985499e-01 | -8.903648e-01 | -8.486401e-01 | -6.915971e-01 | -7.682956e-01 | -5.540759e-01 | -2.086297e-01 | -6.430976e-01 | ... | -2.283949e-01 | -5.423504e-01 | -1.618463e-01 | -3.545861e-01 | -3.171451e-01 | -3.269839e-01 | -7.083953e-02 | -5.295979e-02 | 5.600000 | 0.000000 |
50% | 84692.000000 | 1.810880e-02 | 6.548556e-02 | 1.798463e-01 | -1.984653e-02 | -5.433583e-02 | -2.741871e-01 | 4.010308e-02 | 2.235804e-02 | -5.142873e-02 | ... | -2.945017e-02 | 6.781943e-03 | -1.119293e-02 | 4.097606e-02 | 1.659350e-02 | -5.213911e-02 | 1.342146e-03 | 1.124383e-02 | 22.000000 | 0.000000 |
75% | 139320.500000 | 1.315642e+00 | 8.037239e-01 | 1.027196e+00 | 7.433413e-01 | 6.119264e-01 | 3.985649e-01 | 5.704361e-01 | 3.273459e-01 | 5.971390e-01 | ... | 1.863772e-01 | 5.285536e-01 | 1.476421e-01 | 4.395266e-01 | 3.507156e-01 | 2.409522e-01 | 9.104512e-02 | 7.827995e-02 | 77.165000 | 0.000000 |
max | 172792.000000 | 2.454930e+00 | 2.205773e+01 | 9.382558e+00 | 1.687534e+01 | 3.480167e+01 | 7.330163e+01 | 1.205895e+02 | 2.000721e+01 | 1.559499e+01 | ... | 2.720284e+01 | 1.050309e+01 | 2.252841e+01 | 4.584549e+00 | 7.519589e+00 | 3.517346e+00 | 3.161220e+01 | 3.384781e+01 | 25691.160000 | 1.000000 |
8 rows × 31 columns
dataset.shape
(284807, 31)
dataset.hist(bins=50, figsize=(20,20))
plt.show()
# Correlation Matrix
correlation = dataset.corr()
f,ax = plt.subplots(figsize=(20,20))
sns.heatmap(correlation, annot=True, ax=ax, linewidths=0.2,fmt='.2f')
plt.figure(figsize=(5,16))
heatmap = sns.heatmap(correlation[['Class']].sort_values(by='Class', ascending=False), vmin=-1, vmax=1, annot=True, cmap='bwr',linewidths=0.5,fmt='.3f')
heatmap.set_title('Features Correlation with Class', fontdict={'fontsize':16}, pad=15);
f, axes = plt.subplots(nrows=2, ncols=2, figsize=(25,15))
f.suptitle('Features With High Negative Correlation', size=35)
sns.boxplot(x='Class', y='V17', data=dataset, ax=axes[0,0])
sns.boxplot(x='Class',y='V14', data=dataset, ax=axes[0,1])
sns.boxplot(x='Class', y='V12', data=dataset, ax=axes[1,0])
sns.boxplot(x='Class', y='V10', data=dataset, ax=axes[1,1])
# Count normal vs. fraudulent transactions
transaction_type = dataset['Class'].value_counts()
normal = transaction_type[0]
fraud = transaction_type[1]
plt.figure(figsize=(10,8))
sns.barplot(x=transaction_type.index,y=transaction_type)
plt.title('Normal Vs. Fraudulent Transaction')
plt.xlabel('0: Normal , 1: Fraudulent')
plt.ylabel('No. of transactions')
Identifying fraudulent transactions is a binary classification task. In this project we compare three approaches (see the baseline sketch after this list):

1. LogisticRegression() trained on the raw, imbalanced data
2. SMOTE() oversampling of the minority (fraud) class
3. NearMiss() undersampling of the majority (normal) class
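To see why special handling is needed, note that the classes are so skewed that a model which never predicts fraud already looks excellent on accuracy. A minimal baseline sketch, assuming dataset is loaded as above:

# Trivial majority-class baseline: predict "normal" (0) for every transaction
baseline_pred = np.zeros(len(dataset), dtype=int)
print('Baseline accuracy:', (baseline_pred == dataset['Class'].values).mean())
# ~0.9983 even though it catches zero frauds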
# Select and normalize features
dataset['normAmount'] = StandardScaler().fit_transform(np.array(dataset['Amount']).reshape(-1, 1))
X = dataset.drop(columns=['Class', 'Time', 'Amount']).values
y = dataset['Class'].values
dataset['Class'].value_counts()
0    284315
1       492
Name: Class, dtype: int64
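Only 492 of the 284,807 transactions are fraudulent. Since Class is a 0/1 column, its mean gives the fraud share directly (this matches the mean reported by describe() above):

# Fraud share of the data: the mean of a 0/1 column is the positive-class fraction
print('Fraud ratio: {:.4%}'.format(dataset['Class'].mean()))   # ~0.1727%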
# splitting the dataset into training and testing set
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.33, random_state=2)
LogReg = LogisticRegression()
LogReg.fit(X_train,y_train)
y_pred = LogReg.predict(X_test)
# Use a distinct name so the imported confusion_matrix function is not shadowed
conf_matrix = pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'])
sns.heatmap(conf_matrix, annot=True, fmt='d')
print('Accuracy: ',metrics.accuracy_score(y_test, y_pred))
Accuracy: 0.9993616138402119
print(classification_report(y_test, y_pred))
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     93843
           1       0.89      0.67      0.76       144

    accuracy                           1.00     93987
   macro avg       0.94      0.83      0.88     93987
weighted avg       1.00      1.00      1.00     93987
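With 99.8% of transactions normal, accuracy is dominated by the majority class, so threshold-independent scores are worth checking as well. A short sketch using the fitted LogReg and sklearn's probability outputs:

# Rank-based metrics that do not depend on the default 0.5 decision threshold
y_scores = LogReg.predict_proba(X_test)[:, 1]   # estimated P(fraud) per test row
print('ROC-AUC:', metrics.roc_auc_score(y_test, y_scores))
print('Average precision (PR-AUC):', metrics.average_precision_score(y_test, y_scores))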
sm = SMOTE(random_state=2)
X_sm, y_sm = sm.fit_resample(X, y)   # fit_sample is deprecated in recent imblearn; use fit_resample
X_train_sm, X_test_sm, y_train_sm, y_test_sm = train_test_split(X_sm, y_sm, test_size=0.33, random_state=2)
keys = [0, 1]
vals_b4_sm = [sum(y_train == 0), sum(y_train == 1)]      # training-fold class counts before SMOTE
vals_sm = [sum(y_train_sm == 0), sum(y_train_sm == 1)]   # training-fold class counts after SMOTE
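Note that resampling before the train/test split lets synthetic minority points leak into the test set, which tends to inflate test scores. A leakage-free variant, sketched here with hypothetical names (X_tr, y_tr_res, etc.), resamples the training fold only:

# Leakage-free alternative: split first, then oversample only the training fold
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.33, random_state=2)
X_tr_res, y_tr_res = SMOTE(random_state=2).fit_resample(X_tr, y_tr)
# fit on (X_tr_res, y_tr_res), evaluate on the untouched (X_te, y_te)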
plt.figure(figsize=(8,6))
sns.barplot(x=keys, y=vals_b4_sm)
plt.title('Imbalanced Data')
plt.xlabel('0: Normal , 1: Fraudulent')
plt.ylabel('No. of transactions')
plt.figure(figsize=(8,6))
sns.barplot(x=keys, y=vals_sm)
plt.title('SMOTE data')
plt.xlabel('0: Normal , 1: Fraudulent')
plt.ylabel('No. of transactions')
fig, (ax1, ax2) = plt.subplots(ncols=2, sharey=True, figsize=[10,6])
sns.barplot(x=keys, y=vals_b4_sm, ax=ax1)
sns.barplot(x=keys, y=vals_sm, ax=ax2)
ax1.title.set_text('Imbalanced Data')
ax2.title.set_text('SMOTE Data')
LogReg_sm = LogisticRegression()
LogReg_sm.fit(X_train_sm, y_train_sm)
y_pred_sm = LogReg_sm.predict(X_test_sm)
conf_matrix = pd.crosstab(y_test_sm, y_pred_sm, rownames=['Actual'], colnames=['Predicted'])
sns.heatmap(conf_matrix, annot=True, fmt='d')
print('Accuracy: ', metrics.accuracy_score(y_test_sm, y_pred_sm))
Accuracy: 0.8162144014324693
print(classification_report(y_test_sm, y_pred_sm))
              precision    recall  f1-score   support

           0       0.73      1.00      0.84     93492
           1       1.00      0.63      0.78     94156

    accuracy                           0.82    187648
   macro avg       0.87      0.82      0.81    187648
weighted avg       0.87      0.82      0.81    187648
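Because this test set itself contains synthetic fraud samples, the numbers above are not directly comparable to the first model's. As a sanity check, a short sketch that scores the SMOTE-trained model on the original, untouched test split from the first experiment:

# Evaluate the SMOTE-trained model on the original (imbalanced) test split
y_pred_orig = LogReg_sm.predict(X_test)
print(classification_report(y_test, y_pred_orig))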
NM = NearMiss()
X_NM, y_NM = NM.fit_resample(X, y)   # fit_sample is deprecated in recent imblearn; use fit_resample
X_train_NM, X_test_NM, y_train_NM, y_test_NM = train_test_split(X_NM, y_NM, test_size=0.33, random_state=2)
LogReg_NM = LogisticRegression(max_iter=10000)
LogReg_NM.fit(X_train_NM, y_train_NM)
y_pred_NM = LogReg_NM.predict(X_test_NM)
conf_matrix = pd.crosstab(y_test_NM, y_pred_NM, rownames=['Actual'], colnames=['Predicted'])
sns.heatmap(conf_matrix, annot=True, fmt='d')
print('Accuracy: ', metrics.accuracy_score(y_test_NM, y_pred_NM))
Accuracy: 0.8215384615384616
print(classification_report(y_test_NM, y_pred_NM))
              precision    recall  f1-score   support

           0       0.74      1.00      0.85       162
           1       1.00      0.64      0.78       163

    accuracy                           0.82       325
   macro avg       0.87      0.82      0.82       325
weighted avg       0.87      0.82      0.82       325
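The same caveat applies here: NearMiss discards most normal transactions, so this 325-row test set is far easier than real traffic. Scoring the NearMiss-trained model on the original test split, again as a sketch, gives a more realistic picture:

# Evaluate the NearMiss-trained model on the original (imbalanced) test split
y_pred_orig_NM = LogReg_NM.predict(X_test)
print(classification_report(y_test, y_pred_orig_NM))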