The objective of this machine learning project is to demonstrate methods for handling imbalanced data, using credit card fraud detection as a working example.
# Visualization libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# ML libraries
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from sklearn import metrics
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss
# Import the data
file_path = r'F:\Python\creditcard.csv'
dataset = pd.read_csv(file_path)
dataset.head()
| Time | V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | ... | V21 | V22 | V23 | V24 | V25 | V26 | V27 | V28 | Amount | Class
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.0 | -1.359807 | -0.072781 | 2.536347 | 1.378155 | -0.338321 | 0.462388 | 0.239599 | 0.098698 | 0.363787 | ... | -0.018307 | 0.277838 | -0.110474 | 0.066928 | 0.128539 | -0.189115 | 0.133558 | -0.021053 | 149.62 | 0 |
1 | 0.0 | 1.191857 | 0.266151 | 0.166480 | 0.448154 | 0.060018 | -0.082361 | -0.078803 | 0.085102 | -0.255425 | ... | -0.225775 | -0.638672 | 0.101288 | -0.339846 | 0.167170 | 0.125895 | -0.008983 | 0.014724 | 2.69 | 0 |
2 | 1.0 | -1.358354 | -1.340163 | 1.773209 | 0.379780 | -0.503198 | 1.800499 | 0.791461 | 0.247676 | -1.514654 | ... | 0.247998 | 0.771679 | 0.909412 | -0.689281 | -0.327642 | -0.139097 | -0.055353 | -0.059752 | 378.66 | 0 |
3 | 1.0 | -0.966272 | -0.185226 | 1.792993 | -0.863291 | -0.010309 | 1.247203 | 0.237609 | 0.377436 | -1.387024 | ... | -0.108300 | 0.005274 | -0.190321 | -1.175575 | 0.647376 | -0.221929 | 0.062723 | 0.061458 | 123.50 | 0 |
4 | 2.0 | -1.158233 | 0.877737 | 1.548718 | 0.403034 | -0.407193 | 0.095921 | 0.592941 | -0.270533 | 0.817739 | ... | -0.009431 | 0.798278 | -0.137458 | 0.141267 | -0.206010 | 0.502292 | 0.219422 | 0.215153 | 69.99 | 0 |
5 rows × 31 columns
dataset.describe()
| Time | V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | ... | V21 | V22 | V23 | V24 | V25 | V26 | V27 | V28 | Amount | Class
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 284807.000000 | 2.848070e+05 | 2.848070e+05 | 2.848070e+05 | 2.848070e+05 | 2.848070e+05 | 2.848070e+05 | 2.848070e+05 | 2.848070e+05 | 2.848070e+05 | ... | 2.848070e+05 | 2.848070e+05 | 2.848070e+05 | 2.848070e+05 | 2.848070e+05 | 2.848070e+05 | 2.848070e+05 | 2.848070e+05 | 284807.000000 | 284807.000000 |
mean | 94813.859575 | 3.919560e-15 | 5.688174e-16 | -8.769071e-15 | 2.782312e-15 | -1.552563e-15 | 2.010663e-15 | -1.694249e-15 | -1.927028e-16 | -3.137024e-15 | ... | 1.537294e-16 | 7.959909e-16 | 5.367590e-16 | 4.458112e-15 | 1.453003e-15 | 1.699104e-15 | -3.660161e-16 | -1.206049e-16 | 88.349619 | 0.001727 |
std | 47488.145955 | 1.958696e+00 | 1.651309e+00 | 1.516255e+00 | 1.415869e+00 | 1.380247e+00 | 1.332271e+00 | 1.237094e+00 | 1.194353e+00 | 1.098632e+00 | ... | 7.345240e-01 | 7.257016e-01 | 6.244603e-01 | 6.056471e-01 | 5.212781e-01 | 4.822270e-01 | 4.036325e-01 | 3.300833e-01 | 250.120109 | 0.041527 |
min | 0.000000 | -5.640751e+01 | -7.271573e+01 | -4.832559e+01 | -5.683171e+00 | -1.137433e+02 | -2.616051e+01 | -4.355724e+01 | -7.321672e+01 | -1.343407e+01 | ... | -3.483038e+01 | -1.093314e+01 | -4.480774e+01 | -2.836627e+00 | -1.029540e+01 | -2.604551e+00 | -2.256568e+01 | -1.543008e+01 | 0.000000 | 0.000000 |
25% | 54201.500000 | -9.203734e-01 | -5.985499e-01 | -8.903648e-01 | -8.486401e-01 | -6.915971e-01 | -7.682956e-01 | -5.540759e-01 | -2.086297e-01 | -6.430976e-01 | ... | -2.283949e-01 | -5.423504e-01 | -1.618463e-01 | -3.545861e-01 | -3.171451e-01 | -3.269839e-01 | -7.083953e-02 | -5.295979e-02 | 5.600000 | 0.000000 |
50% | 84692.000000 | 1.810880e-02 | 6.548556e-02 | 1.798463e-01 | -1.984653e-02 | -5.433583e-02 | -2.741871e-01 | 4.010308e-02 | 2.235804e-02 | -5.142873e-02 | ... | -2.945017e-02 | 6.781943e-03 | -1.119293e-02 | 4.097606e-02 | 1.659350e-02 | -5.213911e-02 | 1.342146e-03 | 1.124383e-02 | 22.000000 | 0.000000 |
75% | 139320.500000 | 1.315642e+00 | 8.037239e-01 | 1.027196e+00 | 7.433413e-01 | 6.119264e-01 | 3.985649e-01 | 5.704361e-01 | 3.273459e-01 | 5.971390e-01 | ... | 1.863772e-01 | 5.285536e-01 | 1.476421e-01 | 4.395266e-01 | 3.507156e-01 | 2.409522e-01 | 9.104512e-02 | 7.827995e-02 | 77.165000 | 0.000000 |
max | 172792.000000 | 2.454930e+00 | 2.205773e+01 | 9.382558e+00 | 1.687534e+01 | 3.480167e+01 | 7.330163e+01 | 1.205895e+02 | 2.000721e+01 | 1.559499e+01 | ... | 2.720284e+01 | 1.050309e+01 | 2.252841e+01 | 4.584549e+00 | 7.519589e+00 | 3.517346e+00 | 3.161220e+01 | 3.384781e+01 | 25691.160000 | 1.000000 |
8 rows × 31 columns
dataset.shape
(284807, 31)
dataset.hist(bins=50, figsize=(20,20))
plt.show()
# Correlation Matrix
correlation = dataset.corr()
f,ax = plt.subplots(figsize=(20,20))
sns.heatmap(correlation, annot=True, ax=ax, linewidths=0.2,fmt='.2f')
plt.figure(figsize=(5,16))
heatmap = sns.heatmap(correlation[['Class']].sort_values(by='Class', ascending=False), vmin=-1, vmax=1, annot=True, cmap='bwr',linewidths=0.5,fmt='.3f')
heatmap.set_title('Features Correlation with Class', fontdict={'fontsize':16}, pad=15);
f, axes = plt.subplots(nrows=2, ncols=2, figsize=(25,15))
f.suptitle('Features With High Negative Correlation', size=35)
sns.boxplot(x='Class', y='V17', data=dataset, ax=axes[0,0])
sns.boxplot(x='Class',y='V14', data=dataset, ax=axes[0,1])
sns.boxplot(x='Class', y='V12', data=dataset, ax=axes[1,0])
sns.boxplot(x='Class', y='V10', data=dataset, ax=axes[1,1])
# Count normal vs. fraudulent transactions
transaction_type = dataset['Class'].value_counts()
normal = transaction_type[0]
fraud = transaction_type[1]
plt.figure(figsize=(10,8))
sns.barplot(x=transaction_type.index,y=transaction_type)
plt.title('Normal Vs. Fraudulent Transaction')
plt.xlabel('0: Normal , 1: Fraudulent')
plt.ylabel('No. of transactions')
Identifying fraudulent transactions is a binary classification task. In this project we compare three approaches (see the baseline sketch after this list):

1. LogisticRegression() trained on the raw, imbalanced data
2. SMOTE() oversampling of the minority (fraud) class
3. NearMiss() undersampling of the majority (normal) class
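To see why special handling is needed, note that the classes are so skewed that a model which never predicts fraud already looks excellent on accuracy. A minimal baseline sketch, assuming dataset is loaded as above:

# Trivial majority-class baseline: predict "normal" (0) for every transaction
baseline_pred = np.zeros(len(dataset), dtype=int)
print('Baseline accuracy:', (baseline_pred == dataset['Class'].values).mean())
# ~0.9983 even though it catches zero frauds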
# Select and normalize features
dataset['normAmount'] = StandardScaler().fit_transform(np.array(dataset['Amount']).reshape(-1, 1))
X = dataset.drop(columns=['Class', 'Time', 'Amount']).values
y = dataset['Class'].values
dataset['Class'].value_counts()
0    284315
1       492
Name: Class, dtype: int64
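Only 492 of the 284,807 transactions are fraudulent. Since Class is a 0/1 column, its mean gives the fraud share directly (this matches the mean reported by describe() above):

# Fraud share of the data: the mean of a 0/1 column is the positive-class fraction
print('Fraud ratio: {:.4%}'.format(dataset['Class'].mean()))   # ~0.1727%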
# splitting the dataset into training and testing set
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.33, random_state=2)
LogReg = LogisticRegression()
LogReg.fit(X_train,y_train)
y_pred = LogReg.predict(X_test)
# Use a distinct name so the imported confusion_matrix function is not shadowed
conf_matrix = pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'])
sns.heatmap(conf_matrix, annot=True, fmt='d')
print('Accuracy: ',metrics.accuracy_score(y_test, y_pred))
Accuracy: 0.9993616138402119
print(classification_report(y_test, y_pred))
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     93843
           1       0.89      0.67      0.76       144

    accuracy                           1.00     93987
   macro avg       0.94      0.83      0.88     93987
weighted avg       1.00      1.00      1.00     93987
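With 99.8% of transactions normal, accuracy is dominated by the majority class, so threshold-independent scores are worth checking as well. A short sketch using the fitted LogReg and sklearn's probability outputs:

# Rank-based metrics that do not depend on the default 0.5 decision threshold
y_scores = LogReg.predict_proba(X_test)[:, 1]   # estimated P(fraud) per test row
print('ROC-AUC:', metrics.roc_auc_score(y_test, y_scores))
print('Average precision (PR-AUC):', metrics.average_precision_score(y_test, y_scores))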
sm = SMOTE(random_state=2)
X_sm, y_sm = sm.fit_resample(X, y)   # fit_sample is deprecated in recent imblearn; use fit_resample
X_train_sm, X_test_sm, y_train_sm, y_test_sm = train_test_split(X_sm, y_sm, test_size=0.33, random_state=2)
keys = [0, 1]
vals_b4_sm = [sum(y_train == 0), sum(y_train == 1)]      # training-fold class counts before SMOTE
vals_sm = [sum(y_train_sm == 0), sum(y_train_sm == 1)]   # training-fold class counts after SMOTE
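Note that resampling before the train/test split lets synthetic minority points leak into the test set, which tends to inflate test scores. A leakage-free variant, sketched here with hypothetical names (X_tr, y_tr_res, etc.), resamples the training fold only:

# Leakage-free alternative: split first, then oversample only the training fold
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.33, random_state=2)
X_tr_res, y_tr_res = SMOTE(random_state=2).fit_resample(X_tr, y_tr)
# fit on (X_tr_res, y_tr_res), evaluate on the untouched (X_te, y_te)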
plt.figure(figsize=(8,6))
sns.barplot(x=keys, y=vals_b4_sm)
plt.title('Imbalanced Data')
plt.xlabel('0: Normal , 1: Fraudulent')
plt.ylabel('No. of transactions')
plt.figure(figsize=(8,6))
sns.barplot(x=keys, y=vals_sm)
plt.title('SMOTE data')
plt.xlabel('0: Normal , 1: Fraudulent')
plt.ylabel('No. of transactions')
fig, (ax1, ax2) = plt.subplots(ncols=2, sharey=True, figsize=[10,6])
sns.barplot(x=keys, y=vals_b4_sm, ax=ax1)
sns.barplot(x=keys, y=vals_sm, ax=ax2)
ax1.title.set_text('Imbalanced Data')
ax2.title.set_text('SMOTE Data')
LogReg_sm = LogisticRegression()
LogReg_sm.fit(X_train_sm, y_train_sm)
y_pred_sm = LogReg_sm.predict(X_test_sm)
conf_matrix = pd.crosstab(y_test_sm, y_pred_sm, rownames=['Actual'], colnames=['Predicted'])
sns.heatmap(conf_matrix, annot=True, fmt='d')
print('Accuracy: ', metrics.accuracy_score(y_test_sm, y_pred_sm))
Accuracy: 0.8162144014324693
print(classification_report(y_test_sm, y_pred_sm))
              precision    recall  f1-score   support

           0       0.73      1.00      0.84     93492
           1       1.00      0.63      0.78     94156

    accuracy                           0.82    187648
   macro avg       0.87      0.82      0.81    187648
weighted avg       0.87      0.82      0.81    187648
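Because this test set itself contains synthetic fraud samples, the numbers above are not directly comparable to the first model's. As a sanity check, a short sketch that scores the SMOTE-trained model on the original, untouched test split from the first experiment:

# Evaluate the SMOTE-trained model on the original (imbalanced) test split
y_pred_orig = LogReg_sm.predict(X_test)
print(classification_report(y_test, y_pred_orig))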
NM = NearMiss()
X_NM, y_NM = NM.fit_resample(X, y)   # fit_sample is deprecated in recent imblearn; use fit_resample
X_train_NM, X_test_NM, y_train_NM, y_test_NM = train_test_split(X_NM, y_NM, test_size=0.33, random_state=2)
LogReg_NM = LogisticRegression(max_iter=10000)
LogReg_NM.fit(X_train_NM, y_train_NM)
y_pred_NM = LogReg_NM.predict(X_test_NM)
conf_matrix = pd.crosstab(y_test_NM, y_pred_NM, rownames=['Actual'], colnames=['Predicted'])
sns.heatmap(conf_matrix, annot=True, fmt='d')
print('Accuracy: ', metrics.accuracy_score(y_test_NM, y_pred_NM))
Accuracy: 0.8215384615384616
print(classification_report(y_test_NM, y_pred_NM))
              precision    recall  f1-score   support

           0       0.74      1.00      0.85       162
           1       1.00      0.64      0.78       163

    accuracy                           0.82       325
   macro avg       0.87      0.82      0.82       325
weighted avg       0.87      0.82      0.82       325
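The same caveat applies here: NearMiss discards most normal transactions, so this 325-row test set is far easier than real traffic. Scoring the NearMiss-trained model on the original test split, again as a sketch, gives a more realistic picture:

# Evaluate the NearMiss-trained model on the original (imbalanced) test split
y_pred_orig_NM = LogReg_NM.predict(X_test)
print(classification_report(y_test, y_pred_orig_NM))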