import numpy as np
num_samples = 100000
# draw random numbers for every feature
x_1 = np.random.uniform(low=18., high=80., size=(num_samples, 1)) #age
x_2 = np.random.uniform(low=4., high=30., size=(num_samples, 1)) #income (thousands)
x_3 = np.random.uniform(low=0., high=1., size=(num_samples, 1)) < .1 #self-employed indicator (True for roughly 10% of borrowers)
X = np.concatenate([x_1, x_2, x_3], axis=1)
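A quick sanity check (my addition, not part of the original listing): concatenating the boolean indicator with the float features upcasts everything to float64, so X is an ordinary (100000, 3) float matrix.

#sanity check (illustrative): the boolean x_3 is upcast to float64 by np.concatenate
print(X.shape, X.dtype)  #expected: (100000, 3) float64
print(X[:, 2].mean())    #share of self-employed borrowers, roughly 0.10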
sigmoid = lambda x: 1. / (1. + np.exp(-x))
# a linear model
b_0 = 1.
b_1 = .33
b_2 = -3.5
b_3 = 3.
p_1 = sigmoid(b_0 + b_1 * x_1 + b_2 * x_2 + b_3 * x_3)
y_1 = np.squeeze(np.random.uniform(size=(num_samples, 1)) < p_1)
# a non-linear model
b_0 = -1.25
b_1 = 2.5
b_2 = -.25
b_3 = 1.
p_2 = sigmoid(b_0 + b_1 * ((x_1 < 25.) + (x_1 > 70.)) + b_2 * x_2 + b_3 * x_3)
y_2 = np.squeeze(np.random.uniform(size=(num_samples, 1)) < p_2)
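To see where the non-linearity sits, it helps to plot the simulated default probability against age; the probability jumps for borrowers younger than 25 or older than 70. A small illustrative sketch (the figure and its filename are my own, not from the original):

import matplotlib.pyplot as plt
#illustrative: visualize the non-linear age effect on the default probability
plt.figure()
plt.scatter(x_1[:2000, 0], p_2[:2000, 0], s=2)
plt.xlabel('age')
plt.ylabel('default probability (non-linear model)')
plt.savefig("age_effect.pdf", format="pdf")
plt.close()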
# split the dataset into a training set and a test set (both models share the same features)
X_train_1 = X_train_2 = X[:80000, :]
X_test_1 = X_test_2 = X[80000:, :]
y_train_1 = y_1[:80000]
y_test_1 = y_1[80000:]
y_train_2 = y_2[:80000]
y_test_2 = y_2[80000:]
p_2_test = p_2[80000:, :]
# check that the default rate is around 5% on the whole dataset for each model
print("Linear model: default rate {:.2f}%".format(np.mean(p_1) * 100.))
print("Non-linear model: default rate {:.2f}%".format(np.mean(p_2) * 100.))
Linear model: default rate 5.07%
Non-linear model: default rate 5.01%
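The realized default rates in the sampled labels should match these probabilities up to Monte Carlo noise; a quick check (my addition):

#realized default rates in the sampled labels (should be close to the rates above)
print("Realized, linear model: {:.2f}%".format(np.mean(y_1) * 100.))
print("Realized, non-linear model: {:.2f}%".format(np.mean(y_2) * 100.))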
from sklearn.linear_model import LogisticRegression
#logistic regression for the linear dataset
model_LR_1 = LogisticRegression().fit(X_train_1, y_train_1)
#a look at the coefficients
print(model_LR_1.coef_)
print(model_LR_1.intercept_)
#logistic regression for the non-linear dataset
model_LR_2 = LogisticRegression().fit(X_train_2, y_train_2)
print(model_LR_2.coef_)
print(model_LR_2.intercept_)
[[ 0.31791713 -3.41564217  2.9260502 ]] [1.18700865]
[[ 0.00838581 -0.2220419   0.81800324]] [-0.71313413]
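For the linear dataset the fitted coefficients sit close to the true values (0.33, -3.5, 3.0, intercept 1.0), while for the non-linear dataset the age coefficient is nearly zero: a linear score cannot represent the two age thresholds. Note that scikit-learn applies L2 regularization by default (C=1.0); to get even closer to the unregularized maximum-likelihood fit one can weaken the penalty, e.g. (illustrative, model_LR_unreg is my own name):

#illustrative: weaken the default L2 penalty to approach the unregularized MLE
model_LR_unreg = LogisticRegression(C=1e6).fit(X_train_1, y_train_1)
print(model_LR_unreg.coef_, model_LR_unreg.intercept_)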
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
#calculate the scores on the linear test set
scores_LR_1 = model_LR_1.predict_proba(X_test_1)
#calculate fpr and tpr
fpr_LR_1, tpr_LR_1, thresholds_LR_1 = metrics.roc_curve(y_test_1, scores_LR_1[:, 1], pos_label=1)
#calculate auc
auc_LR_1 = roc_auc_score(y_test_1, scores_LR_1[:, 1])
#calculate the scores on the non-linear test set
scores_LR_2 = model_LR_2.predict_proba(X_test_2)
fpr_LR_2, tpr_LR_2, thresholds_LR_2 = metrics.roc_curve(y_test_2, scores_LR_2[:, 1], pos_label=1)
auc_LR_2 = roc_auc_score(y_test_2, scores_LR_2[:, 1])
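As a cross-check (my addition), the same AUC values can be obtained by integrating the ROC curves directly:

#cross-check: integrating the ROC curve should reproduce roc_auc_score
print(metrics.auc(fpr_LR_1, tpr_LR_1), metrics.auc(fpr_LR_2, tpr_LR_2))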
#plot the ROC curves
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr_LR_1, tpr_LR_1, label='Linear model AUC={:.2f}'.format(auc_LR_1))
plt.plot(fpr_LR_2, tpr_LR_2, label='Non-linear model AUC={:.2f}'.format(auc_LR_2))
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve (logistic regression)')
plt.legend(loc='best')
plt.savefig("ROC_1.pdf", format="pdf")
import tensorflow as tf
from tensorflow import keras
#build the nn model with 2 hidden layers
model_NN = keras.Sequential(
    [
        keras.layers.BatchNormalization(),
        keras.layers.Dense(32, activation='relu', name="hidden_layer_1"),
        keras.layers.Dense(32, activation='relu', name="hidden_layer_2"),
        keras.layers.Dense(1, activation='sigmoid', name="output_layer")
    ]
)
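The BatchNormalization layer standardizes the raw inputs; since no Input layer is specified, the model is only built on its first call. To inspect the architecture up front, one can build it explicitly for the three features (a small sketch, assuming TF 2.x):

#optional: build explicitly for the 3 input features so summary() works before training
model_NN.build(input_shape=(None, 3))
model_NN.summary()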
#the loss function
def total_deviance(y_true, y_pred):
    return tf.math.reduce_mean(-y_true * tf.math.log(y_pred) - (1. - y_true) * tf.math.log(1. - y_pred))
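Up to numerical clipping, this "total deviance" is exactly the mean binary cross-entropy; a quick numerical check against the Keras built-in (y_t and y_p are toy values of my choosing):

#illustrative check: total_deviance coincides with Keras' binary cross-entropy
y_t = tf.constant([0., 1., 1.])
y_p = tf.constant([0.1, 0.8, 0.6])
print(total_deviance(y_t, y_p).numpy())
print(keras.losses.binary_crossentropy(y_t, y_p).numpy())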
#compile the model with the Adam optimizer and the deviance loss
model_NN.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    loss=total_deviance
)
#build a tf.data pipeline that shuffles and batches the training data
train_dataset = tf.data.Dataset.from_tensor_slices((X_train_2, np.float32(y_train_2)))
train_dataset = train_dataset.shuffle(buffer_size=1024).batch(256)
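Optionally (my addition, not in the original pipeline), prefetching lets the input pipeline prepare the next batch while the model trains on the current one:

#optional: overlap data preparation and training (tf.data.AUTOTUNE needs TF >= 2.4;
#older versions use tf.data.experimental.AUTOTUNE)
train_dataset = train_dataset.prefetch(tf.data.AUTOTUNE)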
#fit the model (the dataset is already batched, so no batch_size argument is passed)
model_NN.fit(
    train_dataset,
    epochs=10
)
Epoch  1/10  313/313 - 1s 2ms/step - loss: 0.3586
Epoch  2/10  313/313 - 1s 2ms/step - loss: 0.1650
Epoch  3/10  313/313 - 1s 2ms/step - loss: 0.1490
Epoch  4/10  313/313 - 1s 2ms/step - loss: 0.1442
Epoch  5/10  313/313 - 1s 2ms/step - loss: 0.1436
Epoch  6/10  313/313 - 1s 2ms/step - loss: 0.1430
Epoch  7/10  313/313 - 1s 2ms/step - loss: 0.1426
Epoch  8/10  313/313 - 1s 2ms/step - loss: 0.1423
Epoch  9/10  313/313 - 1s 2ms/step - loss: 0.1423
Epoch 10/10  313/313 - 1s 2ms/step - loss: 0.1421
#calculate the scores on the test data (convert the output tensor to a NumPy array)
scores_NN = model_NN(X_test_2).numpy()
fpr_NN, tpr_NN, thresholds_NN = metrics.roc_curve(y_test_2, scores_NN[:, 0], pos_label=1)
auc_NN = roc_auc_score(y_test_2, scores_NN[:, 0])
plt.plot(fpr_NN, tpr_NN, label='NN non-linear model AUC={:.2f}'.format(auc_NN))
plt.plot(fpr_LR_2, tpr_LR_2, label='LR non-linear model AUC={:.2f}'.format(auc_LR_2))
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve (NN vs. LR)')
plt.legend(loc='best')
plt.savefig("ROC_NN.pdf", format="pdf")
from matplotlib.ticker import StrMethodFormatter
portfolio_size = 5000
loan_amount = 1000.
rate = 0.055
#simulate 50,000 Monte Carlo default scenarios for the 5,000-loan portfolio, using the true default probabilities
defaults = np.float32(np.random.uniform(low=0., high=1., size=(portfolio_size, 50000)) < p_2_test[:portfolio_size, :])
#P&L for scenario 1: interest income on performing loans minus the full loss on defaults
p_and_l_a = np.sum(loan_amount * (rate * (1. - defaults) - defaults), axis=0)
#histogram of the P&L distribution for scenario 1
plt.figure(1)
plt.hist(p_and_l_a, bins=150)
plt.gca().xaxis.set_major_formatter(StrMethodFormatter('{x:,.0f}'))
plt.gca().yaxis.set_major_formatter(StrMethodFormatter('{x:,.0f}'))
plt.savefig("hist_1.pdf", format="pdf")
print('Mean: {:.2f}'.format(np.mean(p_and_l_a)))
p_and_l_a = np.sort(p_and_l_a)
#99% VaR: the 500th-worst of the 50,000 simulated outcomes (the 1% quantile)
print('VaR: {:.2f}'.format(-p_and_l_a[499]))
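An essentially equivalent one-liner (my addition) reads the 1% quantile off directly; np.quantile interpolates between order statistics, so the result can differ marginally from the sorted-index version:

#cross-check: 99% VaR via the empirical 1% quantile
print('VaR (quantile): {:.2f}'.format(-np.quantile(p_and_l_a, 0.01)))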
#scenario 2: a lower rate, and loans are granted only when the model's predicted default probability is below 5%
rate = 0.015
p_and_l_b_LR = np.sum(loan_amount * (rate * (1. - defaults) - defaults) * np.float32(scores_LR_2[:portfolio_size, 1:2] < 0.05), axis=0)
p_and_l_b_NN = np.sum(loan_amount * (rate * (1. - defaults) - defaults) * np.float32(scores_NN[:portfolio_size, :] < 0.05), axis=0)
plt.figure(2)
plt.hist(p_and_l_b_LR, bins=100, label='LR')
plt.hist(p_and_l_b_NN, bins=100, label='NN')
plt.gca().xaxis.set_major_formatter(StrMethodFormatter('{x:,.0f}'))
plt.gca().yaxis.set_major_formatter(StrMethodFormatter('{x:,.0f}'))
plt.legend()
plt.savefig("hist_2.pdf", format="pdf")
print('Mean (LR): {:.2f}'.format(np.mean(p_and_l_b_LR)))
p_and_l_b_LR = np.sort(p_and_l_b_LR)
print('VaR (LR): {:.2f}'.format(-p_and_l_b_LR[499]))
print('Mean (NN): {:.2f}'.format(np.mean(p_and_l_b_NN)))
p_and_l_b_NN = np.sort(p_and_l_b_NN)
print('VaR (NN): {:.2f}'.format(-p_and_l_b_NN[499]))
Mean: 9863.90
VaR: 23565.00
Mean (LR): 4298.28
VaR (LR): 11855.00
Mean (NN): 18154.19
VaR (NN): -3000.00
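A closing sanity check (my addition): the simulated scenario-1 mean can be reproduced analytically, since in expectation each loan contributes loan_amount * (rate * (1 - p) - p) with its true default probability p:

#analytic expected P&L for scenario 1 (rate 5.5%); p_portfolio is my own name
p_portfolio = p_2_test[:portfolio_size, 0]
print('Analytic mean: {:.2f}'.format(np.sum(loan_amount * (0.055 * (1. - p_portfolio) - p_portfolio))))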