{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# AI Tools for Actuaries\n",
    "## Chapter 5: LocalGLMnet in Python - PyTorch\n",
    "### Author: Marco Maggi, Mario Wuthrich\n",
    "### Version Summer School August 2025"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Import required libraries\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import statsmodels.api as sm\n",
    "from statsmodels.formula.api import glm\n",
    "\n",
    "pd.options.mode.chained_assignment = None\n",
    "\n",
    "# Set random seed\n",
    "rng = np.random.default_rng(500)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Load data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the data (load the data with the entity embeddings)\n",
    "df = pd.read_parquet(\"../../Data/freMTPL2freqEmb.parquet\")\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Add random component\n",
    "df[\"RandN\"] = rng.normal(0, 1, size=len(df))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "learn = df[df[\"LearnTest\"] == \"L\"]\n",
    "test = df[df[\"LearnTest\"] == \"T\"]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Pre-process data for LocalGLMnet"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.compose import ColumnTransformer\n",
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.preprocessing import (\n",
    "    FunctionTransformer,\n",
    "    OrdinalEncoder,\n",
    "    StandardScaler,\n",
    ")\n",
    "\n",
    "\n",
    "def clip_and_scale(upper):\n",
    "    pipe = Pipeline(\n",
    "        steps=[\n",
    "            (\"clip\", FunctionTransformer(lambda x: np.clip(x, a_min=0, a_max=upper))),\n",
    "            (\"scale\", StandardScaler()),\n",
    "        ]\n",
    "    )\n",
    "    return pipe\n",
    "\n",
    "\n",
    "density = Pipeline(\n",
    "    steps=[\n",
    "        (\"log\", FunctionTransformer(lambda x: np.log(x).round(2))),\n",
    "        (\"scale\", StandardScaler()),\n",
    "    ]\n",
    ")\n",
    "\n",
    "area = Pipeline(\n",
    "    steps=[\n",
    "        (\"encode\", OrdinalEncoder()),\n",
    "        (\"scale\", StandardScaler()),\n",
    "    ]\n",
    ")\n",
    "\n",
    "preprocessor = ColumnTransformer(\n",
    "    transformers=[\n",
    "        (\n",
    "            \"clip_and_scale\",\n",
    "            clip_and_scale([20, 90, 150, 15]),\n",
    "            [\"VehAge\", \"DrivAge\", \"BonusMalus\", \"VehPower\"],\n",
    "        ),\n",
    "        (\n",
    "            \"scale\",\n",
    "            StandardScaler(),\n",
    "            [\"RegionEmb1\", \"RegionEmb2\", \"VehBrandEmb1\", \"VehBrandEmb2\"],\n",
    "        ),\n",
    "        (\"area\", area, [\"Area\"]),\n",
    "        (\"density\", density, [\"Density\"]),\n",
    "        (\n",
    "            \"veh_gas\",\n",
    "            FunctionTransformer(lambda x: (x == \"Diesel\").astype(np.float32)),\n",
    "            [\"VehGas\"],\n",
    "        ),\n",
    "        (\"veh_brand\", OrdinalEncoder(), [\"VehBrand\", \"Region\"]),\n",
    "        (\"passthrough\", \"passthrough\", [\"RandN\", \"ClaimNb\"]),\n",
    "    ],\n",
    "    verbose_feature_names_out=False,\n",
    ")\n",
    "\n",
    "\n",
    "# Just a check: Fit preprocessor to training data and apply to some lines from test\n",
    "preprocessor.set_output(transform=\"pandas\").fit(learn)\n",
    "preprocessor.transform(test.head())\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(f\"Learning set size: {len(learn)}\")\n",
    "print(f\"Test set size: {len(test)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_learn = preprocessor.fit_transform(learn)\n",
    "X_learn.head(3)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### GLM baseline analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# We start with a baseline GLM to initialize the LocalGLMnet suitably\n",
    "\n",
    "# Features\n",
    "features = [\n",
    "    \"Area\",\n",
    "    \"VehPower\",\n",
    "    \"VehAge\",\n",
    "    \"DrivAge\",\n",
    "    \"BonusMalus\",\n",
    "    \"VehGas\",\n",
    "    \"Density\",\n",
    "    \"VehBrandEmb1\",\n",
    "    \"VehBrandEmb2\",\n",
    "    \"RegionEmb1\",\n",
    "    \"RegionEmb2\",\n",
    "    \"RandN\",\n",
    "]\n",
    "\n",
    "# Fit a Poisson GLM using the package statsmodels.\n",
    "# Set `ClaimNb` as response variable and `features` as covariates.\n",
    "# Use log `Exposure` as offset.\n",
    "model = None  # replace None with your code\n",
    "\n",
    "glm_results = model.fit()\n",
    "\n",
    "# Display model summary\n",
    "glm_results.summary()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Calculate deviance losses"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_test = preprocessor.transform(test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.metrics import mean_poisson_deviance\n",
    "\n",
    "# Get predictions\n",
    "learn[\"GLM\"] = glm_results.predict(X_learn)\n",
    "test[\"GLM\"] = glm_results.predict(X_test)\n",
    "\n",
    "# Calculate in-sample and out-of-sample deviance\n",
    "learn_deviance = 100 * mean_poisson_deviance(\n",
    "    learn[\"ClaimNb\"] / learn[\"Exposure\"], learn[\"GLM\"], sample_weight=learn[\"Exposure\"]\n",
    ")\n",
    "test_deviance = 100 * mean_poisson_deviance(\n",
    "    test[\"ClaimNb\"] / test[\"Exposure\"], test[\"GLM\"], sample_weight=test[\"Exposure\"]\n",
    ")\n",
    "\n",
    "print(\"Deviance Losses:\")\n",
    "print(f\"Learning sample: {learn_deviance:.3f}\")\n",
    "print(f\"Test sample: {test_deviance:.3f}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## LocalGLMnet\n",
    "We have performed all the preparatory work above, and now we dive into the LocalGLMnet"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Define the LocalGLMnet arichtecture (of depth 3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch\n",
    "from torch import nn\n",
    "from torch.nn import init\n",
    "\n",
    "simpel = nn.Linear(2, 4)\n",
    "simpel.weight"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch\n",
    "from torch import nn\n",
    "from torch.nn import init\n",
    "\n",
    "\n",
    "class FNN(nn.Module):\n",
    "    def __init__(self, seed, n_features, hidden_layers, intercept, glm_coefs):\n",
    "        super().__init__()\n",
    "        torch.manual_seed(seed)\n",
    "        self.hidden_layers = nn.ModuleList()\n",
    "        for i in range(len(hidden_layers)):\n",
    "            if i == 0:\n",
    "                self.hidden_layers.append(nn.Linear(n_features, hidden_layers[i]))\n",
    "            else:\n",
    "                self.hidden_layers.append(\n",
    "                    nn.Linear(hidden_layers[i - 1], hidden_layers[i])\n",
    "                )\n",
    "        # Define the last layer of the neural network which fits the\n",
    "        # `attention weights`\n",
    "        self.local_glm = None  # replace None with your code\n",
    "        # Initialize the weights and biases of the last layer such that the SGD\n",
    "        # will start from the MLE estimates of the GLM.\n",
    "        self.local_glm.bias.data = None  # replace None with your code\n",
    "        init.constant_(None, 0.0)  # replace None with your code\n",
    "        # Define the intercept as trainable parameter and initialize it with the GLM intercept.\n",
    "        self.intercept = nn.Parameter(None)  # replace None with your code\n",
    "\n",
    "    def forward(self, design, v, get_attentions=False):\n",
    "        # Implement the forward pass of the LocalGLMnet.\n",
    "        x = torch.tanh(self.hidden_layers[0](design))\n",
    "        for layer in self.hidden_layers[1:]:\n",
    "            x = None  # replace None with your code\n",
    "        x = None  # replace None with your code\n",
    "        if get_attentions:\n",
    "            pass  # replace pass with your code\n",
    "        skip_connection = torch.einsum(\"ij,ij->i\", x, design).unsqueeze(1)\n",
    "        x = None  # replace None with your code\n",
    "        return torch.exp(x).flatten() * v\n",
    "\n",
    "\n",
    "SEED = 21456783\n",
    "M_FEAT = len(features)  # number of features\n",
    "HIDDEN = [20, 15, 10]\n",
    "# Create model with three hidden layers\n",
    "model = FNN(\n",
    "    SEED,\n",
    "    n_features=M_FEAT,\n",
    "    hidden_layers=HIDDEN,\n",
    "    intercept=glm_results.params.iloc[0],\n",
    "    glm_coefs=glm_results.params.to_numpy()[1:],\n",
    ")\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Check that the LocalGLMnet before training replicates the GLM."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Backtest the initalization\n",
    "model.eval()\n",
    "X_test_tensor = torch.tensor(X_test[features].values, dtype=torch.float32)\n",
    "X_learn_tensor = torch.tensor(X_learn[features].values, dtype=torch.float32)\n",
    "exposure_test_tensor = torch.tensor(\n",
    "    test[\"Exposure\"].astype(\"float32\").values, dtype=torch.float32\n",
    ")\n",
    "exposure_learn_tensor = torch.tensor(\n",
    "    learn[\"Exposure\"].astype(\"float32\").values, dtype=torch.float32\n",
    ")\n",
    "\n",
    "# call the LocalGLMnet to get predictions on test and learn data.\n",
    "# replace None with your code.\n",
    "test_GLM = None.detach().numpy()\n",
    "learn_GLM = None.detach().numpy()\n",
    "\n",
    "# Exposure\n",
    "V_learn = learn[\"Exposure\"]\n",
    "V_test = test[\"Exposure\"]\n",
    "\n",
    "# Response\n",
    "Y_learn = learn[\"ClaimNb\"]\n",
    "Y_test = test[\"ClaimNb\"]\n",
    "\n",
    "poisson_deviance_train_glm = 100 * mean_poisson_deviance(\n",
    "    Y_learn / V_learn, learn_GLM / V_learn, sample_weight=V_learn\n",
    ")\n",
    "poisson_deviance_test_glm = 100 * mean_poisson_deviance(\n",
    "    Y_test / V_test, test_GLM / V_test, sample_weight=V_test\n",
    ")\n",
    "print(\n",
    "    \"Poisson Deviance (Train, Test):\",\n",
    "    round(poisson_deviance_train_glm, 3),\n",
    "    round(poisson_deviance_test_glm, 3),\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Train the LocalGLMnet model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from copy import deepcopy\n",
    "\n",
    "\n",
    "def train_model(\n",
    "    model,\n",
    "    X_train,\n",
    "    y_train,\n",
    "    v_train,\n",
    "    X_val,\n",
    "    y_val,\n",
    "    v_val,\n",
    "    optimizer,\n",
    "    checkpoint_path,\n",
    "    batch_size,\n",
    "    n_epochs=100,\n",
    "):\n",
    "    loss_fn = nn.PoissonNLLLoss(log_input=False, reduction=\"sum\")\n",
    "    best_val_loss = float(\"inf\")\n",
    "    history = {\"loss\": [], \"val_loss\": []}\n",
    "\n",
    "    # Create dataset indices for batching\n",
    "    num_batches = (len(X_train) + batch_size - 1) // batch_size\n",
    "\n",
    "    for epoch in range(n_epochs):\n",
    "        # Training phase\n",
    "        model.train()\n",
    "        epoch_loss = 0.0\n",
    "        # Indices can be shuffled for each epoch. We don't do it here.\n",
    "        indices = torch.arange(len(X_train))\n",
    "\n",
    "        for i in range(num_batches):\n",
    "            # Get batch indices\n",
    "            batch_indices = indices[\n",
    "                i * batch_size : min((i + 1) * batch_size, len(X_train))\n",
    "            ]\n",
    "\n",
    "            # Get batch data\n",
    "            X_batch = X_train[batch_indices]\n",
    "            v_batch = v_train[batch_indices]\n",
    "            y_batch = y_train[batch_indices]\n",
    "\n",
    "            # Forward pass\n",
    "            pred_batch = model(X_batch, v_batch)\n",
    "            loss = loss_fn(pred_batch, y_batch)\n",
    "\n",
    "            # Backward pass and optimize\n",
    "            optimizer.zero_grad()\n",
    "            loss.backward()\n",
    "            optimizer.step()\n",
    "\n",
    "            epoch_loss += loss.item()\n",
    "\n",
    "        # Average loss for the epoch\n",
    "        epoch_loss /= v_train.sum().item()\n",
    "        history[\"loss\"].append(epoch_loss)\n",
    "\n",
    "        # Validation phase\n",
    "        model.eval()\n",
    "        with torch.no_grad():\n",
    "            pred_val = model(X_val, v_val)\n",
    "            val_loss = (loss_fn(pred_val, y_val) / v_val.sum()).item()\n",
    "            history[\"val_loss\"].append(val_loss)\n",
    "\n",
    "        # Store best model\n",
    "        if val_loss < best_val_loss and isinstance(checkpoint_path, str):\n",
    "            best_val_loss = val_loss\n",
    "            best_model = deepcopy(model)\n",
    "\n",
    "        # Print progress\n",
    "        if (epoch + 1) % 10 == 0:\n",
    "            print(\n",
    "                f\"Epoch {epoch + 1}/{n_epochs}, Loss: {epoch_loss:.4f}, Val Loss: {val_loss:.4f}\"\n",
    "            )\n",
    "    torch.save(best_model.state_dict(), checkpoint_path)\n",
    "    return history\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "convert_to_tensor = lambda x: torch.tensor(x.values, dtype=torch.float32)\n",
    "train, val = train_test_split(learn, test_size=0.1, random_state=125548)\n",
    "\n",
    "X_learn = convert_to_tensor(preprocessor.transform(learn)[features])\n",
    "X_train = convert_to_tensor(preprocessor.transform(train)[features])\n",
    "X_val = convert_to_tensor(preprocessor.transform(val)[features])\n",
    "X_test = convert_to_tensor(preprocessor.transform(test)[features])\n",
    "\n",
    "y_learn, v_learn = convert_to_tensor(learn.ClaimNb), convert_to_tensor(learn.Exposure)\n",
    "y_train, v_train = convert_to_tensor(train.ClaimNb), convert_to_tensor(train.Exposure)\n",
    "y_val, v_val = convert_to_tensor(val.ClaimNb), convert_to_tensor(val.Exposure)\n",
    "y_test, v_test = convert_to_tensor(test.ClaimNb), convert_to_tensor(test.Exposure)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "optimizer = torch.optim.NAdam(model.parameters())\n",
    "checkpoint_path = f\"./Networks/LocalGLMnet_{SEED}.pt\"\n",
    "history = train_model(\n",
    "    model,\n",
    "    X_train,\n",
    "    y_train,\n",
    "    v_train,\n",
    "    X_val,\n",
    "    y_val,\n",
    "    v_val,\n",
    "    optimizer,\n",
    "    checkpoint_path,\n",
    "    batch_size=5_000,\n",
    "    n_epochs=100,\n",
    ")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Plot training history (vertical line at best validation loss)\n",
    "fig = (\n",
    "    pd.DataFrame({\"loss\": history[\"loss\"], \"val_loss\": history[\"val_loss\"]})\n",
    "    .rename(columns={\"loss\": \"Training\", \"val_loss\": \"Validation\"})\n",
    "    .plot(xlabel=\"Epoch - 1\", ylabel=\"Loss\", title=\"Loss During Training\", grid=True)\n",
    ")\n",
    "_ = fig.axvline(np.argmin(history[\"val_loss\"]), color=\"black\", linestyle=\"--\")\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### LocalGLMnet results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.metrics import mean_poisson_deviance\n",
    "\n",
    "\n",
    "# Helper functions to evaluate the model via average Poisson deviance\n",
    "def score(model, X, y, v):\n",
    "    \"\"\"Evaluate the model using sklearn's mean_poisson_deviance.\"\"\"\n",
    "    pred = model(X, v).detach().numpy()\n",
    "    return 100 * mean_poisson_deviance(\n",
    "        y.detach().numpy() / v, pred / v, sample_weight=v\n",
    "    )\n",
    "\n",
    "\n",
    "# Load best weights and evaluate\n",
    "model.load_state_dict(torch.load(checkpoint_path))\n",
    "model.eval()\n",
    "\n",
    "print(\"===GLM===\")\n",
    "print(\n",
    "    \"Poisson Deviance (Train, Test):\",\n",
    "    round(poisson_deviance_train_glm, 3),\n",
    "    round(poisson_deviance_test_glm, 3),\n",
    ")\n",
    "print(\"===LocalGLMnet===\")\n",
    "print(f\"Poisson Deviance (Learn): {score(model, X_learn, y_learn, v_learn):.3f}\")\n",
    "print(f\"Poisson Deviance (Test): {score(model, X_test, y_test, v_test):.3f}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Illustrate the LocalGLMnet results: extract attention weights"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# replace None with your code\n",
    "attention_weights = None.detach().numpy()\n",
    "attention_df = pd.DataFrame(attention_weights, columns=features)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Boxplot of the attention weights (RandNX is unrelated to the response)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Calculate standard deviation of RandNX (which does not impact the response)\n",
    "randnx_std = attention_df[\"RandN\"].std()\n",
    "threshold = 2.576 * randnx_std\n",
    "\n",
    "# Plot attention weights\n",
    "plt.figure(figsize=(12, 6))\n",
    "attention_df.boxplot()\n",
    "plt.xticks(rotation=45)\n",
    "plt.title(\"boxplot of attention weights\")\n",
    "plt.axhline(y=threshold, color=\"r\", linestyle=\"--\", label=\"+2.576 std dev (99% CI)\")\n",
    "plt.axhline(y=-threshold, color=\"r\", linestyle=\"--\", label=\"-2.576 std dev (99% CI)\")\n",
    "plt.axhline(y=0, color=\"k\", linestyle=\"-\", label=\"zero line\")\n",
    "plt.legend()\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Compute importance measure for all variables/terms"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Calculate and plot the variable importance measure"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Plot individual attention weights for selected variables"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Function to create individual attention weight plots\n",
    "def plot_attention_weights(feature_name, alpha):\n",
    "    # Get unique values for x-axis\n",
    "    x_values = np.sort(test[feature_name].unique())\n",
    "\n",
    "    # Create the plot\n",
    "    plt.figure(figsize=(10, 6))\n",
    "\n",
    "    # Plot the attention weights\n",
    "    plt.scatter(\n",
    "        test[feature_name],\n",
    "        attention_df[feature_name],\n",
    "        alpha=0.5,\n",
    "        s=20,\n",
    "        label=\"attention weights\",\n",
    "    )\n",
    "\n",
    "    # Add reference lines\n",
    "    plt.axhline(y=0, color=\"cyan\", linestyle=\"-\", label=\"zero line\")\n",
    "    plt.axhline(\n",
    "        y=0.674 * randnx_std,\n",
    "        color=\"orange\",\n",
    "        linestyle=\"-\",\n",
    "        label=\"0.674 std.dev. (50%)\",\n",
    "    )\n",
    "    plt.axhline(y=-0.674 * randnx_std, color=\"orange\", linestyle=\"-\")\n",
    "    plt.axhline(\n",
    "        y=2.576 * randnx_std, color=\"red\", linestyle=\"-\", label=\"2.576 std.dev. (99%)\"\n",
    "    )\n",
    "    plt.axhline(y=-2.576 * randnx_std, color=\"red\", linestyle=\"-\")\n",
    "\n",
    "    # Add shaded area\n",
    "    plt.fill_between(\n",
    "        [test[feature_name].min(), test[feature_name].max()],\n",
    "        [-0.674 * randnx_std, -0.674 * randnx_std],\n",
    "        [0.674 * randnx_std, 0.674 * randnx_std],\n",
    "        color=\"orange\",\n",
    "        alpha=0.3,\n",
    "    )\n",
    "\n",
    "    # Add local regression fit\n",
    "    from statsmodels.nonparametric.smoothers_lowess import lowess\n",
    "\n",
    "    # Sort the data for local regression\n",
    "    sorted_indices = np.argsort(test[feature_name])\n",
    "    x_sorted = test[feature_name].iloc[sorted_indices]\n",
    "    y_sorted = attention_df[feature_name].iloc[sorted_indices]\n",
    "\n",
    "    # Fit local regression\n",
    "    lowess_fit = lowess(y_sorted, x_sorted, frac=alpha, it=3)\n",
    "\n",
    "    # Plot the local regression fit\n",
    "    plt.plot(\n",
    "        lowess_fit[:, 0],\n",
    "        lowess_fit[:, 1],\n",
    "        color=\"lightgreen\",\n",
    "        label=\"local regression fit\",\n",
    "    )\n",
    "\n",
    "    # Customize the plot\n",
    "    plt.title(f\"attention weights: {feature_name}\", fontsize=14)\n",
    "    plt.xlabel(feature_name, fontsize=12)\n",
    "    plt.ylabel(\"attention weights\", fontsize=12)\n",
    "    plt.legend(loc=\"lower right\")\n",
    "\n",
    "    # Set y-axis limits\n",
    "    # ylim0 = max(abs(attention_df[feature_name]))\n",
    "    ylim0 = np.max(np.abs(attention_df))\n",
    "    plt.ylim(-ylim0, ylim0)\n",
    "\n",
    "    plt.tight_layout()\n",
    "    plt.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# This is the purely random variable not impacting the response\n",
    "# we perform a local regression which is a bit time consuming\n",
    "plot_attention_weights(\"RandN\", 0.3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# BonusMalus is the most significant term\n",
    "# the local regression is not fully sensible because BonusMalus values cluster at the lowest level\n",
    "plot_attention_weights(\"BonusMalus\", 0.6)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Driver age variabel\n",
    "plot_attention_weights(\"DrivAge\", 0.3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "col1 = \"DrivAge\"\n",
    "col2 = \"BonusMalus\"\n",
    "y = attention_df[col1]\n",
    "mask = rng.choice(range(attention_df.shape[0]), size=5000, replace=False)\n",
    "x = test[col1]\n",
    "color = test[col2]\n",
    "color_median = 60\n",
    "color_binary = (color >= color_median).astype(int)\n",
    "q1, q2, q3 = 51, 65, 85\n",
    "color_bins = pd.cut(\n",
    "    color, bins=[-np.inf, q1, q2, q3, np.inf], labels=[0, 1, 2, 3], include_lowest=True\n",
    ").astype(int)\n",
    "fig, ax = plt.subplots(figsize=(8, 6))\n",
    "# ax.set_title(col)\n",
    "scatter = ax.scatter(\n",
    "    x.values[mask],\n",
    "    y.values[mask],\n",
    "    s=1,\n",
    "    c=color_bins.values[mask],\n",
    "    cmap=\"RdYlBu\",\n",
    "    alpha=0.8,\n",
    ")\n",
    "cbar = plt.colorbar(scatter, ax=ax)\n",
    "cbar.set_ticks([0, 1, 2, 3])\n",
    "cbar.set_ticklabels(\n",
    "    [\n",
    "        f\"< {q1:.1f}\",\n",
    "        f\"{q1:.1f}-{q2:.1f}\",\n",
    "        f\"{q2:.1f}-{q3:.1f}\",\n",
    "        f\">= {q3:.1f}\",\n",
    "    ]\n",
    ")\n",
    "cbar.set_label(f\"{col2}\")\n",
    "ax.set_ylabel(\"$\\\\beta(\\\\boldsymbol{x})$\")\n",
    "ax.set_xlabel(col1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Gradient of interactions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch.autograd as autograd\n",
    "\n",
    "n, p = X_test.shape\n",
    "gradients = np.empty((p, n, p))\n",
    "input_tensor = X_test\n",
    "\n",
    "input_tensor.requires_grad = True\n",
    "attentions = model(X_test, v_test, get_attentions=True)\n",
    "for i in range(p):\n",
    "    grad_scaling = torch.ones_like(attentions[:, i])\n",
    "    gradient_i = autograd.grad(\n",
    "        attentions[:, i], input_tensor, grad_scaling, create_graph=True\n",
    "    )\n",
    "    gradients[i, :, :] = gradient_i[0].numpy(force=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# plot the gradient of the attention for the variable `DrivAge`"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Ensemble"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Train the same neural network using different random seeds, which impact the initial weights of the hidden layers."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "seeds = [\n",
    "    1752305036,\n",
    "    4284935567,\n",
    "    909886011,\n",
    "    4253642063,\n",
    "    3875387572,\n",
    "    2984734056,\n",
    "    56601707,\n",
    "    803726624,\n",
    "    215740934,\n",
    "    1236640324,\n",
    "]\n",
    "for seed in seeds:\n",
    "    model = None  # replace None with your code\n",
    "    optimizer = torch.optim.NAdam(model.parameters())\n",
    "    checkpoint_path = f\"./Networks/LocalGLMnet_{seed}.pt\"\n",
    "    history = None  # replace None with your code\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Get the derivatives of the attentions with respect to the input features for each trained network, then take the average over all trained networks."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "n, p = X_test.shape\n",
    "gradients = np.empty((10, p, n, p))\n",
    "input_tensor = X_test\n",
    "input_tensor.requires_grad = True\n",
    "for k, seed in enumerate(seeds):\n",
    "    checkpoint_path = f\"./Networks/LocalGLMnet_{seed}.pt\"\n",
    "    model.load_state_dict(torch.load(checkpoint_path))\n",
    "    attentions = model(X_test, v_test, get_attentions=True)\n",
    "    for i in range(p):\n",
    "        grad_scaling = torch.ones_like(attentions[:, i])\n",
    "        gradient_i = autograd.grad(\n",
    "            attentions[:, i], input_tensor, grad_scaling, create_graph=True\n",
    "        )\n",
    "        gradients[k, i, :, :] = gradient_i[0].numpy(force=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# plot the gradient of the attention for the variable `DrivAge`, now based on the ensemble model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# plot the derivative of the attention for the variable `DrivAge` with respect to `BonusMalus`, showing the results of each model of the ensemble"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}