path: root/src/notebooks/00-testing-stuff-out.ipynb
Diffstat (limited to 'src/notebooks/00-testing-stuff-out.ipynb')
-rw-r--r--  src/notebooks/00-testing-stuff-out.ipynb  669
1 file changed, 523 insertions(+), 146 deletions(-)
diff --git a/src/notebooks/00-testing-stuff-out.ipynb b/src/notebooks/00-testing-stuff-out.ipynb
index 6f01dfb..dd02098 100644
--- a/src/notebooks/00-testing-stuff-out.ipynb
+++ b/src/notebooks/00-testing-stuff-out.ipynb
@@ -2,18 +2,9 @@
"cells": [
{
"cell_type": "code",
- "execution_count": 21,
+ "execution_count": 1,
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "The autoreload extension is already loaded. To reload it, use:\n",
- " %reload_ext autoreload\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"%load_ext autoreload\n",
"%autoreload 2\n",
@@ -22,6 +13,7 @@
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"from PIL import Image\n",
+ "import torch.nn.functional as F\n",
"import torch\n",
"from torch import nn\n",
"from importlib.util import find_spec\n",
@@ -32,74 +24,386 @@
},
{
"cell_type": "code",
- "execution_count": 5,
+ "execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
- "from text_recognizer.networks.residual_network import IdentityBlock, ResidualBlock, BasicBlock, BottleNeckBlock, ResidualLayer, ResidualNetwork, ResidualNetworkEncoder"
+ "from text_recognizer.networks import CTCTransformer"
]
},
{
"cell_type": "code",
- "execution_count": 22,
+ "execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
- "from text_recognizer.networks import WideResidualNetwork"
+ "model = CTCTransformer(\n",
+ " num_encoder_layers=2,\n",
+ " hidden_dim=256,\n",
+ " vocab_size=56,\n",
+ " num_heads=8,\n",
+ " adaptive_pool_dim=[None, 1],\n",
+ " expansion_dim=2048,\n",
+ " dropout_rate=0.1,\n",
+ " max_len=256,\n",
+ " patch_size=(28, 32),\n",
+ " stride=(1, 28),\n",
+ " activation=\"gelu\",\n",
+ " backbone=\"WideResidualNetwork\",\n",
+ "backbone_args={\n",
+ " \"in_channels\": 1,\n",
+ " \"in_planes\": 64,\n",
+ " \"num_classes\": 80,\n",
+ " \"depth\": 10,\n",
+ " \"width_factor\": 1,\n",
+ " \"dropout_rate\": 0.1,\n",
+ " \"num_layers\": 4,\n",
+ " \"num_stages\": [64, 128, 256, 256],\n",
+ " \"activation\": \"elu\",\n",
+ " \"use_decoder\": False,\n",
+ "},\n",
+ " )"
]
},
{
"cell_type": "code",
- "execution_count": 4,
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
"metadata": {},
"outputs": [],
"source": [
- "from pathlib import Path"
+ "backbone: WideResidualNetwork\n",
+ " backbone_args:\n",
+ " in_channels: 1\n",
+ " in_planes: 64\n",
+ " num_classes: 80\n",
+ " depth: 10\n",
+ " width_factor: 1\n",
+ " dropout_rate: 0.1\n",
+ " num_layers: 4 \n",
+ " num_stages: [64, 128, 256, 256]\n",
+ " activation: elu\n",
+ " use_decoder: false\n",
+ " n"
]
},
{
"cell_type": "code",
- "execution_count": 5,
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "t = torch.randn(2, 1, 28, 952)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "True"
+ "torch.Size([119, 2, 56])"
]
},
- "execution_count": 5,
+ "execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "Path(\"/home/akternurra/Documents/projects/quest-for-general-artifical-intelligence/projects/text-recognizer/src/training/experiments/TransformerModel_EmnistLinesDataset_CNNTransformer/1112_081300/model/best.pt\").exists()"
+ "model(t).shape"
]
},
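+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Hedged sketch: the (T, B, C) = (119, 2, 56) output above matches the\n",
+ "# log-prob layout F.ctc_loss/nn.CTCLoss expects. The targets, lengths,\n",
+ "# and blank index below are made-up placeholders, not the real ones.\n",
+ "log_probs = F.log_softmax(model(t), dim=-1)\n",
+ "targets = torch.randint(1, 56, (2, 50))\n",
+ "input_lengths = torch.full((2,), 119, dtype=torch.long)\n",
+ "target_lengths = torch.full((2,), 50, dtype=torch.long)\n",
+ "F.ctc_loss(log_probs, targets, input_lengths, target_lengths, blank=0)"
+ ]
+ },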
{
"cell_type": "code",
- "execution_count": 6,
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "ename": "RuntimeError",
+ "evalue": "Failed to run torchsummary. See above stack traces for more details. Executed layers up to: [WideResidualNetwork: 1-1, Sequential: 2-1, Conv2d: 3-1, Sequential: 3-2, WideBlock: 4-1, Sequential: 3-3, WideBlock: 4-2, Sequential: 3-4, WideBlock: 4-3, Sequential: 3-5, WideBlock: 4-4, AdaptiveAvgPool2d: 1-2, Encoder: 1-3, EncoderLayer: 3-6, MultiHeadAttention: 4-5, _IntraLayerConnection: 4-6, _ConvolutionalLayer: 4-7, _IntraLayerConnection: 4-8, EncoderLayer: 3-7, MultiHeadAttention: 4-9, _IntraLayerConnection: 4-10, _ConvolutionalLayer: 4-11, _IntraLayerConnection: 4-12, LayerNorm: 2-2, Linear: 2-3, GLU: 2-4]",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[0;31m----------------------------------------------------------------\u001b[0m",
+ "\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)",
+ "\u001b[0;32m~/.pyenv/versions/3.8.2/envs/text-recognizer/lib/python3.8/site-packages/torchsummary/torchsummary.py\u001b[0m in \u001b[0;36msummary\u001b[0;34m(model, input_data, batch_dim, branching, col_names, col_width, depth, device, dtypes, verbose, *args, **kwargs)\u001b[0m\n\u001b[1;32m 123\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mno_grad\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 124\u001b[0;31m \u001b[0m_\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdevice\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;31m# type: ignore[misc]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 125\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mException\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m~/.pyenv/versions/3.8.2/envs/text-recognizer/lib/python3.8/site-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m_call_impl\u001b[0;34m(self, *input, **kwargs)\u001b[0m\n\u001b[1;32m 726\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 727\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mforward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 728\u001b[0m for hook in itertools.chain(\n",
+ "\u001b[0;32m~/Documents/projects/quest-for-general-artifical-intelligence/projects/text-recognizer/src/text_recognizer/networks/ctc_transformer.py\u001b[0m in \u001b[0;36mforward\u001b[0;34m(self, x, trg)\u001b[0m\n\u001b[1;32m 109\u001b[0m \u001b[0mcontext\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcontext_representation\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mimage_features\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 110\u001b[0;31m \u001b[0mlogits\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhead\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcontext\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 111\u001b[0m \u001b[0mlogits\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mrearrange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlogits\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"b t y -> t b y\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m~/.pyenv/versions/3.8.2/envs/text-recognizer/lib/python3.8/site-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m_call_impl\u001b[0;34m(self, *input, **kwargs)\u001b[0m\n\u001b[1;32m 726\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 727\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mforward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 728\u001b[0m for hook in itertools.chain(\n",
+ "\u001b[0;32m~/.pyenv/versions/3.8.2/envs/text-recognizer/lib/python3.8/site-packages/torch/nn/modules/container.py\u001b[0m in \u001b[0;36mforward\u001b[0;34m(self, input)\u001b[0m\n\u001b[1;32m 116\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mmodule\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 117\u001b[0;31m \u001b[0minput\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmodule\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0minput\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 118\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0minput\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m~/.pyenv/versions/3.8.2/envs/text-recognizer/lib/python3.8/site-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m_call_impl\u001b[0;34m(self, *input, **kwargs)\u001b[0m\n\u001b[1;32m 726\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 727\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mforward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 728\u001b[0m for hook in itertools.chain(\n",
+ "\u001b[0;32m~/.pyenv/versions/3.8.2/envs/text-recognizer/lib/python3.8/site-packages/torch/nn/modules/linear.py\u001b[0m in \u001b[0;36mforward\u001b[0;34m(self, input)\u001b[0m\n\u001b[1;32m 92\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mforward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minput\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mTensor\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0mTensor\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 93\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mF\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlinear\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mweight\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbias\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 94\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m~/.pyenv/versions/3.8.2/envs/text-recognizer/lib/python3.8/site-packages/torch/nn/functional.py\u001b[0m in \u001b[0;36mlinear\u001b[0;34m(input, weight, bias)\u001b[0m\n\u001b[1;32m 1691\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1692\u001b[0;31m \u001b[0moutput\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0minput\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmatmul\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mweight\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mt\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1693\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mbias\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;31mRuntimeError\u001b[0m: mat1 and mat2 shapes cannot be multiplied (238x128 and 256x56)",
+ "\nThe above exception was the direct cause of the following exception:\n",
+ "\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)",
+ "\u001b[0;32m<ipython-input-8-85c5209ae40a>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0msummary\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m28\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m952\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdevice\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"cpu\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdepth\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m3\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
+ "\u001b[0;32m~/.pyenv/versions/3.8.2/envs/text-recognizer/lib/python3.8/site-packages/torchsummary/torchsummary.py\u001b[0m in \u001b[0;36msummary\u001b[0;34m(model, input_data, batch_dim, branching, col_names, col_width, depth, device, dtypes, verbose, *args, **kwargs)\u001b[0m\n\u001b[1;32m 125\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mException\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 126\u001b[0m \u001b[0mexecuted_layers\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mlayer\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mlayer\u001b[0m \u001b[0;32min\u001b[0m \u001b[0msummary_list\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mlayer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexecuted\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 127\u001b[0;31m raise RuntimeError(\n\u001b[0m\u001b[1;32m 128\u001b[0m \u001b[0;34m\"Failed to run torchsummary. See above stack traces for more details. \"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 129\u001b[0m \u001b[0;34m\"Executed layers up to: {}\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mexecuted_layers\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;31mRuntimeError\u001b[0m: Failed to run torchsummary. See above stack traces for more details. Executed layers up to: [WideResidualNetwork: 1-1, Sequential: 2-1, Conv2d: 3-1, Sequential: 3-2, WideBlock: 4-1, Sequential: 3-3, WideBlock: 4-2, Sequential: 3-4, WideBlock: 4-3, Sequential: 3-5, WideBlock: 4-4, AdaptiveAvgPool2d: 1-2, Encoder: 1-3, EncoderLayer: 3-6, MultiHeadAttention: 4-5, _IntraLayerConnection: 4-6, _ConvolutionalLayer: 4-7, _IntraLayerConnection: 4-8, EncoderLayer: 3-7, MultiHeadAttention: 4-9, _IntraLayerConnection: 4-10, _ConvolutionalLayer: 4-11, _IntraLayerConnection: 4-12, LayerNorm: 2-2, Linear: 2-3, GLU: 2-4]"
+ ]
+ }
+ ],
+ "source": [
+ "summary(model, (1, 28, 952), device=\"cpu\", depth=3)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "class GEGLU(nn.Module):\n",
+ " def __init__(self, dim_in, dim_out):\n",
+ " super().__init__()\n",
+ " self.proj = nn.Linear(dim_in, dim_out * 2)\n",
+ "\n",
+ " def forward(self, x):\n",
+ " x, gate = self.proj(x).chunk(2, dim = -1)\n",
+ " return x * F.gelu(gate)"
+ ]
+ },
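+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# For comparison (a sanity check only, not part of the model): nn.GLU is\n",
+ "# the sigmoid-gated analogue of GEGLU. It halves its input's last dim, so\n",
+ "# Linear(256, 2 * 2048) followed by GLU matches GEGLU(256, 2048)'s shapes.\n",
+ "glu_ff = nn.Sequential(nn.Linear(256, 2 * 2048), nn.GLU(dim=-1))\n",
+ "glu_ff(torch.randn(1, 30, 256)).shape"
+ ]
+ },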
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "e = GEGLU(256, 2048)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "torch.Size([1, 30, 2048])"
+ ]
+ },
+ "execution_count": 17,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "e(t).shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 34,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "emb = nn.Embedding(56, 256)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 41,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "with torch.no_grad():\n",
+ " e = emb(torch.Tensor([55]).long())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 43,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from einops import repeat"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 46,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ee = repeat(e, \"() n -> b n\", b=16)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 58,
+ "metadata": {},
+ "outputs": [
+ {
+ "ename": "ModuleAttributeError",
+ "evalue": "'Embedding' object has no attribute 'device'",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[0;31mModuleAttributeError\u001b[0m Traceback (most recent call last)",
+ "\u001b[0;32m<ipython-input-58-657f11e4a017>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0memb\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdevice\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
+ "\u001b[0;32m~/.cache/pypoetry/virtualenvs/text-recognizer-N1c_zsdp-py3.8/lib/python3.8/site-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m__getattr__\u001b[0;34m(self, name)\u001b[0m\n\u001b[1;32m 776\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mname\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mmodules\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 777\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mmodules\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 778\u001b[0;31m raise ModuleAttributeError(\"'{}' object has no attribute '{}'\".format(\n\u001b[0m\u001b[1;32m 779\u001b[0m type(self).__name__, name))\n\u001b[1;32m 780\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;31mModuleAttributeError\u001b[0m: 'Embedding' object has no attribute 'device'"
+ ]
+ }
+ ],
+ "source": [
+ "emb.device"
+ ]
+ },
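+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# nn.Module has no .device attribute (hence the error above); the usual\n",
+ "# workaround is to read the device off one of the module's parameters.\n",
+ "next(emb.parameters()).device"
+ ]
+ },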
+ {
+ "cell_type": "code",
+ "execution_count": 49,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "tensor([[-1.0624, 0.0674, 0.9387, ..., -0.1852, -0.1303, 0.8005],\n",
+ " [-1.0624, 0.0674, 0.9387, ..., -0.1852, -0.1303, 0.8005],\n",
+ " [-1.0624, 0.0674, 0.9387, ..., -0.1852, -0.1303, 0.8005],\n",
+ " ...,\n",
+ " [-1.0624, 0.0674, 0.9387, ..., -0.1852, -0.1303, 0.8005],\n",
+ " [-1.0624, 0.0674, 0.9387, ..., -0.1852, -0.1303, 0.8005],\n",
+ " [-1.0624, 0.0674, 0.9387, ..., -0.1852, -0.1303, 0.8005]])"
+ ]
+ },
+ "execution_count": 49,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "ee"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 47,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "torch.Size([16, 256])"
+ ]
+ },
+ "execution_count": 47,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "ee.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 50,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "t = torch.randn(16, 10, 256)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 52,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "torch.Size([16, 10, 256])"
+ ]
+ },
+ "execution_count": 52,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "t.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 56,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "t = torch.cat((ee.unsqueeze(1), t, ee.unsqueeze(1)), dim=1)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 57,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "torch.Size([16, 12, 256])"
+ ]
+ },
+ "execution_count": 57,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "t.shape"
+ ]
+ },
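+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Minimal sketch of the pattern above as a reusable helper: wrap a batch\n",
+ "# of embedded sequences with a learned special token (index 55 here) on\n",
+ "# both ends, taking (B, T, D) to (B, T + 2, D). Names are illustrative.\n",
+ "def add_boundary_tokens(x, embedding, token_index=55):\n",
+ "    tok = embedding(torch.tensor([token_index], device=x.device))\n",
+ "    tok = repeat(tok, \"() d -> b () d\", b=x.shape[0])\n",
+ "    return torch.cat((tok, x, tok), dim=1)\n",
+ "\n",
+ "add_boundary_tokens(torch.randn(16, 10, 256), emb).shape"
+ ]
+ },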
+ {
+ "cell_type": "code",
+ "execution_count": 42,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "False"
+ "torch.Size([1, 256])"
]
},
- "execution_count": 6,
+ "execution_count": 42,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "Path(\"/home/akternurra/Documents/projects/quest-for-general-artifical-intelligence/projects/text-recognizer/src/training/experiments/TransformerModel_EmnistLinesDataset_CNNTransformer/1112_201649/model/best.pt\").exists()"
+ "e.shape"
]
},
{
"cell_type": "code",
- "execution_count": 23,
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from text_recognizer.networks.residual_network import IdentityBlock, ResidualBlock, BasicBlock, BottleNeckBlock, ResidualLayer, ResidualNetwork, ResidualNetworkEncoder"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from text_recognizer.networks import WideResidualNetwork"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
@@ -109,16 +413,17 @@
" in_planes=64,\n",
" depth=10,\n",
" num_layers=4,\n",
- " width_factor=1,\n",
- " dropout_rate= 0.2,\n",
+ " width_factor=2,\n",
+ " num_stages=[64, 128, 256, 256],\n",
+ " dropout_rate= 0.1,\n",
" activation= \"SELU\",\n",
- " use_decoder= True,\n",
+ " use_decoder= False,\n",
")"
]
},
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
@@ -127,16 +432,16 @@
},
{
"cell_type": "code",
- "execution_count": 19,
+ "execution_count": 66,
"metadata": {},
"outputs": [],
"source": [
- "backbone = ResidualNetworkEncoder(1, [64, 128, 256], [2, 2, 3])"
+ "backbone = ResidualNetworkEncoder(1, [64, 65, 66, 67, 68], [2, 2, 2, 2, 2])"
]
},
{
"cell_type": "code",
- "execution_count": 20,
+ "execution_count": 52,
"metadata": {},
"outputs": [
{
@@ -146,27 +451,31 @@
"==========================================================================================\n",
"Layer (type:depth-idx) Output Shape Param #\n",
"==========================================================================================\n",
- "├─Sequential: 1-1 [-1, 64, 28, 952] --\n",
- "| └─Conv2d: 2-1 [-1, 64, 28, 952] 576\n",
- "| └─BatchNorm2d: 2-2 [-1, 64, 28, 952] 128\n",
- "| └─ReLU: 2-3 [-1, 64, 28, 952] --\n",
- "├─Sequential: 1-2 [-1, 256, 7, 238] --\n",
- "| └─ResidualLayer: 2-4 [-1, 64, 28, 952] --\n",
- "| | └─Sequential: 3-1 [-1, 64, 28, 952] 147,968\n",
- "| └─ResidualLayer: 2-5 [-1, 128, 14, 476] --\n",
- "| | └─Sequential: 3-2 [-1, 128, 14, 476] 525,568\n",
- "| └─ResidualLayer: 2-6 [-1, 256, 7, 238] --\n",
- "| | └─Sequential: 3-3 [-1, 256, 7, 238] 3,280,384\n",
+ "├─Sequential: 1-1 [-1, 64, 12, 474] --\n",
+ "| └─Conv2d: 2-1 [-1, 64, 12, 474] 3,136\n",
+ "| └─BatchNorm2d: 2-2 [-1, 64, 12, 474] 128\n",
+ "| └─ReLU: 2-3 [-1, 64, 12, 474] --\n",
+ "├─Sequential: 1-2 [-1, 68, 1, 30] --\n",
+ "| └─ResidualLayer: 2-4 [-1, 64, 12, 474] --\n",
+ "| | └─Sequential: 3-1 [-1, 64, 12, 474] 147,968\n",
+ "| └─ResidualLayer: 2-5 [-1, 65, 6, 237] --\n",
+ "| | └─Sequential: 3-2 [-1, 65, 6, 237] 156,325\n",
+ "| └─ResidualLayer: 2-6 [-1, 66, 3, 119] --\n",
+ "| | └─Sequential: 3-3 [-1, 66, 3, 119] 161,172\n",
+ "| └─ResidualLayer: 2-7 [-1, 67, 2, 60] --\n",
+ "| | └─Sequential: 3-4 [-1, 67, 2, 60] 166,093\n",
+ "| └─ResidualLayer: 2-8 [-1, 68, 1, 30] --\n",
+ "| | └─Sequential: 3-5 [-1, 68, 1, 30] 171,088\n",
"==========================================================================================\n",
- "Total params: 3,954,624\n",
- "Trainable params: 3,954,624\n",
+ "Total params: 805,910\n",
+ "Trainable params: 805,910\n",
"Non-trainable params: 0\n",
- "Total mult-adds (M): 31.16\n",
+ "Total mult-adds (M): 21.05\n",
"==========================================================================================\n",
"Input size (MB): 0.10\n",
- "Forward/backward pass size (MB): 26.03\n",
- "Params size (MB): 15.09\n",
- "Estimated Total Size (MB): 41.22\n",
+ "Forward/backward pass size (MB): 5.55\n",
+ "Params size (MB): 3.07\n",
+ "Estimated Total Size (MB): 8.73\n",
"==========================================================================================\n"
]
},
@@ -176,31 +485,35 @@
"==========================================================================================\n",
"Layer (type:depth-idx) Output Shape Param #\n",
"==========================================================================================\n",
- "├─Sequential: 1-1 [-1, 64, 28, 952] --\n",
- "| └─Conv2d: 2-1 [-1, 64, 28, 952] 576\n",
- "| └─BatchNorm2d: 2-2 [-1, 64, 28, 952] 128\n",
- "| └─ReLU: 2-3 [-1, 64, 28, 952] --\n",
- "├─Sequential: 1-2 [-1, 256, 7, 238] --\n",
- "| └─ResidualLayer: 2-4 [-1, 64, 28, 952] --\n",
- "| | └─Sequential: 3-1 [-1, 64, 28, 952] 147,968\n",
- "| └─ResidualLayer: 2-5 [-1, 128, 14, 476] --\n",
- "| | └─Sequential: 3-2 [-1, 128, 14, 476] 525,568\n",
- "| └─ResidualLayer: 2-6 [-1, 256, 7, 238] --\n",
- "| | └─Sequential: 3-3 [-1, 256, 7, 238] 3,280,384\n",
+ "├─Sequential: 1-1 [-1, 64, 12, 474] --\n",
+ "| └─Conv2d: 2-1 [-1, 64, 12, 474] 3,136\n",
+ "| └─BatchNorm2d: 2-2 [-1, 64, 12, 474] 128\n",
+ "| └─ReLU: 2-3 [-1, 64, 12, 474] --\n",
+ "├─Sequential: 1-2 [-1, 68, 1, 30] --\n",
+ "| └─ResidualLayer: 2-4 [-1, 64, 12, 474] --\n",
+ "| | └─Sequential: 3-1 [-1, 64, 12, 474] 147,968\n",
+ "| └─ResidualLayer: 2-5 [-1, 65, 6, 237] --\n",
+ "| | └─Sequential: 3-2 [-1, 65, 6, 237] 156,325\n",
+ "| └─ResidualLayer: 2-6 [-1, 66, 3, 119] --\n",
+ "| | └─Sequential: 3-3 [-1, 66, 3, 119] 161,172\n",
+ "| └─ResidualLayer: 2-7 [-1, 67, 2, 60] --\n",
+ "| | └─Sequential: 3-4 [-1, 67, 2, 60] 166,093\n",
+ "| └─ResidualLayer: 2-8 [-1, 68, 1, 30] --\n",
+ "| | └─Sequential: 3-5 [-1, 68, 1, 30] 171,088\n",
"==========================================================================================\n",
- "Total params: 3,954,624\n",
- "Trainable params: 3,954,624\n",
+ "Total params: 805,910\n",
+ "Trainable params: 805,910\n",
"Non-trainable params: 0\n",
- "Total mult-adds (M): 31.16\n",
+ "Total mult-adds (M): 21.05\n",
"==========================================================================================\n",
"Input size (MB): 0.10\n",
- "Forward/backward pass size (MB): 26.03\n",
- "Params size (MB): 15.09\n",
- "Estimated Total Size (MB): 41.22\n",
+ "Forward/backward pass size (MB): 5.55\n",
+ "Params size (MB): 3.07\n",
+ "Estimated Total Size (MB): 8.73\n",
"=========================================================================================="
]
},
- "execution_count": 20,
+ "execution_count": 52,
"metadata": {},
"output_type": "execute_result"
}
@@ -211,7 +524,7 @@
},
{
"cell_type": "code",
- "execution_count": 24,
+ "execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
@@ -222,7 +535,7 @@
},
{
"cell_type": "code",
- "execution_count": 25,
+ "execution_count": 10,
"metadata": {},
"outputs": [
{
@@ -231,29 +544,34 @@
"Sequential(\n",
" (0): SELU(inplace=True)\n",
" (1): Sequential(\n",
- " (0): Conv2d(1, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
- " (1): Sequential(\n",
+ " (0): Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(1, 1), bias=False)\n",
+ " (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
+ " (2): SELU(inplace=True)\n",
+ " (3): MaxPool2d(kernel_size=(2, 4), stride=2, padding=1, dilation=1, ceil_mode=False)\n",
+ " )\n",
+ " (2): Sequential(\n",
+ " (0): Sequential(\n",
" (0): WideBlock(\n",
" (activation): SELU(inplace=True)\n",
" (blocks): Sequential(\n",
" (0): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
" (1): SELU(inplace=True)\n",
" (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
- " (3): Dropout(p=0.2, inplace=False)\n",
+ " (3): Dropout(p=0.1, inplace=False)\n",
" (4): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
" (5): SELU(inplace=True)\n",
" (6): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
" )\n",
" )\n",
" )\n",
- " (2): Sequential(\n",
+ " (1): Sequential(\n",
" (0): WideBlock(\n",
" (activation): SELU(inplace=True)\n",
" (blocks): Sequential(\n",
" (0): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
" (1): SELU(inplace=True)\n",
" (2): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
- " (3): Dropout(p=0.2, inplace=False)\n",
+ " (3): Dropout(p=0.1, inplace=False)\n",
" (4): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
" (5): SELU(inplace=True)\n",
" (6): Conv2d(128, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)\n",
@@ -263,14 +581,14 @@
" )\n",
" )\n",
" )\n",
- " (3): Sequential(\n",
+ " (2): Sequential(\n",
" (0): WideBlock(\n",
" (activation): SELU(inplace=True)\n",
" (blocks): Sequential(\n",
" (0): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
" (1): SELU(inplace=True)\n",
" (2): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
- " (3): Dropout(p=0.2, inplace=False)\n",
+ " (3): Dropout(p=0.1, inplace=False)\n",
" (4): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
" (5): SELU(inplace=True)\n",
" (6): Conv2d(256, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)\n",
@@ -280,34 +598,28 @@
" )\n",
" )\n",
" )\n",
- " (4): Sequential(\n",
+ " (3): Sequential(\n",
" (0): WideBlock(\n",
" (activation): SELU(inplace=True)\n",
" (blocks): Sequential(\n",
" (0): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
" (1): SELU(inplace=True)\n",
- " (2): Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
- " (3): Dropout(p=0.2, inplace=False)\n",
- " (4): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
+ " (2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
+ " (3): Dropout(p=0.1, inplace=False)\n",
+ " (4): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
" (5): SELU(inplace=True)\n",
- " (6): Conv2d(512, 512, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)\n",
+ " (6): Conv2d(256, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)\n",
" )\n",
" (shortcut): Sequential(\n",
- " (0): Conv2d(256, 512, kernel_size=(1, 1), stride=(2, 2), bias=False)\n",
+ " (0): Conv2d(256, 256, kernel_size=(1, 1), stride=(2, 2), bias=False)\n",
" )\n",
" )\n",
" )\n",
" )\n",
- " (2): Sequential(\n",
- " (0): BatchNorm2d(512, eps=1e-05, momentum=0.8, affine=True, track_running_stats=True)\n",
- " (1): SELU(inplace=True)\n",
- " (2): Reduce('b c h w -> b c', 'mean')\n",
- " (3): Linear(in_features=512, out_features=80, bias=True)\n",
- " )\n",
")"
]
},
- "execution_count": 25,
+ "execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
@@ -318,7 +630,7 @@
},
{
"cell_type": "code",
- "execution_count": 26,
+ "execution_count": 11,
"metadata": {},
"outputs": [
{
@@ -328,36 +640,32 @@
"==========================================================================================\n",
"Layer (type:depth-idx) Output Shape Param #\n",
"==========================================================================================\n",
- "├─SELU: 1-1 [-1, 1, 28, 952] --\n",
- "├─Sequential: 1 [] --\n",
- "| └─SELU: 2-1 [-1, 1, 28, 952] --\n",
- "├─Sequential: 1-2 [-1, 512, 4, 119] --\n",
- "| └─Conv2d: 2-2 [-1, 64, 28, 952] 576\n",
- "| └─Sequential: 2-3 [-1, 64, 28, 952] --\n",
- "| | └─WideBlock: 3-1 [-1, 64, 28, 952] 73,984\n",
- "| └─Sequential: 2-4 [-1, 128, 14, 476] --\n",
- "| | └─WideBlock: 3-2 [-1, 128, 14, 476] 229,760\n",
- "| └─Sequential: 2-5 [-1, 256, 7, 238] --\n",
- "| | └─WideBlock: 3-3 [-1, 256, 7, 238] 918,272\n",
- "| └─Sequential: 2-6 [-1, 512, 4, 119] --\n",
- "| | └─WideBlock: 3-4 [-1, 512, 4, 119] 3,671,552\n",
- "├─Sequential: 1-3 [-1, 80] --\n",
- "| └─BatchNorm2d: 2-7 [-1, 512, 4, 119] 1,024\n",
- "├─SELU: 1-4 [-1, 512, 4, 119] --\n",
+ "├─Sequential: 1-1 [-1, 64, 7, 237] --\n",
+ "| └─Conv2d: 2-1 [-1, 64, 12, 474] 3,136\n",
+ "| └─BatchNorm2d: 2-2 [-1, 64, 12, 474] 128\n",
+ "├─SELU: 1-2 [-1, 64, 12, 474] --\n",
"├─Sequential: 1 [] --\n",
- "| └─SELU: 2-8 [-1, 512, 4, 119] --\n",
- "| └─Reduce: 2-9 [-1, 512] --\n",
- "| └─Linear: 2-10 [-1, 80] 41,040\n",
+ "| └─SELU: 2-3 [-1, 64, 12, 474] --\n",
+ "| └─MaxPool2d: 2-4 [-1, 64, 7, 237] --\n",
+ "├─Sequential: 1-3 [-1, 256, 1, 30] --\n",
+ "| └─Sequential: 2-5 [-1, 64, 7, 237] --\n",
+ "| | └─WideBlock: 3-1 [-1, 64, 7, 237] 73,984\n",
+ "| └─Sequential: 2-6 [-1, 128, 4, 119] --\n",
+ "| | └─WideBlock: 3-2 [-1, 128, 4, 119] 229,760\n",
+ "| └─Sequential: 2-7 [-1, 256, 2, 60] --\n",
+ "| | └─WideBlock: 3-3 [-1, 256, 2, 60] 918,272\n",
+ "| └─Sequential: 2-8 [-1, 256, 1, 30] --\n",
+ "| | └─WideBlock: 3-4 [-1, 256, 1, 30] 1,246,208\n",
"==========================================================================================\n",
- "Total params: 4,936,208\n",
- "Trainable params: 4,936,208\n",
+ "Total params: 2,471,488\n",
+ "Trainable params: 2,471,488\n",
"Non-trainable params: 0\n",
- "Total mult-adds (M): 35.01\n",
+ "Total mult-adds (M): 27.71\n",
"==========================================================================================\n",
"Input size (MB): 0.10\n",
- "Forward/backward pass size (MB): 14.88\n",
- "Params size (MB): 18.83\n",
- "Estimated Total Size (MB): 33.81\n",
+ "Forward/backward pass size (MB): 5.55\n",
+ "Params size (MB): 9.43\n",
+ "Estimated Total Size (MB): 15.08\n",
"==========================================================================================\n"
]
},
@@ -367,51 +675,47 @@
"==========================================================================================\n",
"Layer (type:depth-idx) Output Shape Param #\n",
"==========================================================================================\n",
- "├─SELU: 1-1 [-1, 1, 28, 952] --\n",
+ "├─Sequential: 1-1 [-1, 64, 7, 237] --\n",
+ "| └─Conv2d: 2-1 [-1, 64, 12, 474] 3,136\n",
+ "| └─BatchNorm2d: 2-2 [-1, 64, 12, 474] 128\n",
+ "├─SELU: 1-2 [-1, 64, 12, 474] --\n",
"├─Sequential: 1 [] --\n",
- "| └─SELU: 2-1 [-1, 1, 28, 952] --\n",
- "├─Sequential: 1-2 [-1, 512, 4, 119] --\n",
- "| └─Conv2d: 2-2 [-1, 64, 28, 952] 576\n",
- "| └─Sequential: 2-3 [-1, 64, 28, 952] --\n",
- "| | └─WideBlock: 3-1 [-1, 64, 28, 952] 73,984\n",
- "| └─Sequential: 2-4 [-1, 128, 14, 476] --\n",
- "| | └─WideBlock: 3-2 [-1, 128, 14, 476] 229,760\n",
- "| └─Sequential: 2-5 [-1, 256, 7, 238] --\n",
- "| | └─WideBlock: 3-3 [-1, 256, 7, 238] 918,272\n",
- "| └─Sequential: 2-6 [-1, 512, 4, 119] --\n",
- "| | └─WideBlock: 3-4 [-1, 512, 4, 119] 3,671,552\n",
- "├─Sequential: 1-3 [-1, 80] --\n",
- "| └─BatchNorm2d: 2-7 [-1, 512, 4, 119] 1,024\n",
- "├─SELU: 1-4 [-1, 512, 4, 119] --\n",
- "├─Sequential: 1 [] --\n",
- "| └─SELU: 2-8 [-1, 512, 4, 119] --\n",
- "| └─Reduce: 2-9 [-1, 512] --\n",
- "| └─Linear: 2-10 [-1, 80] 41,040\n",
+ "| └─SELU: 2-3 [-1, 64, 12, 474] --\n",
+ "| └─MaxPool2d: 2-4 [-1, 64, 7, 237] --\n",
+ "├─Sequential: 1-3 [-1, 256, 1, 30] --\n",
+ "| └─Sequential: 2-5 [-1, 64, 7, 237] --\n",
+ "| | └─WideBlock: 3-1 [-1, 64, 7, 237] 73,984\n",
+ "| └─Sequential: 2-6 [-1, 128, 4, 119] --\n",
+ "| | └─WideBlock: 3-2 [-1, 128, 4, 119] 229,760\n",
+ "| └─Sequential: 2-7 [-1, 256, 2, 60] --\n",
+ "| | └─WideBlock: 3-3 [-1, 256, 2, 60] 918,272\n",
+ "| └─Sequential: 2-8 [-1, 256, 1, 30] --\n",
+ "| | └─WideBlock: 3-4 [-1, 256, 1, 30] 1,246,208\n",
"==========================================================================================\n",
- "Total params: 4,936,208\n",
- "Trainable params: 4,936,208\n",
+ "Total params: 2,471,488\n",
+ "Trainable params: 2,471,488\n",
"Non-trainable params: 0\n",
- "Total mult-adds (M): 35.01\n",
+ "Total mult-adds (M): 27.71\n",
"==========================================================================================\n",
"Input size (MB): 0.10\n",
- "Forward/backward pass size (MB): 14.88\n",
- "Params size (MB): 18.83\n",
- "Estimated Total Size (MB): 33.81\n",
+ "Forward/backward pass size (MB): 5.55\n",
+ "Params size (MB): 9.43\n",
+ "Estimated Total Size (MB): 15.08\n",
"=========================================================================================="
]
},
- "execution_count": 26,
+ "execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "summary(backbone, (1, 28, 952), device=\"cpu\", depth=3)"
+ "summary(wr, (1, 28, 952), device=\"cpu\", depth=3)"
]
},
{
"cell_type": "code",
- "execution_count": 27,
+ "execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
@@ -1131,16 +1435,89 @@
},
{
"cell_type": "code",
- "execution_count": 59,
+ "execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
- "pred = torch.Tensor([1,1,1,1,1, 81, 1, 79, 79, 79, 2,1,1,1,1, 81, 1, 79, 79, 79, 1,1,1,1,1, 81, 79, 79, 79, 79]).long()\n",
+ "pred = torch.Tensor([1,21,2,45,31, 81, 1, 79, 79, 79, 2,1,1,1,1, 81, 1, 79, 79, 79, 1,1,1,1,1, 81, 79, 79, 79, 79]).long()\n",
"target = torch.Tensor([1,1,1,1,1, 81, 79, 79, 79, 79, 1,1,1,1,1, 81, 79, 79, 79, 79, 1,1,1,1,1, 81, 79, 79, 79, 79]).long()"
]
},
{
"cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "mask = (target != 79)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "tensor([ True, True, True, True, True, True, False, False, False, False,\n",
+ " True, True, True, True, True, True, False, False, False, False,\n",
+ " True, True, True, True, True, True, False, False, False, False])"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "mask"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "tensor([ 1, 21, 2, 45, 31, 81, 0, 0, 0, 0, 2, 1, 1, 1, 1, 81, 0, 0,\n",
+ " 0, 0, 1, 1, 1, 1, 1, 81, 0, 0, 0, 0])"
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "pred * mask"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "tensor([ 1, 1, 1, 1, 1, 81, 0, 0, 0, 0, 1, 1, 1, 1, 1, 81, 0, 0,\n",
+ " 0, 0, 1, 1, 1, 1, 1, 81, 0, 0, 0, 0])"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "target * mask"
+ ]
+ },
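+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Putting the mask to use (a sketch, assuming 79 is the pad index): count\n",
+ "# agreement only over non-pad positions for a padding-free accuracy.\n",
+ "((pred == target) & mask).sum().float() / mask.sum()"
+ ]
+ },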
+ {
+ "cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [],