4 files changed, 407 insertions, 416 deletions
diff --git a/notebooks/04-convnext.ipynb b/notebooks/04-convnext.ipynb
index 5ab71c8..5affe85 100644
--- a/notebooks/04-convnext.ipynb
+++ b/notebooks/04-convnext.ipynb
@@ -2,19 +2,12 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 1,
    "id": "7c02ae76-b540-4b16-9492-e9210b3b9249",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "The autoreload extension is already loaded. To reload it, use:\n",
-      "  %reload_ext autoreload\n"
-     ]
-    }
-   ],
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
    "source": [
     "import os\n",
     "os.environ['CUDA_VISIBLE_DEVICE'] = ''\n",
@@ -37,9 +30,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 2,
    "id": "ccdb6dde-47e5-429a-88f2-0764fb7e259a",
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
    "outputs": [],
    "source": [
     "from hydra import compose, initialize\n",
@@ -49,9 +44,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 3,
    "id": "3cf50475-39f2-4642-a7d1-5bcbc0a036f7",
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
    "outputs": [],
    "source": [
     "path = \"../training/conf/network/convnext.yaml\""
@@ -59,9 +56,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 37,
+   "execution_count": 64,
    "id": "e52ecb01-c975-4e55-925d-1182c7aea473",
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
    "outputs": [],
    "source": [
     "with open(path, \"rb\") as f:\n",
@@ -70,17 +69,19 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 38,
+   "execution_count": 65,
    "id": "f939aa37-7b1d-45cc-885c-323c4540bda1",
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "{'_target_': 'text_recognizer.network.convnext.ConvNext', 'dim': 16, 'dim_mults': [2, 4, 8], 'depths': [3, 3, 6], 'downsampling_factors': [[2, 2], [2, 2], [2, 2]], 'attn': {'_target_': 'text_recognizer.network.convnext.TransformerBlock', 'attn': {'_target_': 'text_recognizer.network.convnext.Attention', 'dim': 128, 'heads': 4, 'dim_head': 64, 'scale': 8}, 'ff': {'_target_': 'text_recognizer.network.convnext.FeedForward', 'dim': 128, 'mult': 4}}}"
+       "{'_target_': 'text_recognizer.network.convnext.convnext.ConvNext', 'dim': 8, 'dim_mults': [2, 8], 'depths': [2, 2], 'attn': {'_target_': 'text_recognizer.network.convnext.transformer.Transformer', 'attn': {'_target_': 'text_recognizer.network.convnext.transformer.Attention', 'dim': 64, 'heads': 4, 'dim_head': 64, 'scale': 8}, 'ff': {'_target_': 'text_recognizer.network.convnext.transformer.FeedForward', 'dim': 64, 'mult': 4}}}"
       ]
      },
-     "execution_count": 38,
+     "execution_count": 65,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -91,21 +92,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 19,
-   "id": "a2b420c1",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "cfg.dim_mults = [2, 4, 8, 8]\n",
-    "cfg.depths = [3, 3, 6, 6]\n",
-    "cfg.downsampling_factors = [[2, 2], [2, 2], [2, 2], [2, 1]]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 39,
+   "execution_count": 66,
    "id": "c9589350",
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
    "outputs": [],
    "source": [
     "net = instantiate(cfg)"
@@ -113,9 +104,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 40,
+   "execution_count": 67,
    "id": "618b997c-e6a6-4487-b70c-9d260cb556d3",
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
    "outputs": [],
    "source": [
     "from torchinfo import summary"
@@ -123,11 +116,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 41,
+   "execution_count": 68,
    "id": "25759b7b-8deb-4163-b75d-a1357c9fe88f",
-   "metadata": {
-    "scrolled": false
-   },
+   "metadata": {},
    "outputs": [
     {
      "data": {
@@ -135,84 +126,31 @@
        "====================================================================================================\n",
        "Layer (type:depth-idx)                             Output Shape              Param #\n",
        "====================================================================================================\n",
-       "ConvNext                                           [2, 128, 72, 80]          165,408\n",
-       "├─TransformerBlock: 1-5                            [2, 128, 72, 80]          (recursive)\n",
-       "│    └─Attention: 2-6                              [2, 128, 72, 80]          (recursive)\n",
-       "│    │    └─LayerNorm: 3-13                        [2, 128, 72, 80]          (recursive)\n",
-       "├─Conv2d: 1-3                                      [2, 16, 576, 640]         (recursive)\n",
-       "├─TransformerBlock: 1                              --                        --\n",
-       "│    └─Attention: 2                                --                        --\n",
-       "│    │    └─Conv2d: 3-15                           [2, 128, 72, 80]          (recursive)\n",
-       "│    └─FeedForward: 2-7                            [2, 128, 72, 80]          (recursive)\n",
-       "│    │    └─Residual: 3-16                         [2, 128, 72, 80]          (recursive)\n",
-       "│    │    │    └─Sequential: 4-26                  [2, 128, 72, 80]          (recursive)\n",
-       "├─Conv2d: 1-3                                      [2, 16, 576, 640]         (recursive)\n",
-       "├─ModuleList: 1-4                                  --                        --\n",
-       "│    └─ModuleList: 2-3                             --                        --\n",
-       "│    │    └─ConvNextBlock: 3-4                     [2, 16, 576, 640]         --\n",
-       "│    │    │    └─Conv2d: 4-2                       [2, 16, 576, 640]         800\n",
-       "│    │    │    └─Sequential: 4-3                   [2, 16, 576, 640]         9,280\n",
-       "│    │    │    └─Identity: 4-4                     [2, 16, 576, 640]         --\n",
-       "│    │    └─ModuleList: 3-5                        --                        --\n",
-       "│    │    │    └─ConvNextBlock: 4-5                [2, 16, 576, 640]         10,080\n",
-       "│    │    │    └─ConvNextBlock: 4-6                [2, 16, 576, 640]         10,080\n",
-       "│    │    │    └─ConvNextBlock: 4-7                [2, 16, 576, 640]         10,080\n",
-       "│    │    └─Downsample: 3-6                        [2, 32, 288, 320]         --\n",
-       "│    │    │    └─Sequential: 4-8                   [2, 32, 288, 320]         2,080\n",
-       "│    └─ModuleList: 2-4                             --                        --\n",
-       "│    │    └─ConvNextBlock: 3-7                     [2, 32, 288, 320]         --\n",
-       "│    │    │    └─Conv2d: 4-9                       [2, 32, 288, 320]         1,600\n",
-       "│    │    │    └─Sequential: 4-10                  [2, 32, 288, 320]         36,992\n",
-       "│    │    │    └─Identity: 4-11                    [2, 32, 288, 320]         --\n",
-       "│    │    └─ModuleList: 3-8                        --                        --\n",
-       "│    │    │    └─ConvNextBlock: 4-12               [2, 32, 288, 320]         38,592\n",
-       "│    │    │    └─ConvNextBlock: 4-13               [2, 32, 288, 320]         38,592\n",
-       "│    │    │    └─ConvNextBlock: 4-14               [2, 32, 288, 320]         38,592\n",
-       "│    │    └─Downsample: 3-9                        [2, 64, 144, 160]         --\n",
-       "│    │    │    └─Sequential: 4-15                  [2, 64, 144, 160]         8,256\n",
-       "│    └─ModuleList: 2-5                             --                        --\n",
-       "│    │    └─ConvNextBlock: 3-10                    [2, 64, 144, 160]         --\n",
-       "│    │    │    └─Conv2d: 4-16                      [2, 64, 144, 160]         3,200\n",
-       "│    │    │    └─Sequential: 4-17                  [2, 64, 144, 160]         147,712\n",
-       "│    │    │    └─Identity: 4-18                    [2, 64, 144, 160]         --\n",
-       "│    │    └─ModuleList: 3-11                       --                        --\n",
-       "│    │    │    └─ConvNextBlock: 4-19               [2, 64, 144, 160]         150,912\n",
-       "│    │    │    └─ConvNextBlock: 4-20               [2, 64, 144, 160]         150,912\n",
-       "│    │    │    └─ConvNextBlock: 4-21               [2, 64, 144, 160]         150,912\n",
-       "│    │    │    └─ConvNextBlock: 4-22               [2, 64, 144, 160]         150,912\n",
-       "│    │    │    └─ConvNextBlock: 4-23               [2, 64, 144, 160]         150,912\n",
-       "│    │    │    └─ConvNextBlock: 4-24               [2, 64, 144, 160]         150,912\n",
-       "│    │    └─Downsample: 3-12                       [2, 128, 72, 80]          --\n",
-       "│    │    │    └─Sequential: 4-25                  [2, 128, 72, 80]          32,896\n",
-       "├─TransformerBlock: 1-5                            [2, 128, 72, 80]          (recursive)\n",
-       "│    └─Attention: 2-6                              [2, 128, 72, 80]          (recursive)\n",
-       "│    │    └─LayerNorm: 3-13                        [2, 128, 72, 80]          (recursive)\n",
-       "│    │    └─Conv2d: 3-14                           [2, 768, 72, 80]          98,304\n",
-       "│    │    └─Conv2d: 3-15                           [2, 128, 72, 80]          (recursive)\n",
-       "│    └─FeedForward: 2-7                            [2, 128, 72, 80]          (recursive)\n",
-       "│    │    └─Residual: 3-16                         [2, 128, 72, 80]          (recursive)\n",
-       "│    │    │    └─Sequential: 4-26                  [2, 128, 72, 80]          (recursive)\n",
-       "├─LayerNorm: 1-6                                   [2, 128, 72, 80]          128\n",
+       "ConvNext                                           [2, 64, 14, 256]          --\n",
+       "├─Conv2d: 1-1                                      [2, 8, 56, 1024]          400\n",
+       "├─ModuleList: 1-2                                  --                        30,512\n",
+       "├─Transformer: 1-3                                 [2, 64, 14, 256]          98,688\n",
+       "├─LayerNorm: 1-4                                   [2, 64, 14, 256]          64\n",
        "====================================================================================================\n",
-       "Total params: 1,558,144\n",
-       "Trainable params: 1,558,144\n",
+       "Total params: 129,664\n",
+       "Trainable params: 129,664\n",
        "Non-trainable params: 0\n",
-       "Total mult-adds (G): 114.00\n",
+       "Total mult-adds (G): 2.00\n",
        "====================================================================================================\n",
-       "Input size (MB): 2.95\n",
-       "Forward/backward pass size (MB): 3822.06\n",
-       "Params size (MB): 5.57\n",
-       "Estimated Total Size (MB): 3830.58\n",
+       "Input size (MB): 0.46\n",
+       "Forward/backward pass size (MB): 260.57\n",
+       "Params size (MB): 0.52\n",
+       "Estimated Total Size (MB): 261.55\n",
        "===================================================================================================="
       ]
      },
-     "execution_count": 41,
+     "execution_count": 68,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "summary(net, (2, 1, 576, 640), device=\"cpu\", depth=4)"
+    "summary(net, (2, 1, 56, 1024), device=\"cpu\", depth=1)"
    ]
   },
   {
diff --git a/notebooks/04-mammut-lines.ipynb b/notebooks/04-mammut-lines.ipynb
new file mode 100644
index 0000000..b0690f7
--- /dev/null
+++ b/notebooks/04-mammut-lines.ipynb
@@ -0,0 +1,251 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "7c02ae76-b540-4b16-9492-e9210b3b9249",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "os.environ['CUDA_VISIBLE_DEVICES'] = ''\n",
+    "import random\n",
+    "\n",
+    "%matplotlib inline\n",
+    "import matplotlib.pyplot as plt\n",
+    "\n",
+    "import numpy as np\n",
+    "from omegaconf import OmegaConf\n",
+    "import torch\n",
+    "%load_ext autoreload\n",
+    "%autoreload 2\n",
+    "\n",
+    "from importlib.util import find_spec\n",
+    "if find_spec(\"text_recognizer\") is None:\n",
+    "    import sys\n",
+    "    sys.path.append('..')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "ccdb6dde-47e5-429a-88f2-0764fb7e259a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from hydra import compose, initialize\n",
+    "from omegaconf import OmegaConf\n",
+    "from hydra.utils import instantiate"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "3cf50475-39f2-4642-a7d1-5bcbc0a036f7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "path = \"../training/conf/network/mammut_lines.yaml\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 45,
+   "id": "e52ecb01-c975-4e55-925d-1182c7aea473",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "with open(path, \"rb\") as f:\n",
+    "    cfg = OmegaConf.load(f)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 46,
+   "id": "f939aa37-7b1d-45cc-885c-323c4540bda1",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'_target_': 'text_recognizer.network.mammut.MaMMUT', 'encoder': {'_target_': 'text_recognizer.network.vit.Vit', 'image_height': 56, 'image_width': 1024, 'patch_height': 56, 'patch_width': 8, 'dim': 512, 'encoder': {'_target_': 'text_recognizer.network.transformer.encoder.Encoder', 'dim': 512, 'heads': 12, 'dim_head': 64, 'ff_mult': 4, 'depth': 4, 'dropout_rate': 0.1}, 'channels': 1}, 'image_attn_pool': {'_target_': 'text_recognizer.network.transformer.attention.Attention', 'dim': 512, 'heads': 8, 'causal': False, 'dim_head': 64, 'ff_mult': 4, 'dropout_rate': 0.0, 'use_flash': True, 'norm_context': True, 'rotary_emb': None}, 'decoder': {'_target_': 'text_recognizer.network.transformer.decoder.Decoder', 'dim': 512, 'ff_mult': 4, 'heads': 12, 'dim_head': 64, 'depth': 6, 'dropout_rate': 0.1}, 'dim': 512, 'dim_latent': 512, 'num_tokens': 58, 'pad_index': 3, 'num_image_queries': 256}"
+      ]
+     },
+     "execution_count": 46,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "cfg"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 47,
+   "id": "aaeab329-aeb0-4a1b-aa35-5a2aab81b1d0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "net = instantiate(cfg)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 39,
+   "id": "618b997c-e6a6-4487-b70c-9d260cb556d3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from torchinfo import summary"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 50,
+   "id": "7daf1f49",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "====================================================================================================\n",
+       "Layer (type:depth-idx)                             Output Shape              Param #\n",
+       "====================================================================================================\n",
+       "MaMMUT                                             [1, 89, 58]               627,712\n",
+       "├─Vit: 1-1                                         [1, 128, 512]             --\n",
+       "│    └─Sequential: 2-1                             [1, 128, 512]             --\n",
+       "│    │    └─Rearrange: 3-1                         [1, 128, 448]             --\n",
+       "│    │    └─LayerNorm: 3-2                         [1, 128, 448]             896\n",
+       "│    │    └─Linear: 3-3                            [1, 128, 512]             229,888\n",
+       "│    │    └─LayerNorm: 3-4                         [1, 128, 512]             1,024\n",
+       "│    └─Encoder: 2-2                                [1, 128, 512]             --\n",
+       "│    │    └─ModuleList: 3-5                        --                        --\n",
+       "│    │    │    └─Attention: 4-1                    [1, 128, 512]             4,724,224\n",
+       "│    │    │    └─Attention: 4-2                    [1, 128, 512]             4,724,224\n",
+       "│    │    │    └─Attention: 4-3                    [1, 128, 512]             4,724,224\n",
+       "│    │    │    └─Attention: 4-4                    [1, 128, 512]             4,724,224\n",
+       "│    │    └─LayerNorm: 3-6                         [1, 128, 512]             1,024\n",
+       "├─Attention: 1-2                                   [1, 257, 512]             --\n",
+       "│    └─LayerNorm: 2-3                              [1, 257, 512]             1,024\n",
+       "│    └─Linear: 2-4                                 [1, 257, 512]             262,144\n",
+       "│    └─LayerNorm: 2-5                              [1, 128, 512]             1,024\n",
+       "│    └─Linear: 2-6                                 [1, 128, 1024]            524,288\n",
+       "│    └─Attend: 2-7                                 [1, 8, 257, 64]           --\n",
+       "│    └─Linear: 2-8                                 [1, 257, 512]             262,144\n",
+       "│    └─Sequential: 2-9                             [1, 257, 512]             --\n",
+       "│    │    └─Linear: 3-7                            [1, 257, 4096]            2,101,248\n",
+       "│    │    └─SwiGLU: 3-8                            [1, 257, 2048]            --\n",
+       "│    │    └─Linear: 3-9                            [1, 257, 512]             1,049,088\n",
+       "├─LayerNorm: 1-3                                   [1, 257, 512]             1,024\n",
+       "├─Embedding: 1-4                                   [1, 89, 512]              29,696\n",
+       "├─Decoder: 1-5                                     [1, 89, 512]              --\n",
+       "│    └─ModuleList: 2-10                            --                        --\n",
+       "│    │    └─ModuleList: 3-10                       --                        --\n",
+       "│    │    │    └─Attention: 4-5                    [1, 89, 512]              4,724,224\n",
+       "│    │    │    └─Attention: 4-6                    [1, 89, 512]              4,724,224\n",
+       "│    │    └─ModuleList: 3-11                       --                        --\n",
+       "│    │    │    └─Attention: 4-7                    [1, 89, 512]              4,724,224\n",
+       "│    │    │    └─Attention: 4-8                    [1, 89, 512]              4,724,224\n",
+       "│    │    └─ModuleList: 3-12                       --                        --\n",
+       "│    │    │    └─Attention: 4-9                    [1, 89, 512]              4,724,224\n",
+       "│    │    │    └─Attention: 4-10                   [1, 89, 512]              4,724,224\n",
+       "│    │    └─ModuleList: 3-13                       --                        --\n",
+       "│    │    │    └─Attention: 4-11                   [1, 89, 512]              4,724,224\n",
+       "│    │    │    └─Attention: 4-12                   [1, 89, 512]              4,724,224\n",
+       "│    │    └─ModuleList: 3-14                       --                        --\n",
+       "│    │    │    └─Attention: 4-13                   [1, 89, 512]              4,724,224\n",
+       "│    │    │    └─Attention: 4-14                   [1, 89, 512]              4,724,224\n",
+       "│    │    └─ModuleList: 3-15                       --                        --\n",
+       "│    │    │    └─Attention: 4-15                   [1, 89, 512]              4,724,224\n",
+       "│    │    │    └─Attention: 4-16                   [1, 89, 512]              4,724,224\n",
+       "│    └─LayerNorm: 2-11                             [1, 89, 512]              1,024\n",
+       "├─Sequential: 1-6                                  [1, 89, 58]               --\n",
+       "│    └─LayerNorm: 2-12                             [1, 89, 512]              1,024\n",
+       "│    └─Linear: 2-13                                [1, 89, 58]               29,696\n",
+       "====================================================================================================\n",
+       "Total params: 80,711,552\n",
+       "Trainable params: 80,711,552\n",
+       "Non-trainable params: 0\n",
+       "Total mult-adds (M): 80.08\n",
+       "====================================================================================================\n",
+       "Input size (MB): 0.23\n",
+       "Forward/backward pass size (MB): 131.05\n",
+       "Params size (MB): 320.34\n",
+       "Estimated Total Size (MB): 451.61\n",
+       "===================================================================================================="
+      ]
+     },
+     "execution_count": 50,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "summary(net, ((1, 1, 56, 1024), (1, 89)), device=\"cpu\", depth=4)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 48,
+   "id": "166bf656-aba6-4654-a530-dfce12666297",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "t = net(torch.randn(1, 1, 56, 1024), torch.randint(1, 4, (1, 4)))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 49,
+   "id": "43d9af25-9872-497d-8796-4835a65262ed",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "torch.Size([1, 4, 58])"
+      ]
+     },
+     "execution_count": 49,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "t.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "63ac7f1b-0eb1-4625-96b8-467846eb7ae6",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/notebooks/04-vit-lines.ipynb b/notebooks/04-vit-lines.ipynb
deleted file mode 100644
index b87f38c..0000000
--- a/notebooks/04-vit-lines.ipynb
+++ /dev/null
@@ -1,305 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "id": "7c02ae76-b540-4b16-9492-e9210b3b9249",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import os\n",
-    "os.environ['CUDA_VISIBLE_DEVICE'] = ''\n",
-    "import random\n",
-    "\n",
-    "%matplotlib inline\n",
-    "import matplotlib.pyplot as plt\n",
-    "\n",
-    "import numpy as np\n",
-    "from omegaconf import OmegaConf\n",
-    "import torch\n",
-    "%load_ext autoreload\n",
-    "%autoreload 2\n",
-    "\n",
-    "from importlib.util import find_spec\n",
-    "if find_spec(\"text_recognizer\") is None:\n",
-    "    import sys\n",
-    "    sys.path.append('..')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "id": "ccdb6dde-47e5-429a-88f2-0764fb7e259a",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from hydra import compose, initialize\n",
-    "from omegaconf import OmegaConf\n",
-    "from hydra.utils import instantiate"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "id": "3cf50475-39f2-4642-a7d1-5bcbc0a036f7",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "path = \"../training/conf/network/vit_lines.yaml\""
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 38,
-   "id": "e52ecb01-c975-4e55-925d-1182c7aea473",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "with open(path, \"rb\") as f:\n",
-    "    cfg = OmegaConf.load(f)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 39,
-   "id": "f939aa37-7b1d-45cc-885c-323c4540bda1",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "{'_target_': 'text_recognizer.network.vit.VisionTransformer', 'image_height': 56, 'image_width': 1024, 'patch_height': 28, 'patch_width': 32, 'dim': 256, 'num_classes': 57, 'encoder': {'_target_': 'text_recognizer.network.transformer.encoder.Encoder', 'dim': 256, 'inner_dim': 1024, 'heads': 8, 'dim_head': 64, 'depth': 6, 'dropout_rate': 0.0}, 'decoder': {'_target_': 'text_recognizer.network.transformer.decoder.Decoder', 'dim': 256, 'inner_dim': 1024, 'heads': 8, 'dim_head': 64, 'depth': 6, 'dropout_rate': 0.0}, 'token_embedding': {'_target_': 'text_recognizer.network.transformer.embedding.token.TokenEmbedding', 'num_tokens': 57, 'dim': 256, 'use_l2': True}, 'pos_embedding': {'_target_': 'text_recognizer.network.transformer.embedding.absolute.AbsolutePositionalEmbedding', 'dim': 256, 'max_length': 89, 'use_l2': True}, 'tie_embeddings': True, 'pad_index': 3}"
-      ]
-     },
-     "execution_count": 39,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "cfg"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 40,
-   "id": "aaeab329-aeb0-4a1b-aa35-5a2aab81b1d0",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "net = instantiate(cfg)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 41,
-   "id": "618b997c-e6a6-4487-b70c-9d260cb556d3",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from torchinfo import summary"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 43,
-   "id": "7daf1f49",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "====================================================================================================\n",
-       "Layer (type:depth-idx)                             Output Shape              Param #\n",
-       "====================================================================================================\n",
-       "VisionTransformer                                  [1, 57, 89]               --\n",
-       "├─Sequential: 1-1                                  [1, 64, 256]              --\n",
-       "│    └─Rearrange: 2-1                              [1, 64, 896]              --\n",
-       "│    └─LayerNorm: 2-2                              [1, 64, 896]              1,792\n",
-       "│    └─Linear: 2-3                                 [1, 64, 256]              229,632\n",
-       "│    └─LayerNorm: 2-4                              [1, 64, 256]              512\n",
-       "├─Encoder: 1-2                                     [1, 64, 256]              --\n",
-       "│    └─ModuleList: 2-5                             --                        --\n",
-       "│    │    └─ModuleList: 3-1                        --                        --\n",
-       "│    │    │    └─Attention: 4-1                    [1, 64, 256]              525,824\n",
-       "│    │    │    └─FeedForward: 4-2                  [1, 64, 256]              526,080\n",
-       "│    │    └─ModuleList: 3-2                        --                        --\n",
-       "│    │    │    └─Attention: 4-3                    [1, 64, 256]              525,824\n",
-       "│    │    │    └─FeedForward: 4-4                  [1, 64, 256]              526,080\n",
-       "│    │    └─ModuleList: 3-3                        --                        --\n",
-       "│    │    │    └─Attention: 4-5                    [1, 64, 256]              525,824\n",
-       "│    │    │    └─FeedForward: 4-6                  [1, 64, 256]              526,080\n",
-       "│    │    └─ModuleList: 3-4                        --                        --\n",
-       "│    │    │    └─Attention: 4-7                    [1, 64, 256]              525,824\n",
-       "│    │    │    └─FeedForward: 4-8                  [1, 64, 256]              526,080\n",
-       "│    │    └─ModuleList: 3-5                        --                        --\n",
-       "│    │    │    └─Attention: 4-9                    [1, 64, 256]              525,824\n",
-       "│    │    │    └─FeedForward: 4-10                 [1, 64, 256]              526,080\n",
-       "│    │    └─ModuleList: 3-6                        --                        --\n",
-       "│    │    │    └─Attention: 4-11                   [1, 64, 256]              525,824\n",
-       "│    │    │    └─FeedForward: 4-12                 [1, 64, 256]              526,080\n",
-       "│    └─LayerNorm: 2-6                              [1, 64, 256]              512\n",
-       "├─TokenEmbedding: 1-3                              [1, 89, 256]              --\n",
-       "│    └─Embedding: 2-7                              [1, 89, 256]              14,592\n",
-       "├─AbsolutePositionalEmbedding: 1-4                 [89, 256]                 --\n",
-       "│    └─Embedding: 2-8                              [89, 256]                 22,784\n",
-       "├─Decoder: 1-5                                     [1, 89, 256]              --\n",
-       "│    └─ModuleList: 2-9                             --                        --\n",
-       "│    │    └─ModuleList: 3-7                        --                        --\n",
-       "│    │    │    └─Attention: 4-13                   [1, 89, 256]              525,824\n",
-       "│    │    │    └─FeedForward: 4-14                 [1, 89, 256]              526,080\n",
-       "│    │    │    └─Attention: 4-15                   [1, 89, 256]              525,824\n",
-       "│    │    └─ModuleList: 3-8                        --                        --\n",
-       "│    │    │    └─Attention: 4-16                   [1, 89, 256]              525,824\n",
-       "│    │    │    └─FeedForward: 4-17                 [1, 89, 256]              526,080\n",
-       "│    │    │    └─Attention: 4-18                   [1, 89, 256]              525,824\n",
-       "│    │    └─ModuleList: 3-9                        --                        --\n",
-       "│    │    │    └─Attention: 4-19                   [1, 89, 256]              525,824\n",
-       "│    │    │    └─FeedForward: 4-20                 [1, 89, 256]              526,080\n",
-       "│    │    │    └─Attention: 4-21                   [1, 89, 256]              525,824\n",
-       "│    │    └─ModuleList: 3-10                       --                        --\n",
-       "│    │    │    └─Attention: 4-22                   [1, 89, 256]              525,824\n",
-       "│    │    │    └─FeedForward: 4-23                 [1, 89, 256]              526,080\n",
-       "│    │    │    └─Attention: 4-24                   [1, 89, 256]              525,824\n",
-       "│    │    └─ModuleList: 3-11                       --                        --\n",
-       "│    │    │    └─Attention: 4-25                   [1, 89, 256]              525,824\n",
-       "│    │    │    └─FeedForward: 4-26                 [1, 89, 256]              526,080\n",
-       "│    │    │    └─Attention: 4-27                   [1, 89, 256]              525,824\n",
-       "│    │    └─ModuleList: 3-12                       --                        --\n",
-       "│    │    │    └─Attention: 4-28                   [1, 89, 256]              525,824\n",
-       "│    │    │    └─FeedForward: 4-29                 [1, 89, 256]              526,080\n",
-       "│    │    │    └─Attention: 4-30                   [1, 89, 256]              525,824\n",
-       "│    └─LayerNorm: 2-10                             [1, 89, 256]              512\n",
-       "====================================================================================================\n",
-       "Total params: 16,048,128\n",
-       "Trainable params: 16,048,128\n",
-       "Non-trainable params: 0\n",
-       "Total mult-adds (M): 18.03\n",
-       "====================================================================================================\n",
-       "Input size (MB): 0.23\n",
-       "Forward/backward pass size (MB): 46.52\n",
-       "Params size (MB): 64.16\n",
-       "Estimated Total Size (MB): 110.91\n",
-       "===================================================================================================="
-      ]
-     },
-     "execution_count": 43,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "summary(net, ((1, 1, 56, 1024), (1, 89)), device=\"cpu\", depth=4)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 20,
-   "id": "1b1a8ac0-bd05-4076-90c2-2de6b740490d",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "import torch"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 24,
-   "id": "248a0cb1",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "t = net(torch.randn(1, 1, 56, 1024), torch.randint(1, 4, (1, 4)))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 25,
-   "id": "c251a954-00ac-4680-87e4-f27b6ce06023",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "torch.Size([1, 58, 4])"
-      ]
-     },
-     "execution_count": 25,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "t.shape"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 17,
-   "id": "02d82c5e-4e67-4f87-a539-393e4cf59b6e",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "loss = torch.nn.CrossEntropyLoss()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 19,
-   "id": "dc836993-a5d8-43b2-b41c-158a17990075",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "tensor(4.0604, grad_fn=<NllLoss2DBackward0>)"
-      ]
-     },
-     "execution_count": 19,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "loss(t.permute(0, 2, 1), torch.randint(0, 58, (1, 89)))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "166bf656-aba6-4654-a530-dfce12666297",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.9.4"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
diff --git a/notebooks/Untitled1.ipynb b/notebooks/Untitled1.ipynb
index 92b35c9..d996e98 100644
--- a/notebooks/Untitled1.ipynb
+++ b/notebooks/Untitled1.ipynb
@@ -2,12 +2,24 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 73,
+   "execution_count": 14,
    "id": "a15a452c-bbbc-4227-90fb-ad573f82c43f",
    "metadata": {
     "tags": []
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "ename": "ModuleNotFoundError",
+     "evalue": "No module named 'text_recognizer.network.transformer.embeddings'",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mModuleNotFoundError\u001b[0m                       Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[14], line 15\u001b[0m\n\u001b[1;32m     12\u001b[0m     sys\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mappend(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m..\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[1;32m     14\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtext_recognizer\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdata\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01miam_lines\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m IAMLines\n\u001b[0;32m---> 15\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtext_recognizer\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mnetwork\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mtransformer\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01membeddings\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01msincos\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m sincos_2d\n",
+      "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'text_recognizer.network.transformer.embeddings'"
+     ]
+    }
+   ],
    "source": [
     "import matplotlib.pyplot as plt\n",
     "import numpy as np\n",
@@ -430,6 +442,101 @@
     "plt.figure(figsize=(40, 20))\n",
     "plt.imshow(xxx, cmap='gray')"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "id": "4535cb1c-fc97-4393-8348-649d06030931",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "conv = torch.nn.Conv2d(1, 1, kernel_size=3, stride=2)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "id": "d3092482-a2d5-4b52-9967-8d18cab35f42",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "torch.Size([1, 1, 13, 255])"
+      ]
+     },
+     "execution_count": 19,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "x = torch.randn(1, 1, 56, 1024)\n",
+    "conv(conv(x)).shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "id": "44183fe9-f798-4755-8db5-491b8a3dbf2f",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "tensor([[True, True, True],\n",
+      "        [True, True, True]])\n"
+     ]
+    }
+   ],
+   "source": [
+    "import torch\n",
+    "import torch.nn.functional as F\n",
+    "source = torch.randn((2,3)).bool()\n",
+    "print(source)\n",
+    "# pad only the last dimension: pad=(0, 1) adds nothing on the left and one\n",
+    "# column of False on the right, so the (2, 3) mask becomes (2, 4)\n",
+    "result = F.pad(source, pad=(0, 1), value=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "id": "34220755-4224-458e-94b8-ea1fa0488a65",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "tensor([[ True,  True,  True, False],\n",
+       "        [ True,  True,  True, False]])"
+      ]
+     },
+     "execution_count": 19,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "result"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "dfc269b8-8242-41ee-9190-73df0af47e83",
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {