{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "7c02ae76-b540-4b16-9492-e9210b3b9249",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "os.environ['CUDA_VISIBLE_DEVICE'] = ''\n",
    "import random\n",
    "\n",
    "%matplotlib inline\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "import numpy as np\n",
    "from omegaconf import OmegaConf\n",
    "import torch\n",
    "%load_ext autoreload\n",
    "%autoreload 2\n",
    "\n",
    "from importlib.util import find_spec\n",
    "if find_spec(\"text_recognizer\") is None:\n",
    "    import sys\n",
    "    sys.path.append('..')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "ccdb6dde-47e5-429a-88f2-0764fb7e259a",
   "metadata": {},
   "outputs": [],
   "source": [
    "from hydra import compose, initialize\n",
    "from omegaconf import OmegaConf\n",
    "from hydra.utils import instantiate"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "3cf50475-39f2-4642-a7d1-5bcbc0a036f7",
   "metadata": {},
   "outputs": [],
   "source": [
    "path = \"../training/conf/network/vit_lines.yaml\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "id": "e52ecb01-c975-4e55-925d-1182c7aea473",
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(path, \"rb\") as f:\n",
    "    cfg = OmegaConf.load(f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "id": "f939aa37-7b1d-45cc-885c-323c4540bda1",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'_target_': 'text_recognizer.network.vit.VisionTransformer', 'image_height': 56, 'image_width': 1024, 'patch_height': 28, 'patch_width': 32, 'dim': 256, 'num_classes': 57, 'encoder': {'_target_': 'text_recognizer.network.transformer.encoder.Encoder', 'dim': 256, 'inner_dim': 1024, 'heads': 8, 'dim_head': 64, 'depth': 6, 'dropout_rate': 0.0}, 'decoder': {'_target_': 'text_recognizer.network.transformer.decoder.Decoder', 'dim': 256, 'inner_dim': 1024, 'heads': 8, 'dim_head': 64, 'depth': 6, 'dropout_rate': 0.0}, 'token_embedding': {'_target_': 'text_recognizer.network.transformer.embedding.token.TokenEmbedding', 'num_tokens': 57, 'dim': 256, 'use_l2': True}, 'pos_embedding': {'_target_': 'text_recognizer.network.transformer.embedding.absolute.AbsolutePositionalEmbedding', 'dim': 256, 'max_length': 89, 'use_l2': True}, 'tie_embeddings': True, 'pad_index': 3}"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cfg"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "id": "aaeab329-aeb0-4a1b-aa35-5a2aab81b1d0",
   "metadata": {},
   "outputs": [],
   "source": [
    "net = instantiate(cfg)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "id": "618b997c-e6a6-4487-b70c-9d260cb556d3",
   "metadata": {},
   "outputs": [],
   "source": [
    "from torchinfo import summary"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "id": "7daf1f49",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "====================================================================================================\n",
       "Layer (type:depth-idx)                             Output Shape              Param #\n",
       "====================================================================================================\n",
       "VisionTransformer                                  [1, 57, 89]               --\n",
       "├─Sequential: 1-1                                  [1, 64, 256]              --\n",
       "│    └─Rearrange: 2-1                              [1, 64, 896]              --\n",
       "│    └─LayerNorm: 2-2                              [1, 64, 896]              1,792\n",
       "│    └─Linear: 2-3                                 [1, 64, 256]              229,632\n",
       "│    └─LayerNorm: 2-4                              [1, 64, 256]              512\n",
       "├─Encoder: 1-2                                     [1, 64, 256]              --\n",
       "│    └─ModuleList: 2-5                             --                        --\n",
       "│    │    └─ModuleList: 3-1                        --                        --\n",
       "│    │    │    └─Attention: 4-1                    [1, 64, 256]              525,824\n",
       "│    │    │    └─FeedForward: 4-2                  [1, 64, 256]              526,080\n",
       "│    │    └─ModuleList: 3-2                        --                        --\n",
       "│    │    │    └─Attention: 4-3                    [1, 64, 256]              525,824\n",
       "│    │    │    └─FeedForward: 4-4                  [1, 64, 256]              526,080\n",
       "│    │    └─ModuleList: 3-3                        --                        --\n",
       "│    │    │    └─Attention: 4-5                    [1, 64, 256]              525,824\n",
       "│    │    │    └─FeedForward: 4-6                  [1, 64, 256]              526,080\n",
       "│    │    └─ModuleList: 3-4                        --                        --\n",
       "│    │    │    └─Attention: 4-7                    [1, 64, 256]              525,824\n",
       "│    │    │    └─FeedForward: 4-8                  [1, 64, 256]              526,080\n",
       "│    │    └─ModuleList: 3-5                        --                        --\n",
       "│    │    │    └─Attention: 4-9                    [1, 64, 256]              525,824\n",
       "│    │    │    └─FeedForward: 4-10                 [1, 64, 256]              526,080\n",
       "│    │    └─ModuleList: 3-6                        --                        --\n",
       "│    │    │    └─Attention: 4-11                   [1, 64, 256]              525,824\n",
       "│    │    │    └─FeedForward: 4-12                 [1, 64, 256]              526,080\n",
       "│    └─LayerNorm: 2-6                              [1, 64, 256]              512\n",
       "├─TokenEmbedding: 1-3                              [1, 89, 256]              --\n",
       "│    └─Embedding: 2-7                              [1, 89, 256]              14,592\n",
       "├─AbsolutePositionalEmbedding: 1-4                 [89, 256]                 --\n",
       "│    └─Embedding: 2-8                              [89, 256]                 22,784\n",
       "├─Decoder: 1-5                                     [1, 89, 256]              --\n",
       "│    └─ModuleList: 2-9                             --                        --\n",
       "│    │    └─ModuleList: 3-7                        --                        --\n",
       "│    │    │    └─Attention: 4-13                   [1, 89, 256]              525,824\n",
       "│    │    │    └─FeedForward: 4-14                 [1, 89, 256]              526,080\n",
       "│    │    │    └─Attention: 4-15                   [1, 89, 256]              525,824\n",
       "│    │    └─ModuleList: 3-8                        --                        --\n",
       "│    │    │    └─Attention: 4-16                   [1, 89, 256]              525,824\n",
       "│    │    │    └─FeedForward: 4-17                 [1, 89, 256]              526,080\n",
       "│    │    │    └─Attention: 4-18                   [1, 89, 256]              525,824\n",
       "│    │    └─ModuleList: 3-9                        --                        --\n",
       "│    │    │    └─Attention: 4-19                   [1, 89, 256]              525,824\n",
       "│    │    │    └─FeedForward: 4-20                 [1, 89, 256]              526,080\n",
       "│    │    │    └─Attention: 4-21                   [1, 89, 256]              525,824\n",
       "│    │    └─ModuleList: 3-10                       --                        --\n",
       "│    │    │    └─Attention: 4-22                   [1, 89, 256]              525,824\n",
       "│    │    │    └─FeedForward: 4-23                 [1, 89, 256]              526,080\n",
       "│    │    │    └─Attention: 4-24                   [1, 89, 256]              525,824\n",
       "│    │    └─ModuleList: 3-11                       --                        --\n",
       "│    │    │    └─Attention: 4-25                   [1, 89, 256]              525,824\n",
       "│    │    │    └─FeedForward: 4-26                 [1, 89, 256]              526,080\n",
       "│    │    │    └─Attention: 4-27                   [1, 89, 256]              525,824\n",
       "│    │    └─ModuleList: 3-12                       --                        --\n",
       "│    │    │    └─Attention: 4-28                   [1, 89, 256]              525,824\n",
       "│    │    │    └─FeedForward: 4-29                 [1, 89, 256]              526,080\n",
       "│    │    │    └─Attention: 4-30                   [1, 89, 256]              525,824\n",
       "│    └─LayerNorm: 2-10                             [1, 89, 256]              512\n",
       "====================================================================================================\n",
       "Total params: 16,048,128\n",
       "Trainable params: 16,048,128\n",
       "Non-trainable params: 0\n",
       "Total mult-adds (M): 18.03\n",
       "====================================================================================================\n",
       "Input size (MB): 0.23\n",
       "Forward/backward pass size (MB): 46.52\n",
       "Params size (MB): 64.16\n",
       "Estimated Total Size (MB): 110.91\n",
       "===================================================================================================="
      ]
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "summary(net, ((1, 1, 56, 1024), (1, 89)), device=\"cpu\", depth=4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "1b1a8ac0-bd05-4076-90c2-2de6b740490d",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "import torch"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "248a0cb1",
   "metadata": {},
   "outputs": [],
   "source": [
    "t = net(torch.randn(1, 1, 56, 1024), torch.randint(1, 4, (1, 4)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "c251a954-00ac-4680-87e4-f27b6ce06023",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "torch.Size([1, 58, 4])"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "t.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "02d82c5e-4e67-4f87-a539-393e4cf59b6e",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "loss = torch.nn.CrossEntropyLoss()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "dc836993-a5d8-43b2-b41c-158a17990075",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "tensor(4.0604, grad_fn=<NllLoss2DBackward0>)"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "loss(t.permute(0, 2, 1), torch.randint(0, 58, (1, 89)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "166bf656-aba6-4654-a530-dfce12666297",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}