diff options
-rw-r--r-- | .pre-commit-config.yaml | 6 | ||||
-rw-r--r-- | notebooks/04-convnext.ipynb | 156 | ||||
-rw-r--r-- | notebooks/04-mammut-lines.ipynb | 251 | ||||
-rw-r--r-- | notebooks/04-vit-lines.ipynb | 305 | ||||
-rw-r--r-- | notebooks/Untitled1.ipynb | 111 |
5 files changed, 408 insertions, 421 deletions
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 40f4423..1167048 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -7,16 +7,12 @@ repos: - id: trailing-whitespace - repo: local hooks: - - id: black - name: black - entry: black - language: system - types: [python] - id: ruff name: ruff entry: ruff language: system types: [python] + args: [--fix, --exit-non-zero-on-fix] - repo: https://github.com/compilerla/conventional-pre-commit rev: v1.2.0 hooks: diff --git a/notebooks/04-convnext.ipynb b/notebooks/04-convnext.ipynb index 5ab71c8..5affe85 100644 --- a/notebooks/04-convnext.ipynb +++ b/notebooks/04-convnext.ipynb @@ -2,19 +2,12 @@ "cells": [ { "cell_type": "code", - "execution_count": 14, + "execution_count": 1, "id": "7c02ae76-b540-4b16-9492-e9210b3b9249", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The autoreload extension is already loaded. To reload it, use:\n", - " %reload_ext autoreload\n" - ] - } - ], + "metadata": { + "tags": [] + }, + "outputs": [], "source": [ "import os\n", "os.environ['CUDA_VISIBLE_DEVICE'] = ''\n", @@ -37,9 +30,11 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 2, "id": "ccdb6dde-47e5-429a-88f2-0764fb7e259a", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "from hydra import compose, initialize\n", @@ -49,9 +44,11 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 3, "id": "3cf50475-39f2-4642-a7d1-5bcbc0a036f7", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "path = \"../training/conf/network/convnext.yaml\"" @@ -59,9 +56,11 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 64, "id": "e52ecb01-c975-4e55-925d-1182c7aea473", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "with open(path, \"rb\") as f:\n", @@ -70,17 +69,19 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 65, "id": "f939aa37-7b1d-45cc-885c-323c4540bda1", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [ { "data": { "text/plain": [ - "{'_target_': 'text_recognizer.network.convnext.ConvNext', 'dim': 16, 'dim_mults': [2, 4, 8], 'depths': [3, 3, 6], 'downsampling_factors': [[2, 2], [2, 2], [2, 2]], 'attn': {'_target_': 'text_recognizer.network.convnext.TransformerBlock', 'attn': {'_target_': 'text_recognizer.network.convnext.Attention', 'dim': 128, 'heads': 4, 'dim_head': 64, 'scale': 8}, 'ff': {'_target_': 'text_recognizer.network.convnext.FeedForward', 'dim': 128, 'mult': 4}}}" + "{'_target_': 'text_recognizer.network.convnext.convnext.ConvNext', 'dim': 8, 'dim_mults': [2, 8], 'depths': [2, 2], 'attn': {'_target_': 'text_recognizer.network.convnext.transformer.Transformer', 'attn': {'_target_': 'text_recognizer.network.convnext.transformer.Attention', 'dim': 64, 'heads': 4, 'dim_head': 64, 'scale': 8}, 'ff': {'_target_': 'text_recognizer.network.convnext.transformer.FeedForward', 'dim': 64, 'mult': 4}}}" ] }, - "execution_count": 38, + "execution_count": 65, "metadata": {}, "output_type": "execute_result" } @@ -91,21 +92,11 @@ }, { "cell_type": "code", - "execution_count": 19, - "id": "a2b420c1", - "metadata": {}, - "outputs": [], - "source": [ - "cfg.dim_mults = [2, 4, 8, 8]\n", - "cfg.depths = [3, 3, 6, 6]\n", - "cfg.downsampling_factors = [[2, 2], [2, 2], [2, 2], [2, 1]]" - ] - }, - { - "cell_type": "code", - "execution_count": 39, + "execution_count": 66, "id": "c9589350", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "net = instantiate(cfg)" @@ -113,9 +104,11 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 67, "id": "618b997c-e6a6-4487-b70c-9d260cb556d3", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "from torchinfo import summary" @@ -123,11 +116,9 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 68, "id": "25759b7b-8deb-4163-b75d-a1357c9fe88f", - "metadata": { - "scrolled": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -135,84 +126,31 @@ "====================================================================================================\n", "Layer (type:depth-idx) Output Shape Param #\n", "====================================================================================================\n", - "ConvNext [2, 128, 72, 80] 165,408\n", - "├─TransformerBlock: 1-5 [2, 128, 72, 80] (recursive)\n", - "│ └─Attention: 2-6 [2, 128, 72, 80] (recursive)\n", - "│ │ └─LayerNorm: 3-13 [2, 128, 72, 80] (recursive)\n", - "├─Conv2d: 1-3 [2, 16, 576, 640] (recursive)\n", - "├─TransformerBlock: 1 -- --\n", - "│ └─Attention: 2 -- --\n", - "│ │ └─Conv2d: 3-15 [2, 128, 72, 80] (recursive)\n", - "│ └─FeedForward: 2-7 [2, 128, 72, 80] (recursive)\n", - "│ │ └─Residual: 3-16 [2, 128, 72, 80] (recursive)\n", - "│ │ │ └─Sequential: 4-26 [2, 128, 72, 80] (recursive)\n", - "├─Conv2d: 1-3 [2, 16, 576, 640] (recursive)\n", - "├─ModuleList: 1-4 -- --\n", - "│ └─ModuleList: 2-3 -- --\n", - "│ │ └─ConvNextBlock: 3-4 [2, 16, 576, 640] --\n", - "│ │ │ └─Conv2d: 4-2 [2, 16, 576, 640] 800\n", - "│ │ │ └─Sequential: 4-3 [2, 16, 576, 640] 9,280\n", - "│ │ │ └─Identity: 4-4 [2, 16, 576, 640] --\n", - "│ │ └─ModuleList: 3-5 -- --\n", - "│ │ │ └─ConvNextBlock: 4-5 [2, 16, 576, 640] 10,080\n", - "│ │ │ └─ConvNextBlock: 4-6 [2, 16, 576, 640] 10,080\n", - "│ │ │ └─ConvNextBlock: 4-7 [2, 16, 576, 640] 10,080\n", - "│ │ └─Downsample: 3-6 [2, 32, 288, 320] --\n", - "│ │ │ └─Sequential: 4-8 [2, 32, 288, 320] 2,080\n", - "│ └─ModuleList: 2-4 -- --\n", - "│ │ └─ConvNextBlock: 3-7 [2, 32, 288, 320] --\n", - "│ │ │ └─Conv2d: 4-9 [2, 32, 288, 320] 1,600\n", - "│ │ │ └─Sequential: 4-10 [2, 32, 288, 320] 36,992\n", - "│ │ │ └─Identity: 4-11 [2, 32, 288, 320] --\n", - "│ │ └─ModuleList: 3-8 -- --\n", - "│ │ │ └─ConvNextBlock: 4-12 [2, 32, 288, 320] 38,592\n", - "│ │ │ └─ConvNextBlock: 4-13 [2, 32, 288, 320] 38,592\n", - "│ │ │ └─ConvNextBlock: 4-14 [2, 32, 288, 320] 38,592\n", - "│ │ └─Downsample: 3-9 [2, 64, 144, 160] --\n", - "│ │ │ └─Sequential: 4-15 [2, 64, 144, 160] 8,256\n", - "│ └─ModuleList: 2-5 -- --\n", - "│ │ └─ConvNextBlock: 3-10 [2, 64, 144, 160] --\n", - "│ │ │ └─Conv2d: 4-16 [2, 64, 144, 160] 3,200\n", - "│ │ │ └─Sequential: 4-17 [2, 64, 144, 160] 147,712\n", - "│ │ │ └─Identity: 4-18 [2, 64, 144, 160] --\n", - "│ │ └─ModuleList: 3-11 -- --\n", - "│ │ │ └─ConvNextBlock: 4-19 [2, 64, 144, 160] 150,912\n", - "│ │ │ └─ConvNextBlock: 4-20 [2, 64, 144, 160] 150,912\n", - "│ │ │ └─ConvNextBlock: 4-21 [2, 64, 144, 160] 150,912\n", - "│ │ │ └─ConvNextBlock: 4-22 [2, 64, 144, 160] 150,912\n", - "│ │ │ └─ConvNextBlock: 4-23 [2, 64, 144, 160] 150,912\n", - "│ │ │ └─ConvNextBlock: 4-24 [2, 64, 144, 160] 150,912\n", - "│ │ └─Downsample: 3-12 [2, 128, 72, 80] --\n", - "│ │ │ └─Sequential: 4-25 [2, 128, 72, 80] 32,896\n", - "├─TransformerBlock: 1-5 [2, 128, 72, 80] (recursive)\n", - "│ └─Attention: 2-6 [2, 128, 72, 80] (recursive)\n", - "│ │ └─LayerNorm: 3-13 [2, 128, 72, 80] (recursive)\n", - "│ │ └─Conv2d: 3-14 [2, 768, 72, 80] 98,304\n", - "│ │ └─Conv2d: 3-15 [2, 128, 72, 80] (recursive)\n", - "│ └─FeedForward: 2-7 [2, 128, 72, 80] (recursive)\n", - "│ │ └─Residual: 3-16 [2, 128, 72, 80] (recursive)\n", - "│ │ │ └─Sequential: 4-26 [2, 128, 72, 80] (recursive)\n", - "├─LayerNorm: 1-6 [2, 128, 72, 80] 128\n", + "ConvNext [2, 64, 14, 256] --\n", + "├─Conv2d: 1-1 [2, 8, 56, 1024] 400\n", + "├─ModuleList: 1-2 -- 30,512\n", + "├─Transformer: 1-3 [2, 64, 14, 256] 98,688\n", + "├─LayerNorm: 1-4 [2, 64, 14, 256] 64\n", "====================================================================================================\n", - "Total params: 1,558,144\n", - "Trainable params: 1,558,144\n", + "Total params: 129,664\n", + "Trainable params: 129,664\n", "Non-trainable params: 0\n", - "Total mult-adds (G): 114.00\n", + "Total mult-adds (G): 2.00\n", "====================================================================================================\n", - "Input size (MB): 2.95\n", - "Forward/backward pass size (MB): 3822.06\n", - "Params size (MB): 5.57\n", - "Estimated Total Size (MB): 3830.58\n", + "Input size (MB): 0.46\n", + "Forward/backward pass size (MB): 260.57\n", + "Params size (MB): 0.52\n", + "Estimated Total Size (MB): 261.55\n", "====================================================================================================" ] }, - "execution_count": 41, + "execution_count": 68, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "summary(net, (2, 1, 576, 640), device=\"cpu\", depth=4)" + "summary(net, (2, 1, 56, 1024), device=\"cpu\", depth=1)" ] }, { diff --git a/notebooks/04-mammut-lines.ipynb b/notebooks/04-mammut-lines.ipynb new file mode 100644 index 0000000..b0690f7 --- /dev/null +++ b/notebooks/04-mammut-lines.ipynb @@ -0,0 +1,251 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "7c02ae76-b540-4b16-9492-e9210b3b9249", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "os.environ['CUDA_VISIBLE_DEVICE'] = ''\n", + "import random\n", + "\n", + "%matplotlib inline\n", + "import matplotlib.pyplot as plt\n", + "\n", + "import numpy as np\n", + "from omegaconf import OmegaConf\n", + "import torch\n", + "%load_ext autoreload\n", + "%autoreload 2\n", + "\n", + "from importlib.util import find_spec\n", + "if find_spec(\"text_recognizer\") is None:\n", + " import sys\n", + " sys.path.append('..')" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "ccdb6dde-47e5-429a-88f2-0764fb7e259a", + "metadata": {}, + "outputs": [], + "source": [ + "from hydra import compose, initialize\n", + "from omegaconf import OmegaConf\n", + "from hydra.utils import instantiate" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "3cf50475-39f2-4642-a7d1-5bcbc0a036f7", + "metadata": {}, + "outputs": [], + "source": [ + "path = \"../training/conf/network/mammut_lines.yaml\"" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "e52ecb01-c975-4e55-925d-1182c7aea473", + "metadata": {}, + "outputs": [], + "source": [ + "with open(path, \"rb\") as f:\n", + " cfg = OmegaConf.load(f)" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "id": "f939aa37-7b1d-45cc-885c-323c4540bda1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'_target_': 'text_recognizer.network.mammut.MaMMUT', 'encoder': {'_target_': 'text_recognizer.network.vit.Vit', 'image_height': 56, 'image_width': 1024, 'patch_height': 56, 'patch_width': 8, 'dim': 512, 'encoder': {'_target_': 'text_recognizer.network.transformer.encoder.Encoder', 'dim': 512, 'heads': 12, 'dim_head': 64, 'ff_mult': 4, 'depth': 4, 'dropout_rate': 0.1}, 'channels': 1}, 'image_attn_pool': {'_target_': 'text_recognizer.network.transformer.attention.Attention', 'dim': 512, 'heads': 8, 'causal': False, 'dim_head': 64, 'ff_mult': 4, 'dropout_rate': 0.0, 'use_flash': True, 'norm_context': True, 'rotary_emb': None}, 'decoder': {'_target_': 'text_recognizer.network.transformer.decoder.Decoder', 'dim': 512, 'ff_mult': 4, 'heads': 12, 'dim_head': 64, 'depth': 6, 'dropout_rate': 0.1}, 'dim': 512, 'dim_latent': 512, 'num_tokens': 58, 'pad_index': 3, 'num_image_queries': 256}" + ] + }, + "execution_count": 46, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cfg" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "id": "aaeab329-aeb0-4a1b-aa35-5a2aab81b1d0", + "metadata": {}, + "outputs": [], + "source": [ + "net = instantiate(cfg)" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "618b997c-e6a6-4487-b70c-9d260cb556d3", + "metadata": {}, + "outputs": [], + "source": [ + "from torchinfo import summary" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "id": "7daf1f49", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "====================================================================================================\n", + "Layer (type:depth-idx) Output Shape Param #\n", + "====================================================================================================\n", + "MaMMUT [1, 89, 58] 627,712\n", + "├─Vit: 1-1 [1, 128, 512] --\n", + "│ └─Sequential: 2-1 [1, 128, 512] --\n", + "│ │ └─Rearrange: 3-1 [1, 128, 448] --\n", + "│ │ └─LayerNorm: 3-2 [1, 128, 448] 896\n", + "│ │ └─Linear: 3-3 [1, 128, 512] 229,888\n", + "│ │ └─LayerNorm: 3-4 [1, 128, 512] 1,024\n", + "│ └─Encoder: 2-2 [1, 128, 512] --\n", + "│ │ └─ModuleList: 3-5 -- --\n", + "│ │ │ └─Attention: 4-1 [1, 128, 512] 4,724,224\n", + "│ │ │ └─Attention: 4-2 [1, 128, 512] 4,724,224\n", + "│ │ │ └─Attention: 4-3 [1, 128, 512] 4,724,224\n", + "│ │ │ └─Attention: 4-4 [1, 128, 512] 4,724,224\n", + "│ │ └─LayerNorm: 3-6 [1, 128, 512] 1,024\n", + "├─Attention: 1-2 [1, 257, 512] --\n", + "│ └─LayerNorm: 2-3 [1, 257, 512] 1,024\n", + "│ └─Linear: 2-4 [1, 257, 512] 262,144\n", + "│ └─LayerNorm: 2-5 [1, 128, 512] 1,024\n", + "│ └─Linear: 2-6 [1, 128, 1024] 524,288\n", + "│ └─Attend: 2-7 [1, 8, 257, 64] --\n", + "│ └─Linear: 2-8 [1, 257, 512] 262,144\n", + "│ └─Sequential: 2-9 [1, 257, 512] --\n", + "│ │ └─Linear: 3-7 [1, 257, 4096] 2,101,248\n", + "│ │ └─SwiGLU: 3-8 [1, 257, 2048] --\n", + "│ │ └─Linear: 3-9 [1, 257, 512] 1,049,088\n", + "├─LayerNorm: 1-3 [1, 257, 512] 1,024\n", + "├─Embedding: 1-4 [1, 89, 512] 29,696\n", + "├─Decoder: 1-5 [1, 89, 512] --\n", + "│ └─ModuleList: 2-10 -- --\n", + "│ │ └─ModuleList: 3-10 -- --\n", + "│ │ │ └─Attention: 4-5 [1, 89, 512] 4,724,224\n", + "│ │ │ └─Attention: 4-6 [1, 89, 512] 4,724,224\n", + "│ │ └─ModuleList: 3-11 -- --\n", + "│ │ │ └─Attention: 4-7 [1, 89, 512] 4,724,224\n", + "│ │ │ └─Attention: 4-8 [1, 89, 512] 4,724,224\n", + "│ │ └─ModuleList: 3-12 -- --\n", + "│ │ │ └─Attention: 4-9 [1, 89, 512] 4,724,224\n", + "│ │ │ └─Attention: 4-10 [1, 89, 512] 4,724,224\n", + "│ │ └─ModuleList: 3-13 -- --\n", + "│ │ │ └─Attention: 4-11 [1, 89, 512] 4,724,224\n", + "│ │ │ └─Attention: 4-12 [1, 89, 512] 4,724,224\n", + "│ │ └─ModuleList: 3-14 -- --\n", + "│ │ │ └─Attention: 4-13 [1, 89, 512] 4,724,224\n", + "│ │ │ └─Attention: 4-14 [1, 89, 512] 4,724,224\n", + "│ │ └─ModuleList: 3-15 -- --\n", + "│ │ │ └─Attention: 4-15 [1, 89, 512] 4,724,224\n", + "│ │ │ └─Attention: 4-16 [1, 89, 512] 4,724,224\n", + "│ └─LayerNorm: 2-11 [1, 89, 512] 1,024\n", + "├─Sequential: 1-6 [1, 89, 58] --\n", + "│ └─LayerNorm: 2-12 [1, 89, 512] 1,024\n", + "│ └─Linear: 2-13 [1, 89, 58] 29,696\n", + "====================================================================================================\n", + "Total params: 80,711,552\n", + "Trainable params: 80,711,552\n", + "Non-trainable params: 0\n", + "Total mult-adds (M): 80.08\n", + "====================================================================================================\n", + "Input size (MB): 0.23\n", + "Forward/backward pass size (MB): 131.05\n", + "Params size (MB): 320.34\n", + "Estimated Total Size (MB): 451.61\n", + "====================================================================================================" + ] + }, + "execution_count": 50, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "summary(net, ((1, 1, 56, 1024), (1, 89)), device=\"cpu\", depth=4)" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "id": "166bf656-aba6-4654-a530-dfce12666297", + "metadata": {}, + "outputs": [], + "source": [ + "t = net(torch.randn(1, 1, 56, 1024), torch.randint(1, 4, (1, 4)))" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "id": "43d9af25-9872-497d-8796-4835a65262ed", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "torch.Size([1, 4, 58])" + ] + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "t.shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "63ac7f1b-0eb1-4625-96b8-467846eb7ae6", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.4" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/04-vit-lines.ipynb b/notebooks/04-vit-lines.ipynb deleted file mode 100644 index b87f38c..0000000 --- a/notebooks/04-vit-lines.ipynb +++ /dev/null @@ -1,305 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "id": "7c02ae76-b540-4b16-9492-e9210b3b9249", - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "os.environ['CUDA_VISIBLE_DEVICE'] = ''\n", - "import random\n", - "\n", - "%matplotlib inline\n", - "import matplotlib.pyplot as plt\n", - "\n", - "import numpy as np\n", - "from omegaconf import OmegaConf\n", - "import torch\n", - "%load_ext autoreload\n", - "%autoreload 2\n", - "\n", - "from importlib.util import find_spec\n", - "if find_spec(\"text_recognizer\") is None:\n", - " import sys\n", - " sys.path.append('..')" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "ccdb6dde-47e5-429a-88f2-0764fb7e259a", - "metadata": {}, - "outputs": [], - "source": [ - "from hydra import compose, initialize\n", - "from omegaconf import OmegaConf\n", - "from hydra.utils import instantiate" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "3cf50475-39f2-4642-a7d1-5bcbc0a036f7", - "metadata": {}, - "outputs": [], - "source": [ - "path = \"../training/conf/network/vit_lines.yaml\"" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "id": "e52ecb01-c975-4e55-925d-1182c7aea473", - "metadata": {}, - "outputs": [], - "source": [ - "with open(path, \"rb\") as f:\n", - " cfg = OmegaConf.load(f)" - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "id": "f939aa37-7b1d-45cc-885c-323c4540bda1", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'_target_': 'text_recognizer.network.vit.VisionTransformer', 'image_height': 56, 'image_width': 1024, 'patch_height': 28, 'patch_width': 32, 'dim': 256, 'num_classes': 57, 'encoder': {'_target_': 'text_recognizer.network.transformer.encoder.Encoder', 'dim': 256, 'inner_dim': 1024, 'heads': 8, 'dim_head': 64, 'depth': 6, 'dropout_rate': 0.0}, 'decoder': {'_target_': 'text_recognizer.network.transformer.decoder.Decoder', 'dim': 256, 'inner_dim': 1024, 'heads': 8, 'dim_head': 64, 'depth': 6, 'dropout_rate': 0.0}, 'token_embedding': {'_target_': 'text_recognizer.network.transformer.embedding.token.TokenEmbedding', 'num_tokens': 57, 'dim': 256, 'use_l2': True}, 'pos_embedding': {'_target_': 'text_recognizer.network.transformer.embedding.absolute.AbsolutePositionalEmbedding', 'dim': 256, 'max_length': 89, 'use_l2': True}, 'tie_embeddings': True, 'pad_index': 3}" - ] - }, - "execution_count": 39, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "cfg" - ] - }, - { - "cell_type": "code", - "execution_count": 40, - "id": "aaeab329-aeb0-4a1b-aa35-5a2aab81b1d0", - "metadata": {}, - "outputs": [], - "source": [ - "net = instantiate(cfg)" - ] - }, - { - "cell_type": "code", - "execution_count": 41, - "id": "618b997c-e6a6-4487-b70c-9d260cb556d3", - "metadata": {}, - "outputs": [], - "source": [ - "from torchinfo import summary" - ] - }, - { - "cell_type": "code", - "execution_count": 43, - "id": "7daf1f49", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "====================================================================================================\n", - "Layer (type:depth-idx) Output Shape Param #\n", - "====================================================================================================\n", - "VisionTransformer [1, 57, 89] --\n", - "├─Sequential: 1-1 [1, 64, 256] --\n", - "│ └─Rearrange: 2-1 [1, 64, 896] --\n", - "│ └─LayerNorm: 2-2 [1, 64, 896] 1,792\n", - "│ └─Linear: 2-3 [1, 64, 256] 229,632\n", - "│ └─LayerNorm: 2-4 [1, 64, 256] 512\n", - "├─Encoder: 1-2 [1, 64, 256] --\n", - "│ └─ModuleList: 2-5 -- --\n", - "│ │ └─ModuleList: 3-1 -- --\n", - "│ │ │ └─Attention: 4-1 [1, 64, 256] 525,824\n", - "│ │ │ └─FeedForward: 4-2 [1, 64, 256] 526,080\n", - "│ │ └─ModuleList: 3-2 -- --\n", - "│ │ │ └─Attention: 4-3 [1, 64, 256] 525,824\n", - "│ │ │ └─FeedForward: 4-4 [1, 64, 256] 526,080\n", - "│ │ └─ModuleList: 3-3 -- --\n", - "│ │ │ └─Attention: 4-5 [1, 64, 256] 525,824\n", - "│ │ │ └─FeedForward: 4-6 [1, 64, 256] 526,080\n", - "│ │ └─ModuleList: 3-4 -- --\n", - "│ │ │ └─Attention: 4-7 [1, 64, 256] 525,824\n", - "│ │ │ └─FeedForward: 4-8 [1, 64, 256] 526,080\n", - "│ │ └─ModuleList: 3-5 -- --\n", - "│ │ │ └─Attention: 4-9 [1, 64, 256] 525,824\n", - "│ │ │ └─FeedForward: 4-10 [1, 64, 256] 526,080\n", - "│ │ └─ModuleList: 3-6 -- --\n", - "│ │ │ └─Attention: 4-11 [1, 64, 256] 525,824\n", - "│ │ │ └─FeedForward: 4-12 [1, 64, 256] 526,080\n", - "│ └─LayerNorm: 2-6 [1, 64, 256] 512\n", - "├─TokenEmbedding: 1-3 [1, 89, 256] --\n", - "│ └─Embedding: 2-7 [1, 89, 256] 14,592\n", - "├─AbsolutePositionalEmbedding: 1-4 [89, 256] --\n", - "│ └─Embedding: 2-8 [89, 256] 22,784\n", - "├─Decoder: 1-5 [1, 89, 256] --\n", - "│ └─ModuleList: 2-9 -- --\n", - "│ │ └─ModuleList: 3-7 -- --\n", - "│ │ │ └─Attention: 4-13 [1, 89, 256] 525,824\n", - "│ │ │ └─FeedForward: 4-14 [1, 89, 256] 526,080\n", - "│ │ │ └─Attention: 4-15 [1, 89, 256] 525,824\n", - "│ │ └─ModuleList: 3-8 -- --\n", - "│ │ │ └─Attention: 4-16 [1, 89, 256] 525,824\n", - "│ │ │ └─FeedForward: 4-17 [1, 89, 256] 526,080\n", - "│ │ │ └─Attention: 4-18 [1, 89, 256] 525,824\n", - "│ │ └─ModuleList: 3-9 -- --\n", - "│ │ │ └─Attention: 4-19 [1, 89, 256] 525,824\n", - "│ │ │ └─FeedForward: 4-20 [1, 89, 256] 526,080\n", - "│ │ │ └─Attention: 4-21 [1, 89, 256] 525,824\n", - "│ │ └─ModuleList: 3-10 -- --\n", - "│ │ │ └─Attention: 4-22 [1, 89, 256] 525,824\n", - "│ │ │ └─FeedForward: 4-23 [1, 89, 256] 526,080\n", - "│ │ │ └─Attention: 4-24 [1, 89, 256] 525,824\n", - "│ │ └─ModuleList: 3-11 -- --\n", - "│ │ │ └─Attention: 4-25 [1, 89, 256] 525,824\n", - "│ │ │ └─FeedForward: 4-26 [1, 89, 256] 526,080\n", - "│ │ │ └─Attention: 4-27 [1, 89, 256] 525,824\n", - "│ │ └─ModuleList: 3-12 -- --\n", - "│ │ │ └─Attention: 4-28 [1, 89, 256] 525,824\n", - "│ │ │ └─FeedForward: 4-29 [1, 89, 256] 526,080\n", - "│ │ │ └─Attention: 4-30 [1, 89, 256] 525,824\n", - "│ └─LayerNorm: 2-10 [1, 89, 256] 512\n", - "====================================================================================================\n", - "Total params: 16,048,128\n", - "Trainable params: 16,048,128\n", - "Non-trainable params: 0\n", - "Total mult-adds (M): 18.03\n", - "====================================================================================================\n", - "Input size (MB): 0.23\n", - "Forward/backward pass size (MB): 46.52\n", - "Params size (MB): 64.16\n", - "Estimated Total Size (MB): 110.91\n", - "====================================================================================================" - ] - }, - "execution_count": 43, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "summary(net, ((1, 1, 56, 1024), (1, 89)), device=\"cpu\", depth=4)" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "id": "1b1a8ac0-bd05-4076-90c2-2de6b740490d", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "import torch" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "id": "248a0cb1", - "metadata": {}, - "outputs": [], - "source": [ - "t = net(torch.randn(1, 1, 56, 1024), torch.randint(1, 4, (1, 4)))" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "id": "c251a954-00ac-4680-87e4-f27b6ce06023", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/plain": [ - "torch.Size([1, 58, 4])" - ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "t.shape" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "id": "02d82c5e-4e67-4f87-a539-393e4cf59b6e", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "loss = torch.nn.CrossEntropyLoss()" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "id": "dc836993-a5d8-43b2-b41c-158a17990075", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/plain": [ - "tensor(4.0604, grad_fn=<NllLoss2DBackward0>)" - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "loss(t.permute(0, 2, 1), torch.randint(0, 58, (1, 89)))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "166bf656-aba6-4654-a530-dfce12666297", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.4" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/notebooks/Untitled1.ipynb b/notebooks/Untitled1.ipynb index 92b35c9..d996e98 100644 --- a/notebooks/Untitled1.ipynb +++ b/notebooks/Untitled1.ipynb @@ -2,12 +2,24 @@ "cells": [ { "cell_type": "code", - "execution_count": 73, + "execution_count": 14, "id": "a15a452c-bbbc-4227-90fb-ad573f82c43f", "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "ename": "ModuleNotFoundError", + "evalue": "No module named 'text_recognizer.network.transformer.embeddings'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[14], line 15\u001b[0m\n\u001b[1;32m 12\u001b[0m sys\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mappend(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m..\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[1;32m 14\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtext_recognizer\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdata\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01miam_lines\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m IAMLines\n\u001b[0;32m---> 15\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtext_recognizer\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mnetwork\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mtransformer\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01membeddings\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01msincos\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m sincos_2d\n", + "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'text_recognizer.network.transformer.embeddings'" + ] + } + ], "source": [ "import matplotlib.pyplot as plt\n", "import numpy as np\n", @@ -430,6 +442,101 @@ "plt.figure(figsize=(40, 20))\n", "plt.imshow(xxx, cmap='gray')" ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "4535cb1c-fc97-4393-8348-649d06030931", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "conv = torch.nn.Conv2d(1, 1, kernel_size=3, stride=2)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "d3092482-a2d5-4b52-9967-8d18cab35f42", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "torch.Size([1, 1, 13, 255])" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "x = torch.randn(1, 1, 56, 1024)\n", + "conv(conv(x)).shape" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "44183fe9-f798-4755-8db5-491b8a3dbf2f", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor([[True, True, True],\n", + " [True, True, True]])\n" + ] + } + ], + "source": [ + "import torch\n", + "import torch.nn.functional as F\n", + "source = torch.randn((2,3)).bool()\n", + "print(source)\n", + "# now we expand to size (7, 11) by appending a row of 0s at pos 0 and pos 6, \n", + "# and a column of 0s at pos 10\n", + "result = F.pad(source, pad=(0, 1), value=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "34220755-4224-458e-94b8-ea1fa0488a65", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "tensor([[ True, True, True, False],\n", + " [ True, True, True, False]])" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "result" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dfc269b8-8242-41ee-9190-73df0af47e83", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { |