summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--.pre-commit-config.yaml6
-rw-r--r--notebooks/04-convnext.ipynb156
-rw-r--r--notebooks/04-mammut-lines.ipynb251
-rw-r--r--notebooks/04-vit-lines.ipynb305
-rw-r--r--notebooks/Untitled1.ipynb111
5 files changed, 408 insertions, 421 deletions
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 40f4423..1167048 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -7,16 +7,12 @@ repos:
- id: trailing-whitespace
- repo: local
hooks:
- - id: black
- name: black
- entry: black
- language: system
- types: [python]
- id: ruff
name: ruff
entry: ruff
language: system
types: [python]
+ args: [--fix, --exit-non-zero-on-fix]
- repo: https://github.com/compilerla/conventional-pre-commit
rev: v1.2.0
hooks:
diff --git a/notebooks/04-convnext.ipynb b/notebooks/04-convnext.ipynb
index 5ab71c8..5affe85 100644
--- a/notebooks/04-convnext.ipynb
+++ b/notebooks/04-convnext.ipynb
@@ -2,19 +2,12 @@
"cells": [
{
"cell_type": "code",
- "execution_count": 14,
+ "execution_count": 1,
"id": "7c02ae76-b540-4b16-9492-e9210b3b9249",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "The autoreload extension is already loaded. To reload it, use:\n",
- " %reload_ext autoreload\n"
- ]
- }
- ],
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
"source": [
"import os\n",
"os.environ['CUDA_VISIBLE_DEVICE'] = ''\n",
@@ -37,9 +30,11 @@
},
{
"cell_type": "code",
- "execution_count": 15,
+ "execution_count": 2,
"id": "ccdb6dde-47e5-429a-88f2-0764fb7e259a",
- "metadata": {},
+ "metadata": {
+ "tags": []
+ },
"outputs": [],
"source": [
"from hydra import compose, initialize\n",
@@ -49,9 +44,11 @@
},
{
"cell_type": "code",
- "execution_count": 16,
+ "execution_count": 3,
"id": "3cf50475-39f2-4642-a7d1-5bcbc0a036f7",
- "metadata": {},
+ "metadata": {
+ "tags": []
+ },
"outputs": [],
"source": [
"path = \"../training/conf/network/convnext.yaml\""
@@ -59,9 +56,11 @@
},
{
"cell_type": "code",
- "execution_count": 37,
+ "execution_count": 64,
"id": "e52ecb01-c975-4e55-925d-1182c7aea473",
- "metadata": {},
+ "metadata": {
+ "tags": []
+ },
"outputs": [],
"source": [
"with open(path, \"rb\") as f:\n",
@@ -70,17 +69,19 @@
},
{
"cell_type": "code",
- "execution_count": 38,
+ "execution_count": 65,
"id": "f939aa37-7b1d-45cc-885c-323c4540bda1",
- "metadata": {},
+ "metadata": {
+ "tags": []
+ },
"outputs": [
{
"data": {
"text/plain": [
- "{'_target_': 'text_recognizer.network.convnext.ConvNext', 'dim': 16, 'dim_mults': [2, 4, 8], 'depths': [3, 3, 6], 'downsampling_factors': [[2, 2], [2, 2], [2, 2]], 'attn': {'_target_': 'text_recognizer.network.convnext.TransformerBlock', 'attn': {'_target_': 'text_recognizer.network.convnext.Attention', 'dim': 128, 'heads': 4, 'dim_head': 64, 'scale': 8}, 'ff': {'_target_': 'text_recognizer.network.convnext.FeedForward', 'dim': 128, 'mult': 4}}}"
+ "{'_target_': 'text_recognizer.network.convnext.convnext.ConvNext', 'dim': 8, 'dim_mults': [2, 8], 'depths': [2, 2], 'attn': {'_target_': 'text_recognizer.network.convnext.transformer.Transformer', 'attn': {'_target_': 'text_recognizer.network.convnext.transformer.Attention', 'dim': 64, 'heads': 4, 'dim_head': 64, 'scale': 8}, 'ff': {'_target_': 'text_recognizer.network.convnext.transformer.FeedForward', 'dim': 64, 'mult': 4}}}"
]
},
- "execution_count": 38,
+ "execution_count": 65,
"metadata": {},
"output_type": "execute_result"
}
@@ -91,21 +92,11 @@
},
{
"cell_type": "code",
- "execution_count": 19,
- "id": "a2b420c1",
- "metadata": {},
- "outputs": [],
- "source": [
- "cfg.dim_mults = [2, 4, 8, 8]\n",
- "cfg.depths = [3, 3, 6, 6]\n",
- "cfg.downsampling_factors = [[2, 2], [2, 2], [2, 2], [2, 1]]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 39,
+ "execution_count": 66,
"id": "c9589350",
- "metadata": {},
+ "metadata": {
+ "tags": []
+ },
"outputs": [],
"source": [
"net = instantiate(cfg)"
@@ -113,9 +104,11 @@
},
{
"cell_type": "code",
- "execution_count": 40,
+ "execution_count": 67,
"id": "618b997c-e6a6-4487-b70c-9d260cb556d3",
- "metadata": {},
+ "metadata": {
+ "tags": []
+ },
"outputs": [],
"source": [
"from torchinfo import summary"
@@ -123,11 +116,9 @@
},
{
"cell_type": "code",
- "execution_count": 41,
+ "execution_count": 68,
"id": "25759b7b-8deb-4163-b75d-a1357c9fe88f",
- "metadata": {
- "scrolled": false
- },
+ "metadata": {},
"outputs": [
{
"data": {
@@ -135,84 +126,31 @@
"====================================================================================================\n",
"Layer (type:depth-idx) Output Shape Param #\n",
"====================================================================================================\n",
- "ConvNext [2, 128, 72, 80] 165,408\n",
- "├─TransformerBlock: 1-5 [2, 128, 72, 80] (recursive)\n",
- "│ └─Attention: 2-6 [2, 128, 72, 80] (recursive)\n",
- "│ │ └─LayerNorm: 3-13 [2, 128, 72, 80] (recursive)\n",
- "├─Conv2d: 1-3 [2, 16, 576, 640] (recursive)\n",
- "├─TransformerBlock: 1 -- --\n",
- "│ └─Attention: 2 -- --\n",
- "│ │ └─Conv2d: 3-15 [2, 128, 72, 80] (recursive)\n",
- "│ └─FeedForward: 2-7 [2, 128, 72, 80] (recursive)\n",
- "│ │ └─Residual: 3-16 [2, 128, 72, 80] (recursive)\n",
- "│ │ │ └─Sequential: 4-26 [2, 128, 72, 80] (recursive)\n",
- "├─Conv2d: 1-3 [2, 16, 576, 640] (recursive)\n",
- "├─ModuleList: 1-4 -- --\n",
- "│ └─ModuleList: 2-3 -- --\n",
- "│ │ └─ConvNextBlock: 3-4 [2, 16, 576, 640] --\n",
- "│ │ │ └─Conv2d: 4-2 [2, 16, 576, 640] 800\n",
- "│ │ │ └─Sequential: 4-3 [2, 16, 576, 640] 9,280\n",
- "│ │ │ └─Identity: 4-4 [2, 16, 576, 640] --\n",
- "│ │ └─ModuleList: 3-5 -- --\n",
- "│ │ │ └─ConvNextBlock: 4-5 [2, 16, 576, 640] 10,080\n",
- "│ │ │ └─ConvNextBlock: 4-6 [2, 16, 576, 640] 10,080\n",
- "│ │ │ └─ConvNextBlock: 4-7 [2, 16, 576, 640] 10,080\n",
- "│ │ └─Downsample: 3-6 [2, 32, 288, 320] --\n",
- "│ │ │ └─Sequential: 4-8 [2, 32, 288, 320] 2,080\n",
- "│ └─ModuleList: 2-4 -- --\n",
- "│ │ └─ConvNextBlock: 3-7 [2, 32, 288, 320] --\n",
- "│ │ │ └─Conv2d: 4-9 [2, 32, 288, 320] 1,600\n",
- "│ │ │ └─Sequential: 4-10 [2, 32, 288, 320] 36,992\n",
- "│ │ │ └─Identity: 4-11 [2, 32, 288, 320] --\n",
- "│ │ └─ModuleList: 3-8 -- --\n",
- "│ │ │ └─ConvNextBlock: 4-12 [2, 32, 288, 320] 38,592\n",
- "│ │ │ └─ConvNextBlock: 4-13 [2, 32, 288, 320] 38,592\n",
- "│ │ │ └─ConvNextBlock: 4-14 [2, 32, 288, 320] 38,592\n",
- "│ │ └─Downsample: 3-9 [2, 64, 144, 160] --\n",
- "│ │ │ └─Sequential: 4-15 [2, 64, 144, 160] 8,256\n",
- "│ └─ModuleList: 2-5 -- --\n",
- "│ │ └─ConvNextBlock: 3-10 [2, 64, 144, 160] --\n",
- "│ │ │ └─Conv2d: 4-16 [2, 64, 144, 160] 3,200\n",
- "│ │ │ └─Sequential: 4-17 [2, 64, 144, 160] 147,712\n",
- "│ │ │ └─Identity: 4-18 [2, 64, 144, 160] --\n",
- "│ │ └─ModuleList: 3-11 -- --\n",
- "│ │ │ └─ConvNextBlock: 4-19 [2, 64, 144, 160] 150,912\n",
- "│ │ │ └─ConvNextBlock: 4-20 [2, 64, 144, 160] 150,912\n",
- "│ │ │ └─ConvNextBlock: 4-21 [2, 64, 144, 160] 150,912\n",
- "│ │ │ └─ConvNextBlock: 4-22 [2, 64, 144, 160] 150,912\n",
- "│ │ │ └─ConvNextBlock: 4-23 [2, 64, 144, 160] 150,912\n",
- "│ │ │ └─ConvNextBlock: 4-24 [2, 64, 144, 160] 150,912\n",
- "│ │ └─Downsample: 3-12 [2, 128, 72, 80] --\n",
- "│ │ │ └─Sequential: 4-25 [2, 128, 72, 80] 32,896\n",
- "├─TransformerBlock: 1-5 [2, 128, 72, 80] (recursive)\n",
- "│ └─Attention: 2-6 [2, 128, 72, 80] (recursive)\n",
- "│ │ └─LayerNorm: 3-13 [2, 128, 72, 80] (recursive)\n",
- "│ │ └─Conv2d: 3-14 [2, 768, 72, 80] 98,304\n",
- "│ │ └─Conv2d: 3-15 [2, 128, 72, 80] (recursive)\n",
- "│ └─FeedForward: 2-7 [2, 128, 72, 80] (recursive)\n",
- "│ │ └─Residual: 3-16 [2, 128, 72, 80] (recursive)\n",
- "│ │ │ └─Sequential: 4-26 [2, 128, 72, 80] (recursive)\n",
- "├─LayerNorm: 1-6 [2, 128, 72, 80] 128\n",
+ "ConvNext [2, 64, 14, 256] --\n",
+ "├─Conv2d: 1-1 [2, 8, 56, 1024] 400\n",
+ "├─ModuleList: 1-2 -- 30,512\n",
+ "├─Transformer: 1-3 [2, 64, 14, 256] 98,688\n",
+ "├─LayerNorm: 1-4 [2, 64, 14, 256] 64\n",
"====================================================================================================\n",
- "Total params: 1,558,144\n",
- "Trainable params: 1,558,144\n",
+ "Total params: 129,664\n",
+ "Trainable params: 129,664\n",
"Non-trainable params: 0\n",
- "Total mult-adds (G): 114.00\n",
+ "Total mult-adds (G): 2.00\n",
"====================================================================================================\n",
- "Input size (MB): 2.95\n",
- "Forward/backward pass size (MB): 3822.06\n",
- "Params size (MB): 5.57\n",
- "Estimated Total Size (MB): 3830.58\n",
+ "Input size (MB): 0.46\n",
+ "Forward/backward pass size (MB): 260.57\n",
+ "Params size (MB): 0.52\n",
+ "Estimated Total Size (MB): 261.55\n",
"===================================================================================================="
]
},
- "execution_count": 41,
+ "execution_count": 68,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "summary(net, (2, 1, 576, 640), device=\"cpu\", depth=4)"
+ "summary(net, (2, 1, 56, 1024), device=\"cpu\", depth=1)"
]
},
{
diff --git a/notebooks/04-mammut-lines.ipynb b/notebooks/04-mammut-lines.ipynb
new file mode 100644
index 0000000..b0690f7
--- /dev/null
+++ b/notebooks/04-mammut-lines.ipynb
@@ -0,0 +1,251 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "7c02ae76-b540-4b16-9492-e9210b3b9249",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "os.environ['CUDA_VISIBLE_DEVICE'] = ''\n",
+ "import random\n",
+ "\n",
+ "%matplotlib inline\n",
+ "import matplotlib.pyplot as plt\n",
+ "\n",
+ "import numpy as np\n",
+ "from omegaconf import OmegaConf\n",
+ "import torch\n",
+ "%load_ext autoreload\n",
+ "%autoreload 2\n",
+ "\n",
+ "from importlib.util import find_spec\n",
+ "if find_spec(\"text_recognizer\") is None:\n",
+ " import sys\n",
+ " sys.path.append('..')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "ccdb6dde-47e5-429a-88f2-0764fb7e259a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from hydra import compose, initialize\n",
+ "from omegaconf import OmegaConf\n",
+ "from hydra.utils import instantiate"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "id": "3cf50475-39f2-4642-a7d1-5bcbc0a036f7",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "path = \"../training/conf/network/mammut_lines.yaml\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 45,
+ "id": "e52ecb01-c975-4e55-925d-1182c7aea473",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "with open(path, \"rb\") as f:\n",
+ " cfg = OmegaConf.load(f)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 46,
+ "id": "f939aa37-7b1d-45cc-885c-323c4540bda1",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'_target_': 'text_recognizer.network.mammut.MaMMUT', 'encoder': {'_target_': 'text_recognizer.network.vit.Vit', 'image_height': 56, 'image_width': 1024, 'patch_height': 56, 'patch_width': 8, 'dim': 512, 'encoder': {'_target_': 'text_recognizer.network.transformer.encoder.Encoder', 'dim': 512, 'heads': 12, 'dim_head': 64, 'ff_mult': 4, 'depth': 4, 'dropout_rate': 0.1}, 'channels': 1}, 'image_attn_pool': {'_target_': 'text_recognizer.network.transformer.attention.Attention', 'dim': 512, 'heads': 8, 'causal': False, 'dim_head': 64, 'ff_mult': 4, 'dropout_rate': 0.0, 'use_flash': True, 'norm_context': True, 'rotary_emb': None}, 'decoder': {'_target_': 'text_recognizer.network.transformer.decoder.Decoder', 'dim': 512, 'ff_mult': 4, 'heads': 12, 'dim_head': 64, 'depth': 6, 'dropout_rate': 0.1}, 'dim': 512, 'dim_latent': 512, 'num_tokens': 58, 'pad_index': 3, 'num_image_queries': 256}"
+ ]
+ },
+ "execution_count": 46,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "cfg"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 47,
+ "id": "aaeab329-aeb0-4a1b-aa35-5a2aab81b1d0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "net = instantiate(cfg)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 39,
+ "id": "618b997c-e6a6-4487-b70c-9d260cb556d3",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from torchinfo import summary"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 50,
+ "id": "7daf1f49",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "====================================================================================================\n",
+ "Layer (type:depth-idx) Output Shape Param #\n",
+ "====================================================================================================\n",
+ "MaMMUT [1, 89, 58] 627,712\n",
+ "├─Vit: 1-1 [1, 128, 512] --\n",
+ "│ └─Sequential: 2-1 [1, 128, 512] --\n",
+ "│ │ └─Rearrange: 3-1 [1, 128, 448] --\n",
+ "│ │ └─LayerNorm: 3-2 [1, 128, 448] 896\n",
+ "│ │ └─Linear: 3-3 [1, 128, 512] 229,888\n",
+ "│ │ └─LayerNorm: 3-4 [1, 128, 512] 1,024\n",
+ "│ └─Encoder: 2-2 [1, 128, 512] --\n",
+ "│ │ └─ModuleList: 3-5 -- --\n",
+ "│ │ │ └─Attention: 4-1 [1, 128, 512] 4,724,224\n",
+ "│ │ │ └─Attention: 4-2 [1, 128, 512] 4,724,224\n",
+ "│ │ │ └─Attention: 4-3 [1, 128, 512] 4,724,224\n",
+ "│ │ │ └─Attention: 4-4 [1, 128, 512] 4,724,224\n",
+ "│ │ └─LayerNorm: 3-6 [1, 128, 512] 1,024\n",
+ "├─Attention: 1-2 [1, 257, 512] --\n",
+ "│ └─LayerNorm: 2-3 [1, 257, 512] 1,024\n",
+ "│ └─Linear: 2-4 [1, 257, 512] 262,144\n",
+ "│ └─LayerNorm: 2-5 [1, 128, 512] 1,024\n",
+ "│ └─Linear: 2-6 [1, 128, 1024] 524,288\n",
+ "│ └─Attend: 2-7 [1, 8, 257, 64] --\n",
+ "│ └─Linear: 2-8 [1, 257, 512] 262,144\n",
+ "│ └─Sequential: 2-9 [1, 257, 512] --\n",
+ "│ │ └─Linear: 3-7 [1, 257, 4096] 2,101,248\n",
+ "│ │ └─SwiGLU: 3-8 [1, 257, 2048] --\n",
+ "│ │ └─Linear: 3-9 [1, 257, 512] 1,049,088\n",
+ "├─LayerNorm: 1-3 [1, 257, 512] 1,024\n",
+ "├─Embedding: 1-4 [1, 89, 512] 29,696\n",
+ "├─Decoder: 1-5 [1, 89, 512] --\n",
+ "│ └─ModuleList: 2-10 -- --\n",
+ "│ │ └─ModuleList: 3-10 -- --\n",
+ "│ │ │ └─Attention: 4-5 [1, 89, 512] 4,724,224\n",
+ "│ │ │ └─Attention: 4-6 [1, 89, 512] 4,724,224\n",
+ "│ │ └─ModuleList: 3-11 -- --\n",
+ "│ │ │ └─Attention: 4-7 [1, 89, 512] 4,724,224\n",
+ "│ │ │ └─Attention: 4-8 [1, 89, 512] 4,724,224\n",
+ "│ │ └─ModuleList: 3-12 -- --\n",
+ "│ │ │ └─Attention: 4-9 [1, 89, 512] 4,724,224\n",
+ "│ │ │ └─Attention: 4-10 [1, 89, 512] 4,724,224\n",
+ "│ │ └─ModuleList: 3-13 -- --\n",
+ "│ │ │ └─Attention: 4-11 [1, 89, 512] 4,724,224\n",
+ "│ │ │ └─Attention: 4-12 [1, 89, 512] 4,724,224\n",
+ "│ │ └─ModuleList: 3-14 -- --\n",
+ "│ │ │ └─Attention: 4-13 [1, 89, 512] 4,724,224\n",
+ "│ │ │ └─Attention: 4-14 [1, 89, 512] 4,724,224\n",
+ "│ │ └─ModuleList: 3-15 -- --\n",
+ "│ │ │ └─Attention: 4-15 [1, 89, 512] 4,724,224\n",
+ "│ │ │ └─Attention: 4-16 [1, 89, 512] 4,724,224\n",
+ "│ └─LayerNorm: 2-11 [1, 89, 512] 1,024\n",
+ "├─Sequential: 1-6 [1, 89, 58] --\n",
+ "│ └─LayerNorm: 2-12 [1, 89, 512] 1,024\n",
+ "│ └─Linear: 2-13 [1, 89, 58] 29,696\n",
+ "====================================================================================================\n",
+ "Total params: 80,711,552\n",
+ "Trainable params: 80,711,552\n",
+ "Non-trainable params: 0\n",
+ "Total mult-adds (M): 80.08\n",
+ "====================================================================================================\n",
+ "Input size (MB): 0.23\n",
+ "Forward/backward pass size (MB): 131.05\n",
+ "Params size (MB): 320.34\n",
+ "Estimated Total Size (MB): 451.61\n",
+ "===================================================================================================="
+ ]
+ },
+ "execution_count": 50,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "summary(net, ((1, 1, 56, 1024), (1, 89)), device=\"cpu\", depth=4)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 48,
+ "id": "166bf656-aba6-4654-a530-dfce12666297",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "t = net(torch.randn(1, 1, 56, 1024), torch.randint(1, 4, (1, 4)))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 49,
+ "id": "43d9af25-9872-497d-8796-4835a65262ed",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "torch.Size([1, 4, 58])"
+ ]
+ },
+ "execution_count": 49,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "t.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "63ac7f1b-0eb1-4625-96b8-467846eb7ae6",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.4"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/notebooks/04-vit-lines.ipynb b/notebooks/04-vit-lines.ipynb
deleted file mode 100644
index b87f38c..0000000
--- a/notebooks/04-vit-lines.ipynb
+++ /dev/null
@@ -1,305 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "code",
- "execution_count": 1,
- "id": "7c02ae76-b540-4b16-9492-e9210b3b9249",
- "metadata": {},
- "outputs": [],
- "source": [
- "import os\n",
- "os.environ['CUDA_VISIBLE_DEVICE'] = ''\n",
- "import random\n",
- "\n",
- "%matplotlib inline\n",
- "import matplotlib.pyplot as plt\n",
- "\n",
- "import numpy as np\n",
- "from omegaconf import OmegaConf\n",
- "import torch\n",
- "%load_ext autoreload\n",
- "%autoreload 2\n",
- "\n",
- "from importlib.util import find_spec\n",
- "if find_spec(\"text_recognizer\") is None:\n",
- " import sys\n",
- " sys.path.append('..')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "id": "ccdb6dde-47e5-429a-88f2-0764fb7e259a",
- "metadata": {},
- "outputs": [],
- "source": [
- "from hydra import compose, initialize\n",
- "from omegaconf import OmegaConf\n",
- "from hydra.utils import instantiate"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "id": "3cf50475-39f2-4642-a7d1-5bcbc0a036f7",
- "metadata": {},
- "outputs": [],
- "source": [
- "path = \"../training/conf/network/vit_lines.yaml\""
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 38,
- "id": "e52ecb01-c975-4e55-925d-1182c7aea473",
- "metadata": {},
- "outputs": [],
- "source": [
- "with open(path, \"rb\") as f:\n",
- " cfg = OmegaConf.load(f)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 39,
- "id": "f939aa37-7b1d-45cc-885c-323c4540bda1",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "{'_target_': 'text_recognizer.network.vit.VisionTransformer', 'image_height': 56, 'image_width': 1024, 'patch_height': 28, 'patch_width': 32, 'dim': 256, 'num_classes': 57, 'encoder': {'_target_': 'text_recognizer.network.transformer.encoder.Encoder', 'dim': 256, 'inner_dim': 1024, 'heads': 8, 'dim_head': 64, 'depth': 6, 'dropout_rate': 0.0}, 'decoder': {'_target_': 'text_recognizer.network.transformer.decoder.Decoder', 'dim': 256, 'inner_dim': 1024, 'heads': 8, 'dim_head': 64, 'depth': 6, 'dropout_rate': 0.0}, 'token_embedding': {'_target_': 'text_recognizer.network.transformer.embedding.token.TokenEmbedding', 'num_tokens': 57, 'dim': 256, 'use_l2': True}, 'pos_embedding': {'_target_': 'text_recognizer.network.transformer.embedding.absolute.AbsolutePositionalEmbedding', 'dim': 256, 'max_length': 89, 'use_l2': True}, 'tie_embeddings': True, 'pad_index': 3}"
- ]
- },
- "execution_count": 39,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "cfg"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 40,
- "id": "aaeab329-aeb0-4a1b-aa35-5a2aab81b1d0",
- "metadata": {},
- "outputs": [],
- "source": [
- "net = instantiate(cfg)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 41,
- "id": "618b997c-e6a6-4487-b70c-9d260cb556d3",
- "metadata": {},
- "outputs": [],
- "source": [
- "from torchinfo import summary"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 43,
- "id": "7daf1f49",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "====================================================================================================\n",
- "Layer (type:depth-idx) Output Shape Param #\n",
- "====================================================================================================\n",
- "VisionTransformer [1, 57, 89] --\n",
- "├─Sequential: 1-1 [1, 64, 256] --\n",
- "│ └─Rearrange: 2-1 [1, 64, 896] --\n",
- "│ └─LayerNorm: 2-2 [1, 64, 896] 1,792\n",
- "│ └─Linear: 2-3 [1, 64, 256] 229,632\n",
- "│ └─LayerNorm: 2-4 [1, 64, 256] 512\n",
- "├─Encoder: 1-2 [1, 64, 256] --\n",
- "│ └─ModuleList: 2-5 -- --\n",
- "│ │ └─ModuleList: 3-1 -- --\n",
- "│ │ │ └─Attention: 4-1 [1, 64, 256] 525,824\n",
- "│ │ │ └─FeedForward: 4-2 [1, 64, 256] 526,080\n",
- "│ │ └─ModuleList: 3-2 -- --\n",
- "│ │ │ └─Attention: 4-3 [1, 64, 256] 525,824\n",
- "│ │ │ └─FeedForward: 4-4 [1, 64, 256] 526,080\n",
- "│ │ └─ModuleList: 3-3 -- --\n",
- "│ │ │ └─Attention: 4-5 [1, 64, 256] 525,824\n",
- "│ │ │ └─FeedForward: 4-6 [1, 64, 256] 526,080\n",
- "│ │ └─ModuleList: 3-4 -- --\n",
- "│ │ │ └─Attention: 4-7 [1, 64, 256] 525,824\n",
- "│ │ │ └─FeedForward: 4-8 [1, 64, 256] 526,080\n",
- "│ │ └─ModuleList: 3-5 -- --\n",
- "│ │ │ └─Attention: 4-9 [1, 64, 256] 525,824\n",
- "│ │ │ └─FeedForward: 4-10 [1, 64, 256] 526,080\n",
- "│ │ └─ModuleList: 3-6 -- --\n",
- "│ │ │ └─Attention: 4-11 [1, 64, 256] 525,824\n",
- "│ │ │ └─FeedForward: 4-12 [1, 64, 256] 526,080\n",
- "│ └─LayerNorm: 2-6 [1, 64, 256] 512\n",
- "├─TokenEmbedding: 1-3 [1, 89, 256] --\n",
- "│ └─Embedding: 2-7 [1, 89, 256] 14,592\n",
- "├─AbsolutePositionalEmbedding: 1-4 [89, 256] --\n",
- "│ └─Embedding: 2-8 [89, 256] 22,784\n",
- "├─Decoder: 1-5 [1, 89, 256] --\n",
- "│ └─ModuleList: 2-9 -- --\n",
- "│ │ └─ModuleList: 3-7 -- --\n",
- "│ │ │ └─Attention: 4-13 [1, 89, 256] 525,824\n",
- "│ │ │ └─FeedForward: 4-14 [1, 89, 256] 526,080\n",
- "│ │ │ └─Attention: 4-15 [1, 89, 256] 525,824\n",
- "│ │ └─ModuleList: 3-8 -- --\n",
- "│ │ │ └─Attention: 4-16 [1, 89, 256] 525,824\n",
- "│ │ │ └─FeedForward: 4-17 [1, 89, 256] 526,080\n",
- "│ │ │ └─Attention: 4-18 [1, 89, 256] 525,824\n",
- "│ │ └─ModuleList: 3-9 -- --\n",
- "│ │ │ └─Attention: 4-19 [1, 89, 256] 525,824\n",
- "│ │ │ └─FeedForward: 4-20 [1, 89, 256] 526,080\n",
- "│ │ │ └─Attention: 4-21 [1, 89, 256] 525,824\n",
- "│ │ └─ModuleList: 3-10 -- --\n",
- "│ │ │ └─Attention: 4-22 [1, 89, 256] 525,824\n",
- "│ │ │ └─FeedForward: 4-23 [1, 89, 256] 526,080\n",
- "│ │ │ └─Attention: 4-24 [1, 89, 256] 525,824\n",
- "│ │ └─ModuleList: 3-11 -- --\n",
- "│ │ │ └─Attention: 4-25 [1, 89, 256] 525,824\n",
- "│ │ │ └─FeedForward: 4-26 [1, 89, 256] 526,080\n",
- "│ │ │ └─Attention: 4-27 [1, 89, 256] 525,824\n",
- "│ │ └─ModuleList: 3-12 -- --\n",
- "│ │ │ └─Attention: 4-28 [1, 89, 256] 525,824\n",
- "│ │ │ └─FeedForward: 4-29 [1, 89, 256] 526,080\n",
- "│ │ │ └─Attention: 4-30 [1, 89, 256] 525,824\n",
- "│ └─LayerNorm: 2-10 [1, 89, 256] 512\n",
- "====================================================================================================\n",
- "Total params: 16,048,128\n",
- "Trainable params: 16,048,128\n",
- "Non-trainable params: 0\n",
- "Total mult-adds (M): 18.03\n",
- "====================================================================================================\n",
- "Input size (MB): 0.23\n",
- "Forward/backward pass size (MB): 46.52\n",
- "Params size (MB): 64.16\n",
- "Estimated Total Size (MB): 110.91\n",
- "===================================================================================================="
- ]
- },
- "execution_count": 43,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "summary(net, ((1, 1, 56, 1024), (1, 89)), device=\"cpu\", depth=4)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 20,
- "id": "1b1a8ac0-bd05-4076-90c2-2de6b740490d",
- "metadata": {
- "tags": []
- },
- "outputs": [],
- "source": [
- "import torch"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 24,
- "id": "248a0cb1",
- "metadata": {},
- "outputs": [],
- "source": [
- "t = net(torch.randn(1, 1, 56, 1024), torch.randint(1, 4, (1, 4)))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 25,
- "id": "c251a954-00ac-4680-87e4-f27b6ce06023",
- "metadata": {
- "tags": []
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "torch.Size([1, 58, 4])"
- ]
- },
- "execution_count": 25,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "t.shape"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 17,
- "id": "02d82c5e-4e67-4f87-a539-393e4cf59b6e",
- "metadata": {
- "tags": []
- },
- "outputs": [],
- "source": [
- "loss = torch.nn.CrossEntropyLoss()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 19,
- "id": "dc836993-a5d8-43b2-b41c-158a17990075",
- "metadata": {
- "tags": []
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "tensor(4.0604, grad_fn=<NllLoss2DBackward0>)"
- ]
- },
- "execution_count": 19,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "loss(t.permute(0, 2, 1), torch.randint(0, 58, (1, 89)))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "166bf656-aba6-4654-a530-dfce12666297",
- "metadata": {},
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3 (ipykernel)",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.9.4"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
diff --git a/notebooks/Untitled1.ipynb b/notebooks/Untitled1.ipynb
index 92b35c9..d996e98 100644
--- a/notebooks/Untitled1.ipynb
+++ b/notebooks/Untitled1.ipynb
@@ -2,12 +2,24 @@
"cells": [
{
"cell_type": "code",
- "execution_count": 73,
+ "execution_count": 14,
"id": "a15a452c-bbbc-4227-90fb-ad573f82c43f",
"metadata": {
"tags": []
},
- "outputs": [],
+ "outputs": [
+ {
+ "ename": "ModuleNotFoundError",
+ "evalue": "No module named 'text_recognizer.network.transformer.embeddings'",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
+ "Cell \u001b[0;32mIn[14], line 15\u001b[0m\n\u001b[1;32m 12\u001b[0m sys\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mappend(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m..\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[1;32m 14\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtext_recognizer\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdata\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01miam_lines\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m IAMLines\n\u001b[0;32m---> 15\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtext_recognizer\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mnetwork\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mtransformer\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01membeddings\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01msincos\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m sincos_2d\n",
+ "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'text_recognizer.network.transformer.embeddings'"
+ ]
+ }
+ ],
"source": [
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
@@ -430,6 +442,101 @@
"plt.figure(figsize=(40, 20))\n",
"plt.imshow(xxx, cmap='gray')"
]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "id": "4535cb1c-fc97-4393-8348-649d06030931",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "conv = torch.nn.Conv2d(1, 1, kernel_size=3, stride=2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "id": "d3092482-a2d5-4b52-9967-8d18cab35f42",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "torch.Size([1, 1, 13, 255])"
+ ]
+ },
+ "execution_count": 19,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "x = torch.randn(1, 1, 56, 1024)\n",
+ "conv(conv(x)).shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "id": "44183fe9-f798-4755-8db5-491b8a3dbf2f",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "tensor([[True, True, True],\n",
+ " [True, True, True]])\n"
+ ]
+ }
+ ],
+ "source": [
+ "import torch\n",
+ "import torch.nn.functional as F\n",
+ "source = torch.randn((2,3)).bool()\n",
+ "print(source)\n",
+ "# pad=(0, 1) pads only the last dimension: nothing on the left and one\n",
+ "# column of False on the right, so the (2, 3) mask becomes (2, 4)\n",
+ "result = F.pad(source, pad=(0, 1), value=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "id": "34220755-4224-458e-94b8-ea1fa0488a65",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "tensor([[ True, True, True, False],\n",
+ " [ True, True, True, False]])"
+ ]
+ },
+ "execution_count": 19,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "result"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "dfc269b8-8242-41ee-9190-73df0af47e83",
+ "metadata": {},
+ "outputs": [],
+ "source": []
}
],
"metadata": {