From 9426cc794d8c28a65bbbf5ae5466a0a343078558 Mon Sep 17 00:00:00 2001 From: Gustaf Rydholm Date: Sun, 25 Apr 2021 23:32:50 +0200 Subject: Efficient net and non working transformer model. --- notebooks/00-testing-stuff-out.ipynb | 555 ++++++--------------- notebooks/03-look-at-iam-paragraphs.ipynb | 63 ++- text_recognizer/data/iam_extended_paragraphs.py | 12 +- text_recognizer/data/iam_paragraphs.py | 13 +- text_recognizer/data/iam_synthetic_paragraphs.py | 2 +- text_recognizer/data/mappings.py | 12 +- text_recognizer/data/transforms.py | 4 +- text_recognizer/models/transformer.py | 2 +- text_recognizer/networks/__init__.py | 2 + text_recognizer/networks/backbones/__init__.py | 2 + text_recognizer/networks/backbones/efficientnet.py | 145 ++++++ text_recognizer/networks/cnn_transformer.py | 37 +- text_recognizer/networks/coat/__init__.py | 0 text_recognizer/networks/coat/factor_attention.py | 9 - text_recognizer/networks/coat/patch_embedding.py | 38 -- .../networks/coat/positional_encodings.py | 76 --- text_recognizer/networks/residual_network.py | 6 +- text_recognizer/networks/transducer/transducer.py | 7 +- .../networks/transformer/positional_encoding.py | 6 +- .../networks/transformer/rotary_embedding.py | 39 ++ text_recognizer/networks/vqvae/decoder.py | 18 +- text_recognizer/networks/vqvae/encoder.py | 12 +- training/configs/cnn_transformer.yaml | 90 ++++ training/configs/image_transformer.yaml | 89 ---- training/configs/vqvae.yaml | 10 +- 25 files changed, 543 insertions(+), 706 deletions(-) create mode 100644 text_recognizer/networks/backbones/__init__.py create mode 100644 text_recognizer/networks/backbones/efficientnet.py delete mode 100644 text_recognizer/networks/coat/__init__.py delete mode 100644 text_recognizer/networks/coat/factor_attention.py delete mode 100644 text_recognizer/networks/coat/patch_embedding.py delete mode 100644 text_recognizer/networks/coat/positional_encodings.py create mode 100644 text_recognizer/networks/transformer/rotary_embedding.py create mode 100644 training/configs/cnn_transformer.yaml delete mode 100644 training/configs/image_transformer.yaml diff --git a/notebooks/00-testing-stuff-out.ipynb b/notebooks/00-testing-stuff-out.ipynb index e6cf099..7c7b3a6 100644 --- a/notebooks/00-testing-stuff-out.ipynb +++ b/notebooks/00-testing-stuff-out.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 4, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -25,7 +25,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -34,7 +34,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -43,7 +43,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -52,110 +52,16 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "seed: 4711\n", - "network:\n", - " desc: Configuration of the PyTorch neural network.\n", - " type: VQVAE\n", - " args:\n", - " in_channels: 1\n", - " channels:\n", - " - 32\n", - " - 64\n", - " - 96\n", - " - 96\n", - " - 128\n", - " kernel_sizes:\n", - " - 4\n", - " - 4\n", - " - 4\n", - " - 4\n", - " - 4\n", - " strides:\n", - " - 2\n", - " - 2\n", - " - 2\n", - " - 2\n", - " - 2\n", - " num_residual_layers: 2\n", - " embedding_dim: 128\n", - " num_embeddings: 1024\n", 
- " upsampling: null\n", - " beta: 0.25\n", - " activation: leaky_relu\n", - " dropout_rate: 0.1\n", - "model:\n", - " desc: Configuration of the PyTorch Lightning model.\n", - " type: LitVQVAEModel\n", - " args:\n", - " optimizer:\n", - " type: MADGRAD\n", - " args:\n", - " lr: 0.001\n", - " momentum: 0.9\n", - " weight_decay: 0\n", - " eps: 1.0e-06\n", - " lr_scheduler:\n", - " type: OneCycleLR\n", - " args:\n", - " interval: step\n", - " max_lr: 0.001\n", - " three_phase: true\n", - " epochs: 1024\n", - " steps_per_epoch: 317\n", - " criterion:\n", - " type: MSELoss\n", - " args:\n", - " reduction: mean\n", - " monitor: val_loss\n", - " mapping: sentence_piece\n", - "data:\n", - " desc: Configuration of the training/test data.\n", - " type: IAMExtendedParagraphs\n", - " args:\n", - " batch_size: 64\n", - " num_workers: 12\n", - " train_fraction: 0.8\n", - " augment: true\n", - "callbacks:\n", - "- type: ModelCheckpoint\n", - " args:\n", - " monitor: val_loss\n", - " mode: min\n", - " save_last: true\n", - "- type: LearningRateMonitor\n", - " args:\n", - " logging_interval: step\n", - "trainer:\n", - " desc: Configuration of the PyTorch Lightning Trainer.\n", - " args:\n", - " stochastic_weight_avg: false\n", - " auto_scale_batch_size: binsearch\n", - " gradient_clip_val: 0\n", - " fast_dev_run: false\n", - " gpus: 1\n", - " precision: 16\n", - " max_epochs: 1024\n", - " terminate_on_nan: true\n", - " weights_summary: full\n", - "load_checkpoint: null\n", - "\n" - ] - } - ], + "outputs": [], "source": [ "print(OmegaConf.to_yaml(conf))" ] }, { "cell_type": "code", - "execution_count": 20, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -164,7 +70,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -173,167 +79,16 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "VQVAE(\n", - " (encoder): Encoder(\n", - " (encoder): Sequential(\n", - " (0): Sequential(\n", - " (0): Conv2d(1, 32, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))\n", - " (1): LeakyReLU(negative_slope=0.01, inplace=True)\n", - " )\n", - " (1): Dropout(p=0.1, inplace=False)\n", - " (2): Sequential(\n", - " (0): Conv2d(32, 64, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))\n", - " (1): LeakyReLU(negative_slope=0.01, inplace=True)\n", - " )\n", - " (3): Dropout(p=0.1, inplace=False)\n", - " (4): Sequential(\n", - " (0): Conv2d(64, 96, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))\n", - " (1): LeakyReLU(negative_slope=0.01, inplace=True)\n", - " )\n", - " (5): Dropout(p=0.1, inplace=False)\n", - " (6): Sequential(\n", - " (0): Conv2d(96, 96, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))\n", - " (1): LeakyReLU(negative_slope=0.01, inplace=True)\n", - " )\n", - " (7): Dropout(p=0.1, inplace=False)\n", - " (8): Sequential(\n", - " (0): Conv2d(96, 128, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))\n", - " (1): LeakyReLU(negative_slope=0.01, inplace=True)\n", - " )\n", - " (9): Dropout(p=0.1, inplace=False)\n", - " (10): _ResidualBlock(\n", - " (block): Sequential(\n", - " (0): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n", - " (1): ReLU(inplace=True)\n", - " (2): Conv2d(128, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", - " (3): Dropout(p=0.1, inplace=False)\n", - " )\n", - " )\n", - " (11): _ResidualBlock(\n", - " (block): Sequential(\n", - " (0): Conv2d(128, 128, 
kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n", - " (1): ReLU(inplace=True)\n", - " (2): Conv2d(128, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", - " (3): Dropout(p=0.1, inplace=False)\n", - " )\n", - " )\n", - " (12): Conv2d(128, 128, kernel_size=(1, 1), stride=(1, 1))\n", - " )\n", - " (vector_quantizer): VectorQuantizer(\n", - " (embedding): Embedding(1024, 128)\n", - " )\n", - " )\n", - " (decoder): Decoder(\n", - " (res_block): Sequential(\n", - " (0): Conv2d(128, 128, kernel_size=(1, 1), stride=(1, 1))\n", - " (1): _ResidualBlock(\n", - " (block): Sequential(\n", - " (0): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n", - " (1): ReLU(inplace=True)\n", - " (2): Conv2d(128, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", - " (3): Dropout(p=0.1, inplace=False)\n", - " )\n", - " )\n", - " (2): _ResidualBlock(\n", - " (block): Sequential(\n", - " (0): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n", - " (1): ReLU(inplace=True)\n", - " (2): Conv2d(128, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", - " (3): Dropout(p=0.1, inplace=False)\n", - " )\n", - " )\n", - " )\n", - " (upsampling_block): Sequential(\n", - " (0): Sequential(\n", - " (0): ConvTranspose2d(128, 96, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))\n", - " (1): LeakyReLU(negative_slope=0.01, inplace=True)\n", - " )\n", - " (1): Dropout(p=0.1, inplace=False)\n", - " (2): Sequential(\n", - " (0): ConvTranspose2d(96, 96, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))\n", - " (1): LeakyReLU(negative_slope=0.01, inplace=True)\n", - " )\n", - " (3): Dropout(p=0.1, inplace=False)\n", - " (4): Sequential(\n", - " (0): ConvTranspose2d(96, 64, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))\n", - " (1): LeakyReLU(negative_slope=0.01, inplace=True)\n", - " )\n", - " (5): Dropout(p=0.1, inplace=False)\n", - " (6): Sequential(\n", - " (0): ConvTranspose2d(64, 32, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))\n", - " (1): LeakyReLU(negative_slope=0.01, inplace=True)\n", - " )\n", - " (7): Dropout(p=0.1, inplace=False)\n", - " (8): ConvTranspose2d(32, 1, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))\n", - " (9): Tanh()\n", - " )\n", - " (decoder): Sequential(\n", - " (0): Sequential(\n", - " (0): Conv2d(128, 128, kernel_size=(1, 1), stride=(1, 1))\n", - " (1): _ResidualBlock(\n", - " (block): Sequential(\n", - " (0): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n", - " (1): ReLU(inplace=True)\n", - " (2): Conv2d(128, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", - " (3): Dropout(p=0.1, inplace=False)\n", - " )\n", - " )\n", - " (2): _ResidualBlock(\n", - " (block): Sequential(\n", - " (0): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n", - " (1): ReLU(inplace=True)\n", - " (2): Conv2d(128, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", - " (3): Dropout(p=0.1, inplace=False)\n", - " )\n", - " )\n", - " )\n", - " (1): Sequential(\n", - " (0): Sequential(\n", - " (0): ConvTranspose2d(128, 96, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))\n", - " (1): LeakyReLU(negative_slope=0.01, inplace=True)\n", - " )\n", - " (1): Dropout(p=0.1, inplace=False)\n", - " (2): Sequential(\n", - " (0): ConvTranspose2d(96, 96, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))\n", - " (1): LeakyReLU(negative_slope=0.01, inplace=True)\n", - " )\n", - " (3): Dropout(p=0.1, inplace=False)\n", - " (4): Sequential(\n", - " (0): 
ConvTranspose2d(96, 64, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))\n", - " (1): LeakyReLU(negative_slope=0.01, inplace=True)\n", - " )\n", - " (5): Dropout(p=0.1, inplace=False)\n", - " (6): Sequential(\n", - " (0): ConvTranspose2d(64, 32, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))\n", - " (1): LeakyReLU(negative_slope=0.01, inplace=True)\n", - " )\n", - " (7): Dropout(p=0.1, inplace=False)\n", - " (8): ConvTranspose2d(32, 1, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))\n", - " (9): Tanh()\n", - " )\n", - " )\n", - " )\n", - ")" - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "vae" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -342,275 +97,259 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "proj = nn.Conv2d(1, 32, kernel_size=16, stride=16)" + "vae.encoder(datum)[0].shape" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "x = proj(datum)" + "vae(datum)[0].shape" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "torch.Size([2, 32, 36, 40])" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "x.shape" + "from text_recognizer.networks.backbones.efficientnet import EfficientNet" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "xx = x.flatten(2)" + "en = EfficientNet()" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 3, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "torch.Size([2, 32, 1440])" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "xx.shape" + "datum = torch.randn([2, 1, 576, 640])" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ - "xxx = xx.transpose(1,2)" + "trg = torch.randint(0, 1000, [2, 682])" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "torch.Size([2, 1440, 32])" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "xxx.shape" + "trg.shape" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "from einops import rearrange" + "datum = torch.randn([2, 1, 224, 224])" ] }, { "cell_type": "code", - "execution_count": 14, - "metadata": {}, + "execution_count": null, + "metadata": { + "scrolled": false + }, "outputs": [], "source": [ - "xxxx = rearrange(x, \"b c h w -> b ( h w ) c\")" + "en(datum).shape" ] }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 5, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "torch.Size([2, 1440, 32])" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "xxxx.shape" + "path = \"../training/configs/cnn_transformer.yaml\"" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ 
- " B, N, C = x.shape\n", - " H, W = size\n", - " assert N == 1 + H * W\n", - "\n", - " # Extract CLS token and image tokens.\n", - " cls_token, img_tokens = x[:, :1], x[:, 1:] # Shape: [B, 1, C], [B, H*W, C].\n", - " \n", - " # Depthwise convolution.\n", - " feat = img_tokens.transpose(1, 2).view(B, C, H, W)" + "conf = OmegaConf.load(path)" ] }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 7, "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "torch.Size([2, 32, 36, 40])" - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + "seed: 4711\n", + "network:\n", + " desc: Configuration of the PyTorch neural network.\n", + " type: CNNTransformer\n", + " args:\n", + " encoder:\n", + " type: EfficientNet\n", + " args: null\n", + " num_decoder_layers: 4\n", + " hidden_dim: 256\n", + " num_heads: 4\n", + " expansion_dim: 1024\n", + " dropout_rate: 0.1\n", + " transformer_activation: glu\n", + "model:\n", + " desc: Configuration of the PyTorch Lightning model.\n", + " type: LitTransformerModel\n", + " args:\n", + " optimizer:\n", + " type: MADGRAD\n", + " args:\n", + " lr: 0.001\n", + " momentum: 0.9\n", + " weight_decay: 0\n", + " eps: 1.0e-06\n", + " lr_scheduler:\n", + " type: OneCycleLR\n", + " args:\n", + " interval: step\n", + " max_lr: 0.001\n", + " three_phase: true\n", + " epochs: 512\n", + " steps_per_epoch: 1246\n", + " criterion:\n", + " type: CrossEntropyLoss\n", + " args:\n", + " weight: None\n", + " ignore_index: -100\n", + " reduction: mean\n", + " monitor: val_loss\n", + " mapping: sentence_piece\n", + "data:\n", + " desc: Configuration of the training/test data.\n", + " type: IAMExtendedParagraphs\n", + " args:\n", + " batch_size: 16\n", + " num_workers: 12\n", + " train_fraction: 0.8\n", + " augment: true\n", + "callbacks:\n", + "- type: ModelCheckpoint\n", + " args:\n", + " monitor: val_loss\n", + " mode: min\n", + " save_last: true\n", + "- type: StochasticWeightAveraging\n", + " args:\n", + " swa_epoch_start: 0.8\n", + " swa_lrs: 0.05\n", + " annealing_epochs: 10\n", + " annealing_strategy: cos\n", + " device: null\n", + "- type: LearningRateMonitor\n", + " args:\n", + " logging_interval: step\n", + "- type: EarlyStopping\n", + " args:\n", + " monitor: val_loss\n", + " mode: min\n", + " patience: 10\n", + "trainer:\n", + " desc: Configuration of the PyTorch Lightning Trainer.\n", + " args:\n", + " stochastic_weight_avg: true\n", + " auto_scale_batch_size: binsearch\n", + " gradient_clip_val: 0\n", + " fast_dev_run: false\n", + " gpus: 1\n", + " precision: 16\n", + " max_epochs: 512\n", + " terminate_on_nan: true\n", + " weights_summary: true\n", + "load_checkpoint: null\n", + "\n" + ] } ], "source": [ - "xxx.transpose(1, 2).view(2, 32, 36, 40).shape" + "print(OmegaConf.to_yaml(conf))" ] }, { "cell_type": "code", - "execution_count": 18, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "data": { - "text/plain": [ - "72.0" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": 8, + "metadata": {}, + "outputs": [], "source": [ - "576 / 8" + "from text_recognizer.networks.cnn_transformer import CNNTransformer" ] }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 9, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "80.0" - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - 
"640 / 8" + "t = CNNTransformer(input_shape=(1, 576, 640), output_shape=(682, 1), **conf.network.args)" ] }, { "cell_type": "code", - "execution_count": 26, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "torch.Size([2, 1, 576, 640])" - ] - }, - "execution_count": 26, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "datum.shape" + "t.encode(datum).shape" ] }, { "cell_type": "code", - "execution_count": 27, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "torch.Size([2, 128, 18, 20])" - ] - }, - "execution_count": 27, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "vae.encoder(datum)[0].shape" + "trg.shape" ] }, { "cell_type": "code", - "execution_count": 87, + "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "torch.Size([2, 1, 576, 640])" + "torch.Size([2, 682, 1004])" ] }, - "execution_count": 87, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "vae(datum)[0].shape" + "t(datum, trg).shape" ] }, { diff --git a/notebooks/03-look-at-iam-paragraphs.ipynb b/notebooks/03-look-at-iam-paragraphs.ipynb index eaf5397..add0b80 100644 --- a/notebooks/03-look-at-iam-paragraphs.ipynb +++ b/notebooks/03-look-at-iam-paragraphs.ipynb @@ -31,7 +31,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 2, "id": "726ac25b", "metadata": {}, "outputs": [], @@ -56,8 +56,8 @@ "name": "stderr", "output_type": "stream", "text": [ - "2021-04-16 23:01:52.352 | INFO | text_recognizer.data.iam_paragraphs:setup:107 - Loading IAM paragraph regions and lines for None...\n", - "2021-04-16 23:02:08.521 | INFO | text_recognizer.data.iam_synthetic_paragraphs:setup:79 - IAM Synthetic dataset steup for stage None\n" + "2021-04-25 23:17:44.177 | INFO | text_recognizer.data.iam_paragraphs:setup:107 - Loading IAM paragraph regions and lines for None...\n", + "2021-04-25 23:18:00.750 | INFO | text_recognizer.data.iam_synthetic_paragraphs:setup:79 - IAM Synthetic dataset steup for stage None\n" ] }, { @@ -68,9 +68,9 @@ "Num classes: 84\n", "Dims: (1, 576, 640)\n", "Output dims: (682, 1)\n", - "Train/val/test sizes: 19912, 262, 231\n", - "Train Batch x stats: (torch.Size([1, 1, 576, 640]), torch.float32, tensor(0.), tensor(0.0043), tensor(0.0333), tensor(0.8588))\n", - "Train Batch y stats: (torch.Size([1, 682]), torch.int64, tensor(1), tensor(78))\n", + "Train/val/test sizes: 19948, 262, 231\n", + "Train Batch x stats: (torch.Size([1, 1, 576, 640]), torch.float32, tensor(0.), tensor(0.0109), tensor(0.0499), tensor(0.8314))\n", + "Train Batch y stats: (torch.Size([1, 682]), torch.int64, tensor(1), tensor(83))\n", "Test Batch x stats: (torch.Size([1, 1, 576, 640]), torch.float32, tensor(0.), tensor(0.0372), tensor(0.0767), tensor(0.8118))\n", "Test Batch y stats: (torch.Size([1, 682]), torch.int64, tensor(1), tensor(83))\n", "\n" @@ -86,10 +86,34 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "42501428", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2021-04-25 23:18:14.449 | INFO | text_recognizer.data.iam_paragraphs:setup:107 - Loading IAM paragraph regions and lines for None...\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "IAM Paragraphs Dataset\n", + "Num classes: 84\n", + "Input dims: (1, 576, 640)\n", + "Output dims: 
(682, 1)\n", + "Train/val/test sizes: 1046, 262, 231\n", + "Train Batch x stats: (torch.Size([16, 1, 576, 640]), torch.float32, tensor(0.), tensor(0.0393), tensor(0.0924), tensor(1.))\n", + "Train Batch y stats: (torch.Size([16, 682]), torch.int64, tensor(1), tensor(83))\n", + "Test Batch x stats: (torch.Size([16, 1, 576, 640]), torch.float32, tensor(0.), tensor(0.0312), tensor(0.0817), tensor(0.9294))\n", + "Test Batch y stats: (torch.Size([16, 682]), torch.int64, tensor(1), tensor(83))\n", + "\n" + ] + } + ], "source": [ "dataset = IAMParagraphs()\n", "dataset.prepare_data()\n", @@ -99,7 +123,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "id": "0cf22683", "metadata": {}, "outputs": [], @@ -107,6 +131,27 @@ "x, y = dataset.data_train[1]" ] }, + { + "cell_type": "code", + "execution_count": 6, + "id": "af7747a8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "torch.Size([682])" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "y.shape" + ] + }, { "cell_type": "code", "execution_count": 7, diff --git a/text_recognizer/data/iam_extended_paragraphs.py b/text_recognizer/data/iam_extended_paragraphs.py index 2380660..0a30a42 100644 --- a/text_recognizer/data/iam_extended_paragraphs.py +++ b/text_recognizer/data/iam_extended_paragraphs.py @@ -19,18 +19,10 @@ class IAMExtendedParagraphs(BaseDataModule): super().__init__(batch_size, num_workers) self.iam_paragraphs = IAMParagraphs( - batch_size, - num_workers, - train_fraction, - augment, - word_pieces, + batch_size, num_workers, train_fraction, augment, word_pieces, ) self.iam_synthetic_paragraphs = IAMSyntheticParagraphs( - batch_size, - num_workers, - train_fraction, - augment, - word_pieces, + batch_size, num_workers, train_fraction, augment, word_pieces, ) self.dims = self.iam_paragraphs.dims diff --git a/text_recognizer/data/iam_paragraphs.py b/text_recognizer/data/iam_paragraphs.py index 62c44f9..24409bc 100644 --- a/text_recognizer/data/iam_paragraphs.py +++ b/text_recognizer/data/iam_paragraphs.py @@ -101,7 +101,7 @@ class IAMParagraphs(BaseDataModule): data, targets, transform=get_transform(image_shape=self.dims[1:], augment=augment), - target_transform=get_target_transform(self.word_pieces) + target_transform=get_target_transform(self.word_pieces), ) logger.info(f"Loading IAM paragraph regions and lines for {stage}...") @@ -162,10 +162,7 @@ def get_dataset_properties() -> Dict: "min": min(_get_property_values("num_lines")), "max": max(_get_property_values("num_lines")), }, - "crop_shape": { - "min": crop_shapes.min(axis=0), - "max": crop_shapes.max(axis=0), - }, + "crop_shape": {"min": crop_shapes.min(axis=0), "max": crop_shapes.max(axis=0),}, "aspect_ratio": { "min": aspect_ratio.min(axis=0), "max": aspect_ratio.max(axis=0), @@ -286,9 +283,7 @@ def get_transform(image_shape: Tuple[int, int], augment: bool) -> transforms.Com ), transforms.ColorJitter(brightness=(0.8, 1.6)), transforms.RandomAffine( - degrees=1, - shear=(-10, 10), - interpolation=InterpolationMode.BILINEAR, + degrees=1, shear=(-10, 10), interpolation=InterpolationMode.BILINEAR, ), ] else: @@ -296,10 +291,12 @@ def get_transform(image_shape: Tuple[int, int], augment: bool) -> transforms.Com transforms_list.append(transforms.ToTensor()) return transforms.Compose(transforms_list) + def get_target_transform(word_pieces: bool) -> Optional[transforms.Compose]: """Transform emnist characters to word pieces.""" return transforms.Compose([WordPiece()]) if 
word_pieces else None + def _labels_filename(split: str) -> Path: """Return filename of processed labels.""" return PROCESSED_DATA_DIRNAME / split / "_labels.json" diff --git a/text_recognizer/data/iam_synthetic_paragraphs.py b/text_recognizer/data/iam_synthetic_paragraphs.py index 4ccc5c2..78e6c05 100644 --- a/text_recognizer/data/iam_synthetic_paragraphs.py +++ b/text_recognizer/data/iam_synthetic_paragraphs.py @@ -97,7 +97,7 @@ class IAMSyntheticParagraphs(IAMParagraphs): transform=get_transform( image_shape=self.dims[1:], augment=self.augment ), - target_transform=get_target_transform(self.word_pieces) + target_transform=get_target_transform(self.word_pieces), ) def __repr__(self) -> str: diff --git a/text_recognizer/data/mappings.py b/text_recognizer/data/mappings.py index f4016ba..190febe 100644 --- a/text_recognizer/data/mappings.py +++ b/text_recognizer/data/mappings.py @@ -58,13 +58,13 @@ class WordPieceMapping(EmnistMapping): def __init__( self, num_features: int = 1000, - tokens: str = "iamdb_1kwp_tokens_1000.txt" , + tokens: str = "iamdb_1kwp_tokens_1000.txt", lexicon: str = "iamdb_1kwp_lex_1000.txt", data_dir: Optional[Union[str, Path]] = None, use_words: bool = False, prepend_wordsep: bool = False, special_tokens: Sequence[str] = ("", "", "
"), - extra_symbols: Optional[Sequence[str]] = ("\n", ), + extra_symbols: Optional[Sequence[str]] = ("\n",), ) -> None: super().__init__(extra_symbols) self.wordpiece_processor = self._configure_wordpiece_processor( @@ -90,7 +90,13 @@ class WordPieceMapping(EmnistMapping): extra_symbols: Optional[Sequence[str]], ) -> Preprocessor: data_dir = ( - (Path(__file__).resolve().parents[2] / "data" / "downloaded" / "iam" / "iamdb") + ( + Path(__file__).resolve().parents[2] + / "data" + / "downloaded" + / "iam" + / "iamdb" + ) if data_dir is None else Path(data_dir) ) diff --git a/text_recognizer/data/transforms.py b/text_recognizer/data/transforms.py index 8d1bedd..d0f1f35 100644 --- a/text_recognizer/data/transforms.py +++ b/text_recognizer/data/transforms.py @@ -13,7 +13,7 @@ class WordPiece: def __init__( self, num_features: int = 1000, - tokens: str = "iamdb_1kwp_tokens_1000.txt" , + tokens: str = "iamdb_1kwp_tokens_1000.txt", lexicon: str = "iamdb_1kwp_lex_1000.txt", data_dir: Optional[Union[str, Path]] = None, use_words: bool = False, @@ -35,4 +35,4 @@ class WordPiece: self.max_len = max_len def __call__(self, x: Tensor) -> Tensor: - return self.mapping.emnist_to_wordpiece_indices(x)[:self.max_len] + return self.mapping.emnist_to_wordpiece_indices(x)[: self.max_len] diff --git a/text_recognizer/models/transformer.py b/text_recognizer/models/transformer.py index 7dc1352..8dd4db2 100644 --- a/text_recognizer/models/transformer.py +++ b/text_recognizer/models/transformer.py @@ -39,7 +39,7 @@ class LitTransformerModel(LitBaseModel): def configure_mapping(mapping: Optional[List[str]]) -> Tuple[List[str], List[int]]: """Configure mapping.""" # TODO: Fix me!!! - mapping, inverse_mapping, _ = emnist_mapping() + mapping, inverse_mapping, _ = emnist_mapping(["\n"]) start_index = inverse_mapping[""] end_index = inverse_mapping[""] pad_index = inverse_mapping["
"] diff --git a/text_recognizer/networks/__init__.py b/text_recognizer/networks/__init__.py index 41fd43f..63b43b2 100644 --- a/text_recognizer/networks/__init__.py +++ b/text_recognizer/networks/__init__.py @@ -1,2 +1,4 @@ """Network modules""" +from .backbones import EfficientNet from .vqvae import VQVAE +from .cnn_transformer import CNNTransformer diff --git a/text_recognizer/networks/backbones/__init__.py b/text_recognizer/networks/backbones/__init__.py new file mode 100644 index 0000000..25aed0e --- /dev/null +++ b/text_recognizer/networks/backbones/__init__.py @@ -0,0 +1,2 @@ +"""Vision backbones.""" +from .efficientnet import EfficientNet diff --git a/text_recognizer/networks/backbones/efficientnet.py b/text_recognizer/networks/backbones/efficientnet.py new file mode 100644 index 0000000..61dea77 --- /dev/null +++ b/text_recognizer/networks/backbones/efficientnet.py @@ -0,0 +1,145 @@ +"""Efficient net b0 implementation.""" +import torch +from torch import nn +from torch import Tensor + + +class ConvNorm(nn.Module): + def __init__( + self, + in_channels: int, + out_channels: int, + kernel_size: int, + stride: int, + padding: int, + groups: int = 1, + ) -> None: + super().__init__() + self.block = nn.Sequential( + nn.Conv2d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + groups=groups, + bias=False, + ), + nn.BatchNorm2d(num_features=out_channels), + nn.SiLU(inplace=True), + ) + + def forward(self, x: Tensor) -> Tensor: + return self.block(x) + + +class SqueezeExcite(nn.Module): + def __init__(self, in_channels: int, reduce_dim: int) -> None: + super().__init__() + self.se = nn.Sequential( + nn.AdaptiveAvgPool2d(1), # [C, H, W] -> [C, 1, 1] + nn.Conv2d(in_channels=in_channels, out_channels=reduce_dim, kernel_size=1), + nn.SiLU(), + nn.Conv2d(in_channels=reduce_dim, out_channels=in_channels, kernel_size=1), + nn.Sigmoid(), + ) + + def forward(self, x: Tensor) -> Tensor: + return x * self.se(x) + + +class InvertedResidulaBlock(nn.Module): + def __init__( + self, + in_channels: int, + out_channels: int, + kernel_size: int, + stride: int, + padding: int, + expand_ratio: float, + reduction: int = 4, + survival_prob: float = 0.8, + ) -> None: + super().__init__() + self.survival_prob = survival_prob + self.use_residual = in_channels == out_channels and stride == 1 + hidden_dim = in_channels * expand_ratio + self.expand = in_channels != hidden_dim + reduce_dim = in_channels // reduction + + if self.expand: + self.expand_conv = ConvNorm( + in_channels, hidden_dim, kernel_size=3, stride=1, padding=1 + ) + + self.conv = nn.Sequential( + ConvNorm( + hidden_dim, hidden_dim, kernel_size, stride, padding, groups=hidden_dim + ), + SqueezeExcite(hidden_dim, reduce_dim), + nn.Conv2d( + in_channels=hidden_dim, + out_channels=out_channels, + kernel_size=1, + bias=False, + ), + nn.BatchNorm2d(num_features=out_channels), + ) + + def stochastic_depth(self, x: Tensor) -> Tensor: + if not self.training: + return x + + binary_tensor = ( + torch.rand(x.shape[0], 1, 1, 1, device=x.device) < self.survival_prob + ) + return torch.div(x, self.survival_prob) * binary_tensor + + def forward(self, x: Tensor) -> Tensor: + out = self.expand_conv(x) if self.expand else x + if self.use_residual: + return self.stochastic_depth(self.conv(out)) + x + return self.conv(out) + + +class EfficientNet(nn.Module): + """Efficient net b0 backbone.""" + + def __init__(self) -> None: + super().__init__() + self.base_model = [ + # expand_ratio, channels, repeats, 
stride, kernel_size + [1, 16, 1, 1, 3], + [6, 24, 2, 2, 3], + [6, 40, 2, 2, 5], + [6, 80, 3, 2, 3], + [6, 112, 3, 1, 5], + [6, 192, 4, 2, 5], + [6, 320, 1, 1, 3], + ] + + self.backbone = self._build_b0() + + def _build_b0(self) -> nn.Sequential: + in_channels = 32 + layers = [ConvNorm(1, in_channels, 3, stride=2, padding=1)] + + for expand_ratio, out_channels, repeats, stride, kernel_size in self.base_model: + for i in range(repeats): + layers.append( + InvertedResidulaBlock( + in_channels, + out_channels, + expand_ratio=expand_ratio, + stride=stride if i == 0 else 1, + kernel_size=kernel_size, + padding=kernel_size // 2, + ) + ) + in_channels = out_channels + layers.append(ConvNorm(in_channels, 256, kernel_size=1, stride=1, padding=0)) + + return nn.Sequential(*layers) + + def forward(self, x: Tensor) -> Tensor: + return self.backbone(x) diff --git a/text_recognizer/networks/cnn_transformer.py b/text_recognizer/networks/cnn_transformer.py index e23a15d..d42c29d 100644 --- a/text_recognizer/networks/cnn_transformer.py +++ b/text_recognizer/networks/cnn_transformer.py @@ -33,8 +33,8 @@ NUM_WORD_PIECES = 1000 class CNNTransformer(nn.Module): def __init__( self, - input_shape: Sequence[int], - output_shape: Sequence[int], + input_dim: Sequence[int], + output_dims: Sequence[int], encoder: Union[DictConfig, Dict], vocab_size: Optional[int] = None, num_decoder_layers: int = 4, @@ -43,22 +43,29 @@ class CNNTransformer(nn.Module): expansion_dim: int = 1024, dropout_rate: float = 0.1, transformer_activation: str = "glu", + *args, + **kwargs, ) -> None: + super().__init__() self.vocab_size = ( NUM_WORD_PIECES + NUM_SPECIAL_TOKENS if vocab_size is None else vocab_size ) + self.pad_index = 3 # TODO: fix me self.hidden_dim = hidden_dim - self.max_output_length = output_shape[0] + self.max_output_length = output_dims[0] # Image backbone self.encoder = self._configure_encoder(encoder) + self.encoder_proj = nn.Conv2d(256, hidden_dim, kernel_size=1) self.feature_map_encoding = PositionalEncoding2D( - hidden_dim=hidden_dim, max_h=input_shape[1], max_w=input_shape[2] + hidden_dim=hidden_dim, max_h=input_dim[1], max_w=input_dim[2] ) # Target token embedding self.trg_embedding = nn.Embedding(self.vocab_size, hidden_dim) - self.trg_position_encoding = PositionalEncoding(hidden_dim, dropout_rate) + self.trg_position_encoding = PositionalEncoding( + hidden_dim, dropout_rate, max_len=output_dims[0] + ) # Transformer decoder self.decoder = Decoder( @@ -86,24 +93,25 @@ class CNNTransformer(nn.Module): self.head.weight.data.uniform_(-0.1, 0.1) nn.init.kaiming_normal_( - self.feature_map_encoding.weight.data, + self.encoder_proj.weight.data, a=0, mode="fan_out", nonlinearity="relu", ) - if self.feature_map_encoding.bias is not None: + if self.encoder_proj.bias is not None: _, fan_out = nn.init._calculate_fan_in_and_fan_out( - self.feature_map_encoding.weight.data + self.encoder_proj.weight.data ) bound = 1 / math.sqrt(fan_out) - nn.init.normal_(self.feature_map_encoding.bias, -bound, bound) + nn.init.normal_(self.encoder_proj.bias, -bound, bound) @staticmethod def _configure_encoder(encoder: Union[DictConfig, Dict]) -> Type[nn.Module]: encoder = OmegaConf.create(encoder) + args = encoder.args or {} network_module = importlib.import_module("text_recognizer.networks") encoder_class = getattr(network_module, encoder.type) - return encoder_class(**encoder.args) + return encoder_class(**args) def encode(self, image: Tensor) -> Tensor: """Extracts image features with backbone. 
@@ -121,6 +129,7 @@ class CNNTransformer(nn.Module): """ # Extract image features. image_features = self.encoder(image) + image_features = self.encoder_proj(image_features) # Add 2d encoding to the feature maps. image_features = self.feature_map_encoding(image_features) @@ -133,11 +142,19 @@ class CNNTransformer(nn.Module): """Decodes image features with transformer decoder.""" trg_mask = target_padding_mask(trg=trg, pad_index=self.pad_index) trg = self.trg_embedding(trg) * math.sqrt(self.hidden_dim) + trg = rearrange(trg, "b t d -> t b d") trg = self.trg_position_encoding(trg) + trg = rearrange(trg, "t b d -> b t d") out = self.decoder(trg=trg, memory=memory, trg_mask=trg_mask, memory_mask=None) logits = self.head(out) return logits + def forward(self, image: Tensor, trg: Tensor) -> Tensor: + image_features = self.encode(image) + output = self.decode(image_features, trg) + output = rearrange(output, "b t c -> b c t") + return output + def predict(self, image: Tensor) -> Tensor: """Transcribes text in image(s).""" bsz = image.shape[0] diff --git a/text_recognizer/networks/coat/__init__.py b/text_recognizer/networks/coat/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/text_recognizer/networks/coat/factor_attention.py b/text_recognizer/networks/coat/factor_attention.py deleted file mode 100644 index f91c5ef..0000000 --- a/text_recognizer/networks/coat/factor_attention.py +++ /dev/null @@ -1,9 +0,0 @@ -"""Factorized attention with convolutional relative positional encodings.""" -from torch import nn - - -class FactorAttention(nn.Module): - """Factorized attention with relative positional encodings.""" - def __init__(self, dim: int, num_heads: int) -> None: - pass - diff --git a/text_recognizer/networks/coat/patch_embedding.py b/text_recognizer/networks/coat/patch_embedding.py deleted file mode 100644 index 3b7b76a..0000000 --- a/text_recognizer/networks/coat/patch_embedding.py +++ /dev/null @@ -1,38 +0,0 @@ -"""Patch embedding for images and feature maps.""" -from typing import Sequence, Tuple - -from einops import rearrange -from loguru import logger -from torch import nn -from torch import Tensor - - -class PatchEmbedding(nn.Module): - """Patch embedding of images.""" - - def __init__( - self, - image_shape: Sequence[int], - patch_size: int = 16, - in_channels: int = 1, - embedding_dim: int = 512, - ) -> None: - if image_shape[0] % patch_size == 0 and image_shape[1] % patch_size == 0: - logger.error( - f"Image shape {image_shape} not divisable by patch size {patch_size}" - ) - - self.patch_size = patch_size - self.embedding = nn.Conv2d( - in_channels, embedding_dim, kernel_size=patch_size, stride=patch_size - ) - self.norm = nn.LayerNorm(embedding_dim) - - def forward(self, x: Tensor) -> Tuple[Tensor, Tuple[int, int]]: - """Embeds image or feature maps with patch embedding.""" - _, _, h, w = x.shape - h_out, w_out = h // self.patch_size, w // self.patch_size - x = self.embedding(x) - x = rearrange(x, "b c h w -> b (h w) c") - x = self.norm(x) - return x, (h_out, w_out) diff --git a/text_recognizer/networks/coat/positional_encodings.py b/text_recognizer/networks/coat/positional_encodings.py deleted file mode 100644 index 925db04..0000000 --- a/text_recognizer/networks/coat/positional_encodings.py +++ /dev/null @@ -1,76 +0,0 @@ -"""Positional encodings for input sequence to transformer.""" -from typing import Dict, Union, Tuple - -from einops import rearrange -from loguru import logger -import torch -from torch import nn -from torch import Tensor - - -class 
RelativeEncoding(nn.Module): - """Relative positional encoding.""" - def __init__(self, channels: int, heads: int, windows: Union[int, Dict[int, int]]) -> None: - super().__init__() - self.windows = {windows: heads} if isinstance(windows, int) else windows - self.heads = list(self.windows.values()) - self.channel_heads = [head * channels for head in self.heads] - self.convs = nn.ModuleList([ - nn.Conv2d(in_channels=head * channels, - out_channels=head * channels, - kernel_shape=window, - padding=window // 2, - dilation=1, - groups=head * channels, - ) for window, head in self.windows.items()]) - - def forward(self, q: Tensor, v: Tensor, shape: Tuple[int, int]) -> Tensor: - """Applies relative positional encoding.""" - b, heads, hw, c = q.shape - h, w = shape - if hw != h * w: - logger.exception(f"Query width {hw} neq to height x width {h * w}") - raise ValueError - - v = rearrange(v, "b heads (h w) c -> b (heads c) h w", h=h, w=w) - v = torch.split(v, self.channel_heads, dim=1) - v = [conv(x) for conv, x in zip(self.convs, v)] - v = torch.cat(v, dim=1) - v = rearrange(v, "b (heads c) h w -> b heads (h w) c", heads=heads) - - encoding = q * v - zeros = torch.zeros((b, heads, 1, c), dtype=q.dtype, layout=q.layout, device=q.device) - encoding = torch.cat((zeros, encoding), dim=2) - return encoding - - -class PositionalEncoding(nn.Module): - """Convolutional positional encoding.""" - def __init__(self, dim: int, k: int = 3) -> None: - super().__init__() - self.encode = nn.Conv2d(in_channels=dim, out_channels=dim, kernel_size=k, stride=1, padding=k//2, groups=dim) - - def forward(self, x: Tensor, shape: Tuple[int, int]) -> Tensor: - """Applies convolutional encoding.""" - _, hw, _ = x.shape - h, w = shape - - if hw != h * w: - logger.exception(f"Query width {hw} neq to height x width {h * w}") - raise ValueError - - # Depthwise convolution. 
- x = rearrange(x, "b (h w) c -> b c h w", h=h, w=w) - x = self.encode(x) + x - x = rearrange(x, "b c h w -> b (h w) c") - return x - - - - - - - - - - diff --git a/text_recognizer/networks/residual_network.py b/text_recognizer/networks/residual_network.py index da7553d..c33f419 100644 --- a/text_recognizer/networks/residual_network.py +++ b/text_recognizer/networks/residual_network.py @@ -20,11 +20,7 @@ class Conv2dAuto(nn.Conv2d): def conv_bn(in_channels: int, out_channels: int, *args, **kwargs) -> nn.Sequential: """3x3 convolution with batch norm.""" - conv3x3 = partial( - Conv2dAuto, - kernel_size=3, - bias=False, - ) + conv3x3 = partial(Conv2dAuto, kernel_size=3, bias=False,) return nn.Sequential( conv3x3(in_channels, out_channels, *args, **kwargs), nn.BatchNorm2d(out_channels), diff --git a/text_recognizer/networks/transducer/transducer.py b/text_recognizer/networks/transducer/transducer.py index b10f93a..d7e3d08 100644 --- a/text_recognizer/networks/transducer/transducer.py +++ b/text_recognizer/networks/transducer/transducer.py @@ -392,12 +392,7 @@ def load_transducer_loss( transitions = gtn.load(str(processed_path / transitions)) preprocessor = Preprocessor( - data_dir, - num_features, - tokens_path, - lexicon_path, - use_words, - prepend_wordsep, + data_dir, num_features, tokens_path, lexicon_path, use_words, prepend_wordsep, ) num_tokens = preprocessor.num_tokens diff --git a/text_recognizer/networks/transformer/positional_encoding.py b/text_recognizer/networks/transformer/positional_encoding.py index 5874e97..c50afc3 100644 --- a/text_recognizer/networks/transformer/positional_encoding.py +++ b/text_recognizer/networks/transformer/positional_encoding.py @@ -33,7 +33,10 @@ class PositionalEncoding(nn.Module): def forward(self, x: Tensor) -> Tensor: """Encodes the tensor with a postional embedding.""" - x = x + self.pe[:, : x.shape[1]] + # [T, B, D] + if x.shape[2] != self.pe.shape[2]: + raise ValueError(f"x shape does not match pe in the 3rd dim.") + x = x + self.pe[: x.shape[0]] return self.dropout(x) @@ -48,6 +51,7 @@ class PositionalEncoding2D(nn.Module): pe = self.make_pe(hidden_dim, max_h, max_w) self.register_buffer("pe", pe) + @staticmethod def make_pe(hidden_dim: int, max_h: int, max_w: int) -> Tensor: """Returns 2d postional encoding.""" pe_h = PositionalEncoding.make_pe( diff --git a/text_recognizer/networks/transformer/rotary_embedding.py b/text_recognizer/networks/transformer/rotary_embedding.py new file mode 100644 index 0000000..5e80572 --- /dev/null +++ b/text_recognizer/networks/transformer/rotary_embedding.py @@ -0,0 +1,39 @@ +"""Roatary embedding. + +Stolen from lucidrains: + https://github.com/lucidrains/x-transformers/blob/main/x_transformers/x_transformers.py + +Explanation of roatary: + https://blog.eleuther.ai/rotary-embeddings/ + +""" +from typing import Tuple + +from einops import rearrange +import torch +from torch import nn +from torch import Tensor + + +class RotaryEmbedding(nn.Module): + def __init__(self, dim: int): + super().__init__() + inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2).float() / dim)) + self.register_buffer("inv_freq", inv_freq) + + def forward(self, x: Tensor, seq_dim: int = 1) -> Tensor: + t = torch.arange(x.shape[seq_dim], device=x.device).type_as(self.inv_freq) + freqs = torch.einsum("i , j -> i j", t, self.inv_freq) + emb = torch.cat((freqs, freqs), dim=-1) + return emb[None, :, :] + + +def rotate_half(x: Tensor) -> Tensor: + x = rearrange(x, "... (j d) -> ... 
j d", j=2) + x1, x2 = x.unbind(dim=-2) + return torch.cat((-x2, x1), dim=-1) + + +def apply_rotary_pos_emb(q: Tensor, k: Tensor, freqs: Tensor) -> Tuple[Tensor, Tensor]: + q, k = map(lambda t: (t * freqs.cos()) + (rotate_half(t) * freqs.sin()), (q, k)) + return q, k diff --git a/text_recognizer/networks/vqvae/decoder.py b/text_recognizer/networks/vqvae/decoder.py index 93a1e43..32de912 100644 --- a/text_recognizer/networks/vqvae/decoder.py +++ b/text_recognizer/networks/vqvae/decoder.py @@ -44,12 +44,7 @@ class Decoder(nn.Module): # Configure encoder. self.decoder = self._build_decoder( - channels, - kernel_sizes, - strides, - num_residual_layers, - activation, - dropout, + channels, kernel_sizes, strides, num_residual_layers, activation, dropout, ) def _build_decompression_block( @@ -78,9 +73,7 @@ class Decoder(nn.Module): ) if self.upsampling and i < len(self.upsampling): - modules.append( - nn.Upsample(size=self.upsampling[i]), - ) + modules.append(nn.Upsample(size=self.upsampling[i]),) if dropout is not None: modules.append(dropout) @@ -109,12 +102,7 @@ class Decoder(nn.Module): ) -> nn.Sequential: self.res_block.append( - nn.Conv2d( - self.embedding_dim, - channels[0], - kernel_size=1, - stride=1, - ) + nn.Conv2d(self.embedding_dim, channels[0], kernel_size=1, stride=1,) ) # Bottleneck module. diff --git a/text_recognizer/networks/vqvae/encoder.py b/text_recognizer/networks/vqvae/encoder.py index b0cceed..65801df 100644 --- a/text_recognizer/networks/vqvae/encoder.py +++ b/text_recognizer/networks/vqvae/encoder.py @@ -11,10 +11,7 @@ from text_recognizer.networks.vqvae.vector_quantizer import VectorQuantizer class _ResidualBlock(nn.Module): def __init__( - self, - in_channels: int, - out_channels: int, - dropout: Optional[Type[nn.Module]], + self, in_channels: int, out_channels: int, dropout: Optional[Type[nn.Module]], ) -> None: super().__init__() self.block = [ @@ -138,12 +135,7 @@ class Encoder(nn.Module): ) encoder.append( - nn.Conv2d( - channels[-1], - self.embedding_dim, - kernel_size=1, - stride=1, - ) + nn.Conv2d(channels[-1], self.embedding_dim, kernel_size=1, stride=1,) ) return nn.Sequential(*encoder) diff --git a/training/configs/cnn_transformer.yaml b/training/configs/cnn_transformer.yaml new file mode 100644 index 0000000..a4f16df --- /dev/null +++ b/training/configs/cnn_transformer.yaml @@ -0,0 +1,90 @@ +seed: 4711 + +network: + desc: Configuration of the PyTorch neural network. + type: CNNTransformer + args: + encoder: + type: EfficientNet + args: null + num_decoder_layers: 4 + vocab_size: 84 + hidden_dim: 256 + num_heads: 4 + expansion_dim: 1024 + dropout_rate: 0.1 + transformer_activation: glu + +model: + desc: Configuration of the PyTorch Lightning model. + type: LitTransformerModel + args: + optimizer: + type: MADGRAD + args: + lr: 1.0e-3 + momentum: 0.9 + weight_decay: 0 + eps: 1.0e-6 + lr_scheduler: + type: OneCycleLR + args: + interval: &interval step + max_lr: 1.0e-3 + three_phase: true + epochs: 512 + steps_per_epoch: 1246 # num_samples / batch_size + criterion: + type: CrossEntropyLoss + args: + weight: null + ignore_index: -100 + reduction: mean + monitor: val_loss + mapping: sentence_piece + +data: + desc: Configuration of the training/test data. 
+ type: IAMExtendedParagraphs + args: + batch_size: 8 + num_workers: 12 + train_fraction: 0.8 + augment: true + +callbacks: + - type: ModelCheckpoint + args: + monitor: val_loss + mode: min + save_last: true + # - type: StochasticWeightAveraging + # args: + # swa_epoch_start: 0.8 + # swa_lrs: 0.05 + # annealing_epochs: 10 + # annealing_strategy: cos + # device: null + - type: LearningRateMonitor + args: + logging_interval: *interval + # - type: EarlyStopping + # args: + # monitor: val_loss + # mode: min + # patience: 10 + +trainer: + desc: Configuration of the PyTorch Lightning Trainer. + args: + stochastic_weight_avg: false + auto_scale_batch_size: binsearch + gradient_clip_val: 0 + fast_dev_run: true + gpus: 1 + precision: 16 + max_epochs: 512 + terminate_on_nan: true + weights_summary: top + +load_checkpoint: null diff --git a/training/configs/image_transformer.yaml b/training/configs/image_transformer.yaml deleted file mode 100644 index e6637f2..0000000 --- a/training/configs/image_transformer.yaml +++ /dev/null @@ -1,89 +0,0 @@ -seed: 4711 - -network: - desc: Configuration of the PyTorch neural network. - type: ImageTransformer - args: - encoder: - type: null - args: null - num_decoder_layers: 4 - hidden_dim: 256 - num_heads: 4 - expansion_dim: 1024 - dropout_rate: 0.1 - transformer_activation: glu - -model: - desc: Configuration of the PyTorch Lightning model. - type: LitTransformerModel - args: - optimizer: - type: MADGRAD - args: - lr: 1.0e-3 - momentum: 0.9 - weight_decay: 0 - eps: 1.0e-6 - lr_scheduler: - type: OneCycle - args: - interval: &interval step - max_lr: 1.0e-3 - three_phase: true - epochs: 512 - steps_per_epoch: 1246 # num_samples / batch_size - criterion: - type: CrossEntropyLoss - args: - weight: None - ignore_index: -100 - reduction: mean - monitor: val_loss - mapping: sentence_piece - -data: - desc: Configuration of the training/test data. - type: IAMExtendedParagraphs - args: - batch_size: 16 - num_workers: 12 - train_fraction: 0.8 - augment: true - -callbacks: - - type: ModelCheckpoint - args: - monitor: val_loss - mode: min - save_last: true - - type: StochasticWeightAveraging - args: - swa_epoch_start: 0.8 - swa_lrs: 0.05 - annealing_epochs: 10 - annealing_strategy: cos - device: null - - type: LearningRateMonitor - args: - logging_interval: *interval - - type: EarlyStopping - args: - monitor: val_loss - mode: min - patience: 10 - -trainer: - desc: Configuration of the PyTorch Lightning Trainer. - args: - stochastic_weight_avg: true - auto_scale_batch_size: binsearch - gradient_clip_val: 0 - fast_dev_run: false - gpus: 1 - precision: 16 - max_epochs: 512 - terminate_on_nan: true - weights_summary: true - -load_checkpoint: null diff --git a/training/configs/vqvae.yaml b/training/configs/vqvae.yaml index a7acb3a..13d7c97 100644 --- a/training/configs/vqvae.yaml +++ b/training/configs/vqvae.yaml @@ -5,12 +5,12 @@ network: type: VQVAE args: in_channels: 1 - channels: [32, 64, 64] - kernel_sizes: [4, 4, 4] - strides: [2, 2, 2] + channels: [32, 64, 64, 96, 96] + kernel_sizes: [4, 4, 4, 4, 4] + strides: [2, 2, 2, 2, 2] num_residual_layers: 2 - embedding_dim: 128 - num_embeddings: 512 + embedding_dim: 512 + num_embeddings: 1024 upsampling: null beta: 0.25 activation: leaky_relu -- cgit v1.2.3-70-g09d2
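
The remainder of this page is a short usage appendix, not part of the patch. First, a minimal sketch of exercising the new EfficientNet-B0 backbone added in text_recognizer/networks/backbones/efficientnet.py; the input shape mirrors the notebook cells above, while the expected output shape is an assumption derived from the stem and stage strides rather than a value printed in the notebook.

    import torch
    from text_recognizer.networks.backbones.efficientnet import EfficientNet

    backbone = EfficientNet()              # B0-style stages ending in a 1x1 ConvNorm to 256 channels
    images = torch.randn(2, 1, 576, 640)   # grayscale IAM paragraph batch, as in the notebook
    features = backbone(images)            # stem stride 2 plus four stride-2 stages -> ~32x downsampling
    print(features.shape)                  # expected: torch.Size([2, 256, 18, 20])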
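
The new rotary embedding module (text_recognizer/networks/transformer/rotary_embedding.py) is not referenced anywhere else in this diff. Below is a minimal sketch of how it could be applied to per-head query/key tensors of shape (batch, seq_len, head_dim); the tensors and dimensions are illustrative, not taken from the repository.

    import torch
    from text_recognizer.networks.transformer.rotary_embedding import (
        RotaryEmbedding,
        apply_rotary_pos_emb,
    )

    rotary = RotaryEmbedding(dim=64)            # dim should match the per-head feature dimension
    q = torch.randn(2, 100, 64)                 # (batch, seq_len, head_dim)
    k = torch.randn(2, 100, 64)
    freqs = rotary(q, seq_dim=1)                # (1, seq_len, head_dim) table of rotation angles
    q, k = apply_rotary_pos_emb(q, k, freqs)    # rotate q and k before computing attention scores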
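
Finally, the vqvae.yaml change at the end of the patch deepens the encoder/decoder to five stride-2 stages and enlarges the codebook (embedding_dim 512, num_embeddings 1024). A sketch of loading it in the same style as the deleted notebook cells above; passing conf.network.args straight to the constructor is an assumption based on the argument names shown in the old printed configuration.

    from omegaconf import OmegaConf
    from text_recognizer.networks import VQVAE

    conf = OmegaConf.load("training/configs/vqvae.yaml")  # path relative to the repository root
    vae = VQVAE(**conf.network.args)  # assumes the config keys match the VQVAE constructor arguments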