Diffstat (limited to 'notebooks/00-testing-stuff-out.ipynb')
-rw-r--r--  notebooks/00-testing-stuff-out.ipynb  555
1 file changed, 147 insertions(+), 408 deletions(-)
diff --git a/notebooks/00-testing-stuff-out.ipynb b/notebooks/00-testing-stuff-out.ipynb
index e6cf099..7c7b3a6 100644
--- a/notebooks/00-testing-stuff-out.ipynb
+++ b/notebooks/00-testing-stuff-out.ipynb
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -25,7 +25,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -34,7 +34,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -43,7 +43,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -52,110 +52,16 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "seed: 4711\n",
-      "network:\n",
-      "  desc: Configuration of the PyTorch neural network.\n",
-      "  type: VQVAE\n",
-      "  args:\n",
-      "    in_channels: 1\n",
-      "    channels:\n",
-      "    - 32\n",
-      "    - 64\n",
-      "    - 96\n",
-      "    - 96\n",
-      "    - 128\n",
-      "    kernel_sizes:\n",
-      "    - 4\n",
-      "    - 4\n",
-      "    - 4\n",
-      "    - 4\n",
-      "    - 4\n",
-      "    strides:\n",
-      "    - 2\n",
-      "    - 2\n",
-      "    - 2\n",
-      "    - 2\n",
-      "    - 2\n",
-      "    num_residual_layers: 2\n",
-      "    embedding_dim: 128\n",
-      "    num_embeddings: 1024\n",
-      "    upsampling: null\n",
-      "    beta: 0.25\n",
-      "    activation: leaky_relu\n",
-      "    dropout_rate: 0.1\n",
-      "model:\n",
-      "  desc: Configuration of the PyTorch Lightning model.\n",
-      "  type: LitVQVAEModel\n",
-      "  args:\n",
-      "    optimizer:\n",
-      "      type: MADGRAD\n",
-      "      args:\n",
-      "        lr: 0.001\n",
-      "        momentum: 0.9\n",
-      "        weight_decay: 0\n",
-      "        eps: 1.0e-06\n",
-      "    lr_scheduler:\n",
-      "      type: OneCycleLR\n",
-      "      args:\n",
-      "        interval: step\n",
-      "        max_lr: 0.001\n",
-      "        three_phase: true\n",
-      "        epochs: 1024\n",
-      "        steps_per_epoch: 317\n",
-      "    criterion:\n",
-      "      type: MSELoss\n",
-      "      args:\n",
-      "        reduction: mean\n",
-      "    monitor: val_loss\n",
-      "    mapping: sentence_piece\n",
-      "data:\n",
-      "  desc: Configuration of the training/test data.\n",
-      "  type: IAMExtendedParagraphs\n",
-      "  args:\n",
-      "    batch_size: 64\n",
-      "    num_workers: 12\n",
-      "    train_fraction: 0.8\n",
-      "    augment: true\n",
-      "callbacks:\n",
-      "- type: ModelCheckpoint\n",
-      "  args:\n",
-      "    monitor: val_loss\n",
-      "    mode: min\n",
-      "    save_last: true\n",
-      "- type: LearningRateMonitor\n",
-      "  args:\n",
-      "    logging_interval: step\n",
-      "trainer:\n",
-      "  desc: Configuration of the PyTorch Lightning Trainer.\n",
-      "  args:\n",
-      "    stochastic_weight_avg: false\n",
-      "    auto_scale_batch_size: binsearch\n",
-      "    gradient_clip_val: 0\n",
-      "    fast_dev_run: false\n",
-      "    gpus: 1\n",
-      "    precision: 16\n",
-      "    max_epochs: 1024\n",
-      "    terminate_on_nan: true\n",
-      "    weights_summary: full\n",
-      "load_checkpoint: null\n",
-      "\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "print(OmegaConf.to_yaml(conf))"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 20,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -164,7 +70,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 21,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -173,167 +79,16 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 22,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "VQVAE(\n",
-       "  (encoder): Encoder(\n",
-       "    (encoder): Sequential(\n",
-       "      (0): Sequential(\n",
-       "        (0): Conv2d(1, 32, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))\n",
-       "        (1): LeakyReLU(negative_slope=0.01, inplace=True)\n",
-       "      )\n",
-       "      (1): Dropout(p=0.1, inplace=False)\n",
-       "      (2): Sequential(\n",
-       "        (0): Conv2d(32, 64, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))\n",
-       "        (1): LeakyReLU(negative_slope=0.01, inplace=True)\n",
-       "      )\n",
-       "      (3): Dropout(p=0.1, inplace=False)\n",
-       "      (4): Sequential(\n",
-       "        (0): Conv2d(64, 96, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))\n",
-       "        (1): LeakyReLU(negative_slope=0.01, inplace=True)\n",
-       "      )\n",
-       "      (5): Dropout(p=0.1, inplace=False)\n",
-       "      (6): Sequential(\n",
-       "        (0): Conv2d(96, 96, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))\n",
-       "        (1): LeakyReLU(negative_slope=0.01, inplace=True)\n",
-       "      )\n",
-       "      (7): Dropout(p=0.1, inplace=False)\n",
-       "      (8): Sequential(\n",
-       "        (0): Conv2d(96, 128, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))\n",
-       "        (1): LeakyReLU(negative_slope=0.01, inplace=True)\n",
-       "      )\n",
-       "      (9): Dropout(p=0.1, inplace=False)\n",
-       "      (10): _ResidualBlock(\n",
-       "        (block): Sequential(\n",
-       "          (0): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
-       "          (1): ReLU(inplace=True)\n",
-       "          (2): Conv2d(128, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
-       "          (3): Dropout(p=0.1, inplace=False)\n",
-       "        )\n",
-       "      )\n",
-       "      (11): _ResidualBlock(\n",
-       "        (block): Sequential(\n",
-       "          (0): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
-       "          (1): ReLU(inplace=True)\n",
-       "          (2): Conv2d(128, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
-       "          (3): Dropout(p=0.1, inplace=False)\n",
-       "        )\n",
-       "      )\n",
-       "      (12): Conv2d(128, 128, kernel_size=(1, 1), stride=(1, 1))\n",
-       "    )\n",
-       "    (vector_quantizer): VectorQuantizer(\n",
-       "      (embedding): Embedding(1024, 128)\n",
-       "    )\n",
-       "  )\n",
-       "  (decoder): Decoder(\n",
-       "    (res_block): Sequential(\n",
-       "      (0): Conv2d(128, 128, kernel_size=(1, 1), stride=(1, 1))\n",
-       "      (1): _ResidualBlock(\n",
-       "        (block): Sequential(\n",
-       "          (0): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
-       "          (1): ReLU(inplace=True)\n",
-       "          (2): Conv2d(128, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
-       "          (3): Dropout(p=0.1, inplace=False)\n",
-       "        )\n",
-       "      )\n",
-       "      (2): _ResidualBlock(\n",
-       "        (block): Sequential(\n",
-       "          (0): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
-       "          (1): ReLU(inplace=True)\n",
-       "          (2): Conv2d(128, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
-       "          (3): Dropout(p=0.1, inplace=False)\n",
-       "        )\n",
-       "      )\n",
-       "    )\n",
-       "    (upsampling_block): Sequential(\n",
-       "      (0): Sequential(\n",
-       "        (0): ConvTranspose2d(128, 96, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))\n",
-       "        (1): LeakyReLU(negative_slope=0.01, inplace=True)\n",
-       "      )\n",
-       "      (1): Dropout(p=0.1, inplace=False)\n",
-       "      (2): Sequential(\n",
-       "        (0): ConvTranspose2d(96, 96, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))\n",
-       "        (1): LeakyReLU(negative_slope=0.01, inplace=True)\n",
-       "      )\n",
-       "      (3): Dropout(p=0.1, inplace=False)\n",
-       "      (4): Sequential(\n",
-       "        (0): ConvTranspose2d(96, 64, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))\n",
-       "        (1): LeakyReLU(negative_slope=0.01, inplace=True)\n",
-       "      )\n",
-       "      (5): Dropout(p=0.1, inplace=False)\n",
-       "      (6): Sequential(\n",
-       "        (0): ConvTranspose2d(64, 32, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))\n",
-       "        (1): LeakyReLU(negative_slope=0.01, inplace=True)\n",
-       "      )\n",
-       "      (7): Dropout(p=0.1, inplace=False)\n",
-       "      (8): ConvTranspose2d(32, 1, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))\n",
-       "      (9): Tanh()\n",
-       "    )\n",
-       "    (decoder): Sequential(\n",
-       "      (0): Sequential(\n",
-       "        (0): Conv2d(128, 128, kernel_size=(1, 1), stride=(1, 1))\n",
-       "        (1): _ResidualBlock(\n",
-       "          (block): Sequential(\n",
-       "            (0): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
-       "            (1): ReLU(inplace=True)\n",
-       "            (2): Conv2d(128, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
-       "            (3): Dropout(p=0.1, inplace=False)\n",
-       "          )\n",
-       "        )\n",
-       "        (2): _ResidualBlock(\n",
-       "          (block): Sequential(\n",
-       "            (0): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
-       "            (1): ReLU(inplace=True)\n",
-       "            (2): Conv2d(128, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
-       "            (3): Dropout(p=0.1, inplace=False)\n",
-       "          )\n",
-       "        )\n",
-       "      )\n",
-       "      (1): Sequential(\n",
-       "        (0): Sequential(\n",
-       "          (0): ConvTranspose2d(128, 96, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))\n",
-       "          (1): LeakyReLU(negative_slope=0.01, inplace=True)\n",
-       "        )\n",
-       "        (1): Dropout(p=0.1, inplace=False)\n",
-       "        (2): Sequential(\n",
-       "          (0): ConvTranspose2d(96, 96, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))\n",
-       "          (1): LeakyReLU(negative_slope=0.01, inplace=True)\n",
-       "        )\n",
-       "        (3): Dropout(p=0.1, inplace=False)\n",
-       "        (4): Sequential(\n",
-       "          (0): ConvTranspose2d(96, 64, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))\n",
-       "          (1): LeakyReLU(negative_slope=0.01, inplace=True)\n",
-       "        )\n",
-       "        (5): Dropout(p=0.1, inplace=False)\n",
-       "        (6): Sequential(\n",
-       "          (0): ConvTranspose2d(64, 32, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))\n",
-       "          (1): LeakyReLU(negative_slope=0.01, inplace=True)\n",
-       "        )\n",
-       "        (7): Dropout(p=0.1, inplace=False)\n",
-       "        (8): ConvTranspose2d(32, 1, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))\n",
-       "        (9): Tanh()\n",
-       "      )\n",
-       "    )\n",
-       "  )\n",
-       ")"
-      ]
-     },
-     "execution_count": 22,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "vae"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -342,275 +97,259 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
-    "proj = nn.Conv2d(1, 32, kernel_size=16, stride=16)"
+    "vae.encoder(datum)[0].shape"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
-    "x = proj(datum)"
+    "vae(datum)[0].shape"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "torch.Size([2, 32, 36, 40])"
-      ]
-     },
-     "execution_count": 8,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
-    "x.shape"
+    "from text_recognizer.networks.backbones.efficientnet import EfficientNet"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
-    "xx = x.flatten(2)"
+    "en = EfficientNet()"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 3,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "torch.Size([2, 32, 1440])"
-      ]
-     },
-     "execution_count": 10,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
-    "xx.shape"
+    "datum = torch.randn([2, 1, 576, 640])"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [],
    "source": [
-    "xxx = xx.transpose(1,2)"
+    "trg = torch.randint(0, 1000, [2, 682])"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "torch.Size([2, 1440, 32])"
-      ]
-     },
-     "execution_count": 12,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
-    "xxx.shape"
+    "trg.shape"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
-    "from einops import rearrange"
+    "datum = torch.randn([2, 1, 224, 224])"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
-   "metadata": {},
+   "execution_count": null,
+   "metadata": {
+    "scrolled": false
+   },
    "outputs": [],
    "source": [
-    "xxxx = rearrange(x, \"b c h w -> b ( h w ) c\")"
+    "en(datum).shape"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 5,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "torch.Size([2, 1440, 32])"
-      ]
-     },
-     "execution_count": 15,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
-    "xxxx.shape"
+    "path = \"../training/configs/cnn_transformer.yaml\""
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [],
    "source": [
-    "        B, N, C = x.shape\n",
-    "        H, W = size\n",
-    "        assert N == 1 + H * W\n",
-    "\n",
-    "        # Extract CLS token and image tokens.\n",
-    "        cls_token, img_tokens = x[:, :1], x[:, 1:]                                       # Shape: [B, 1, C], [B, H*W, C].\n",
-    "        \n",
-    "        # Depthwise convolution.\n",
-    "        feat = img_tokens.transpose(1, 2).view(B, C, H, W)"
+    "conf = OmegaConf.load(path)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 22,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [
     {
-     "data": {
-      "text/plain": [
-       "torch.Size([2, 32, 36, 40])"
-      ]
-     },
-     "execution_count": 22,
-     "metadata": {},
-     "output_type": "execute_result"
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "seed: 4711\n",
+      "network:\n",
+      "  desc: Configuration of the PyTorch neural network.\n",
+      "  type: CNNTransformer\n",
+      "  args:\n",
+      "    encoder:\n",
+      "      type: EfficientNet\n",
+      "      args: null\n",
+      "    num_decoder_layers: 4\n",
+      "    hidden_dim: 256\n",
+      "    num_heads: 4\n",
+      "    expansion_dim: 1024\n",
+      "    dropout_rate: 0.1\n",
+      "    transformer_activation: glu\n",
+      "model:\n",
+      "  desc: Configuration of the PyTorch Lightning model.\n",
+      "  type: LitTransformerModel\n",
+      "  args:\n",
+      "    optimizer:\n",
+      "      type: MADGRAD\n",
+      "      args:\n",
+      "        lr: 0.001\n",
+      "        momentum: 0.9\n",
+      "        weight_decay: 0\n",
+      "        eps: 1.0e-06\n",
+      "    lr_scheduler:\n",
+      "      type: OneCycleLR\n",
+      "      args:\n",
+      "        interval: step\n",
+      "        max_lr: 0.001\n",
+      "        three_phase: true\n",
+      "        epochs: 512\n",
+      "        steps_per_epoch: 1246\n",
+      "    criterion:\n",
+      "      type: CrossEntropyLoss\n",
+      "      args:\n",
+      "        weight: None\n",
+      "        ignore_index: -100\n",
+      "        reduction: mean\n",
+      "    monitor: val_loss\n",
+      "    mapping: sentence_piece\n",
+      "data:\n",
+      "  desc: Configuration of the training/test data.\n",
+      "  type: IAMExtendedParagraphs\n",
+      "  args:\n",
+      "    batch_size: 16\n",
+      "    num_workers: 12\n",
+      "    train_fraction: 0.8\n",
+      "    augment: true\n",
+      "callbacks:\n",
+      "- type: ModelCheckpoint\n",
+      "  args:\n",
+      "    monitor: val_loss\n",
+      "    mode: min\n",
+      "    save_last: true\n",
+      "- type: StochasticWeightAveraging\n",
+      "  args:\n",
+      "    swa_epoch_start: 0.8\n",
+      "    swa_lrs: 0.05\n",
+      "    annealing_epochs: 10\n",
+      "    annealing_strategy: cos\n",
+      "    device: null\n",
+      "- type: LearningRateMonitor\n",
+      "  args:\n",
+      "    logging_interval: step\n",
+      "- type: EarlyStopping\n",
+      "  args:\n",
+      "    monitor: val_loss\n",
+      "    mode: min\n",
+      "    patience: 10\n",
+      "trainer:\n",
+      "  desc: Configuration of the PyTorch Lightning Trainer.\n",
+      "  args:\n",
+      "    stochastic_weight_avg: true\n",
+      "    auto_scale_batch_size: binsearch\n",
+      "    gradient_clip_val: 0\n",
+      "    fast_dev_run: false\n",
+      "    gpus: 1\n",
+      "    precision: 16\n",
+      "    max_epochs: 512\n",
+      "    terminate_on_nan: true\n",
+      "    weights_summary: true\n",
+      "load_checkpoint: null\n",
+      "\n"
+     ]
     }
    ],
    "source": [
-    "xxx.transpose(1, 2).view(2, 32, 36, 40).shape"
+    "print(OmegaConf.to_yaml(conf))"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
-   "metadata": {
-    "scrolled": true
-   },
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "72.0"
-      ]
-     },
-     "execution_count": 18,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
    "source": [
-    "576 / 8"
+    "from text_recognizer.networks.cnn_transformer import CNNTransformer"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": 9,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "80.0"
-      ]
-     },
-     "execution_count": 19,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
-    "640 / 8"
+    "t = CNNTransformer(input_shape=(1, 576, 640), output_shape=(682, 1), **conf.network.args)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 26,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "torch.Size([2, 1, 576, 640])"
-      ]
-     },
-     "execution_count": 26,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
-    "datum.shape"
+    "t.encode(datum).shape"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 27,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "torch.Size([2, 128, 18, 20])"
-      ]
-     },
-     "execution_count": 27,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
-    "vae.encoder(datum)[0].shape"
+    "trg.shape"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 87,
+   "execution_count": 10,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "torch.Size([2, 1, 576, 640])"
+       "torch.Size([2, 682, 1004])"
       ]
      },
-     "execution_count": 87,
+     "execution_count": 10,
      "metadata": {},
      "output_type": "execute_result"
    }
    ],
    "source": [
-    "vae(datum)[0].shape"
+    "t(datum, trg).shape"
    ]
   },
   {