summaryrefslogtreecommitdiff
path: root/notebooks/00-testing-stuff-out.ipynb
diff options
context:
space:
mode:
Diffstat (limited to 'notebooks/00-testing-stuff-out.ipynb')
-rw-r--r--notebooks/00-testing-stuff-out.ipynb555
1 files changed, 147 insertions, 408 deletions
diff --git a/notebooks/00-testing-stuff-out.ipynb b/notebooks/00-testing-stuff-out.ipynb
index e6cf099..7c7b3a6 100644
--- a/notebooks/00-testing-stuff-out.ipynb
+++ b/notebooks/00-testing-stuff-out.ipynb
@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
- "execution_count": 4,
+ "execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
@@ -25,7 +25,7 @@
},
{
"cell_type": "code",
- "execution_count": 16,
+ "execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
@@ -34,7 +34,7 @@
},
{
"cell_type": "code",
- "execution_count": 17,
+ "execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -43,7 +43,7 @@
},
{
"cell_type": "code",
- "execution_count": 18,
+ "execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -52,110 +52,16 @@
},
{
"cell_type": "code",
- "execution_count": 19,
+ "execution_count": null,
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "seed: 4711\n",
- "network:\n",
- " desc: Configuration of the PyTorch neural network.\n",
- " type: VQVAE\n",
- " args:\n",
- " in_channels: 1\n",
- " channels:\n",
- " - 32\n",
- " - 64\n",
- " - 96\n",
- " - 96\n",
- " - 128\n",
- " kernel_sizes:\n",
- " - 4\n",
- " - 4\n",
- " - 4\n",
- " - 4\n",
- " - 4\n",
- " strides:\n",
- " - 2\n",
- " - 2\n",
- " - 2\n",
- " - 2\n",
- " - 2\n",
- " num_residual_layers: 2\n",
- " embedding_dim: 128\n",
- " num_embeddings: 1024\n",
- " upsampling: null\n",
- " beta: 0.25\n",
- " activation: leaky_relu\n",
- " dropout_rate: 0.1\n",
- "model:\n",
- " desc: Configuration of the PyTorch Lightning model.\n",
- " type: LitVQVAEModel\n",
- " args:\n",
- " optimizer:\n",
- " type: MADGRAD\n",
- " args:\n",
- " lr: 0.001\n",
- " momentum: 0.9\n",
- " weight_decay: 0\n",
- " eps: 1.0e-06\n",
- " lr_scheduler:\n",
- " type: OneCycleLR\n",
- " args:\n",
- " interval: step\n",
- " max_lr: 0.001\n",
- " three_phase: true\n",
- " epochs: 1024\n",
- " steps_per_epoch: 317\n",
- " criterion:\n",
- " type: MSELoss\n",
- " args:\n",
- " reduction: mean\n",
- " monitor: val_loss\n",
- " mapping: sentence_piece\n",
- "data:\n",
- " desc: Configuration of the training/test data.\n",
- " type: IAMExtendedParagraphs\n",
- " args:\n",
- " batch_size: 64\n",
- " num_workers: 12\n",
- " train_fraction: 0.8\n",
- " augment: true\n",
- "callbacks:\n",
- "- type: ModelCheckpoint\n",
- " args:\n",
- " monitor: val_loss\n",
- " mode: min\n",
- " save_last: true\n",
- "- type: LearningRateMonitor\n",
- " args:\n",
- " logging_interval: step\n",
- "trainer:\n",
- " desc: Configuration of the PyTorch Lightning Trainer.\n",
- " args:\n",
- " stochastic_weight_avg: false\n",
- " auto_scale_batch_size: binsearch\n",
- " gradient_clip_val: 0\n",
- " fast_dev_run: false\n",
- " gpus: 1\n",
- " precision: 16\n",
- " max_epochs: 1024\n",
- " terminate_on_nan: true\n",
- " weights_summary: full\n",
- "load_checkpoint: null\n",
- "\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"print(OmegaConf.to_yaml(conf))"
]
},
{
"cell_type": "code",
- "execution_count": 20,
+ "execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -164,7 +70,7 @@
},
{
"cell_type": "code",
- "execution_count": 21,
+ "execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -173,167 +79,16 @@
},
{
"cell_type": "code",
- "execution_count": 22,
+ "execution_count": null,
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "VQVAE(\n",
- " (encoder): Encoder(\n",
- " (encoder): Sequential(\n",
- " (0): Sequential(\n",
- " (0): Conv2d(1, 32, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))\n",
- " (1): LeakyReLU(negative_slope=0.01, inplace=True)\n",
- " )\n",
- " (1): Dropout(p=0.1, inplace=False)\n",
- " (2): Sequential(\n",
- " (0): Conv2d(32, 64, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))\n",
- " (1): LeakyReLU(negative_slope=0.01, inplace=True)\n",
- " )\n",
- " (3): Dropout(p=0.1, inplace=False)\n",
- " (4): Sequential(\n",
- " (0): Conv2d(64, 96, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))\n",
- " (1): LeakyReLU(negative_slope=0.01, inplace=True)\n",
- " )\n",
- " (5): Dropout(p=0.1, inplace=False)\n",
- " (6): Sequential(\n",
- " (0): Conv2d(96, 96, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))\n",
- " (1): LeakyReLU(negative_slope=0.01, inplace=True)\n",
- " )\n",
- " (7): Dropout(p=0.1, inplace=False)\n",
- " (8): Sequential(\n",
- " (0): Conv2d(96, 128, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))\n",
- " (1): LeakyReLU(negative_slope=0.01, inplace=True)\n",
- " )\n",
- " (9): Dropout(p=0.1, inplace=False)\n",
- " (10): _ResidualBlock(\n",
- " (block): Sequential(\n",
- " (0): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
- " (1): ReLU(inplace=True)\n",
- " (2): Conv2d(128, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
- " (3): Dropout(p=0.1, inplace=False)\n",
- " )\n",
- " )\n",
- " (11): _ResidualBlock(\n",
- " (block): Sequential(\n",
- " (0): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
- " (1): ReLU(inplace=True)\n",
- " (2): Conv2d(128, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
- " (3): Dropout(p=0.1, inplace=False)\n",
- " )\n",
- " )\n",
- " (12): Conv2d(128, 128, kernel_size=(1, 1), stride=(1, 1))\n",
- " )\n",
- " (vector_quantizer): VectorQuantizer(\n",
- " (embedding): Embedding(1024, 128)\n",
- " )\n",
- " )\n",
- " (decoder): Decoder(\n",
- " (res_block): Sequential(\n",
- " (0): Conv2d(128, 128, kernel_size=(1, 1), stride=(1, 1))\n",
- " (1): _ResidualBlock(\n",
- " (block): Sequential(\n",
- " (0): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
- " (1): ReLU(inplace=True)\n",
- " (2): Conv2d(128, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
- " (3): Dropout(p=0.1, inplace=False)\n",
- " )\n",
- " )\n",
- " (2): _ResidualBlock(\n",
- " (block): Sequential(\n",
- " (0): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
- " (1): ReLU(inplace=True)\n",
- " (2): Conv2d(128, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
- " (3): Dropout(p=0.1, inplace=False)\n",
- " )\n",
- " )\n",
- " )\n",
- " (upsampling_block): Sequential(\n",
- " (0): Sequential(\n",
- " (0): ConvTranspose2d(128, 96, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))\n",
- " (1): LeakyReLU(negative_slope=0.01, inplace=True)\n",
- " )\n",
- " (1): Dropout(p=0.1, inplace=False)\n",
- " (2): Sequential(\n",
- " (0): ConvTranspose2d(96, 96, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))\n",
- " (1): LeakyReLU(negative_slope=0.01, inplace=True)\n",
- " )\n",
- " (3): Dropout(p=0.1, inplace=False)\n",
- " (4): Sequential(\n",
- " (0): ConvTranspose2d(96, 64, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))\n",
- " (1): LeakyReLU(negative_slope=0.01, inplace=True)\n",
- " )\n",
- " (5): Dropout(p=0.1, inplace=False)\n",
- " (6): Sequential(\n",
- " (0): ConvTranspose2d(64, 32, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))\n",
- " (1): LeakyReLU(negative_slope=0.01, inplace=True)\n",
- " )\n",
- " (7): Dropout(p=0.1, inplace=False)\n",
- " (8): ConvTranspose2d(32, 1, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))\n",
- " (9): Tanh()\n",
- " )\n",
- " (decoder): Sequential(\n",
- " (0): Sequential(\n",
- " (0): Conv2d(128, 128, kernel_size=(1, 1), stride=(1, 1))\n",
- " (1): _ResidualBlock(\n",
- " (block): Sequential(\n",
- " (0): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
- " (1): ReLU(inplace=True)\n",
- " (2): Conv2d(128, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
- " (3): Dropout(p=0.1, inplace=False)\n",
- " )\n",
- " )\n",
- " (2): _ResidualBlock(\n",
- " (block): Sequential(\n",
- " (0): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
- " (1): ReLU(inplace=True)\n",
- " (2): Conv2d(128, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
- " (3): Dropout(p=0.1, inplace=False)\n",
- " )\n",
- " )\n",
- " )\n",
- " (1): Sequential(\n",
- " (0): Sequential(\n",
- " (0): ConvTranspose2d(128, 96, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))\n",
- " (1): LeakyReLU(negative_slope=0.01, inplace=True)\n",
- " )\n",
- " (1): Dropout(p=0.1, inplace=False)\n",
- " (2): Sequential(\n",
- " (0): ConvTranspose2d(96, 96, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))\n",
- " (1): LeakyReLU(negative_slope=0.01, inplace=True)\n",
- " )\n",
- " (3): Dropout(p=0.1, inplace=False)\n",
- " (4): Sequential(\n",
- " (0): ConvTranspose2d(96, 64, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))\n",
- " (1): LeakyReLU(negative_slope=0.01, inplace=True)\n",
- " )\n",
- " (5): Dropout(p=0.1, inplace=False)\n",
- " (6): Sequential(\n",
- " (0): ConvTranspose2d(64, 32, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))\n",
- " (1): LeakyReLU(negative_slope=0.01, inplace=True)\n",
- " )\n",
- " (7): Dropout(p=0.1, inplace=False)\n",
- " (8): ConvTranspose2d(32, 1, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))\n",
- " (9): Tanh()\n",
- " )\n",
- " )\n",
- " )\n",
- ")"
- ]
- },
- "execution_count": 22,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
"vae"
]
},
{
"cell_type": "code",
- "execution_count": 5,
+ "execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -342,275 +97,259 @@
},
{
"cell_type": "code",
- "execution_count": 6,
+ "execution_count": null,
"metadata": {},
"outputs": [],
"source": [
- "proj = nn.Conv2d(1, 32, kernel_size=16, stride=16)"
+ "vae.encoder(datum)[0].shape"
]
},
{
"cell_type": "code",
- "execution_count": 7,
+ "execution_count": null,
"metadata": {},
"outputs": [],
"source": [
- "x = proj(datum)"
+ "vae(datum)[0].shape"
]
},
{
"cell_type": "code",
- "execution_count": 8,
+ "execution_count": null,
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "torch.Size([2, 32, 36, 40])"
- ]
- },
- "execution_count": 8,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
- "x.shape"
+ "from text_recognizer.networks.backbones.efficientnet import EfficientNet"
]
},
{
"cell_type": "code",
- "execution_count": 9,
+ "execution_count": null,
"metadata": {},
"outputs": [],
"source": [
- "xx = x.flatten(2)"
+ "en = EfficientNet()"
]
},
{
"cell_type": "code",
- "execution_count": 10,
+ "execution_count": 3,
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "torch.Size([2, 32, 1440])"
- ]
- },
- "execution_count": 10,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
- "xx.shape"
+ "datum = torch.randn([2, 1, 576, 640])"
]
},
{
"cell_type": "code",
- "execution_count": 11,
+ "execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
- "xxx = xx.transpose(1,2)"
+ "trg = torch.randint(0, 1000, [2, 682])"
]
},
{
"cell_type": "code",
- "execution_count": 12,
+ "execution_count": null,
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "torch.Size([2, 1440, 32])"
- ]
- },
- "execution_count": 12,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
- "xxx.shape"
+ "trg.shape"
]
},
{
"cell_type": "code",
- "execution_count": 13,
+ "execution_count": null,
"metadata": {},
"outputs": [],
"source": [
- "from einops import rearrange"
+ "datum = torch.randn([2, 1, 224, 224])"
]
},
{
"cell_type": "code",
- "execution_count": 14,
- "metadata": {},
+ "execution_count": null,
+ "metadata": {
+ "scrolled": false
+ },
"outputs": [],
"source": [
- "xxxx = rearrange(x, \"b c h w -> b ( h w ) c\")"
+ "en(datum).shape"
]
},
{
"cell_type": "code",
- "execution_count": 15,
+ "execution_count": 5,
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "torch.Size([2, 1440, 32])"
- ]
- },
- "execution_count": 15,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
- "xxxx.shape"
+ "path = \"../training/configs/cnn_transformer.yaml\""
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
- " B, N, C = x.shape\n",
- " H, W = size\n",
- " assert N == 1 + H * W\n",
- "\n",
- " # Extract CLS token and image tokens.\n",
- " cls_token, img_tokens = x[:, :1], x[:, 1:] # Shape: [B, 1, C], [B, H*W, C].\n",
- " \n",
- " # Depthwise convolution.\n",
- " feat = img_tokens.transpose(1, 2).view(B, C, H, W)"
+ "conf = OmegaConf.load(path)"
]
},
{
"cell_type": "code",
- "execution_count": 22,
+ "execution_count": 7,
"metadata": {},
"outputs": [
{
- "data": {
- "text/plain": [
- "torch.Size([2, 32, 36, 40])"
- ]
- },
- "execution_count": 22,
- "metadata": {},
- "output_type": "execute_result"
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "seed: 4711\n",
+ "network:\n",
+ " desc: Configuration of the PyTorch neural network.\n",
+ " type: CNNTransformer\n",
+ " args:\n",
+ " encoder:\n",
+ " type: EfficientNet\n",
+ " args: null\n",
+ " num_decoder_layers: 4\n",
+ " hidden_dim: 256\n",
+ " num_heads: 4\n",
+ " expansion_dim: 1024\n",
+ " dropout_rate: 0.1\n",
+ " transformer_activation: glu\n",
+ "model:\n",
+ " desc: Configuration of the PyTorch Lightning model.\n",
+ " type: LitTransformerModel\n",
+ " args:\n",
+ " optimizer:\n",
+ " type: MADGRAD\n",
+ " args:\n",
+ " lr: 0.001\n",
+ " momentum: 0.9\n",
+ " weight_decay: 0\n",
+ " eps: 1.0e-06\n",
+ " lr_scheduler:\n",
+ " type: OneCycleLR\n",
+ " args:\n",
+ " interval: step\n",
+ " max_lr: 0.001\n",
+ " three_phase: true\n",
+ " epochs: 512\n",
+ " steps_per_epoch: 1246\n",
+ " criterion:\n",
+ " type: CrossEntropyLoss\n",
+ " args:\n",
+ " weight: None\n",
+ " ignore_index: -100\n",
+ " reduction: mean\n",
+ " monitor: val_loss\n",
+ " mapping: sentence_piece\n",
+ "data:\n",
+ " desc: Configuration of the training/test data.\n",
+ " type: IAMExtendedParagraphs\n",
+ " args:\n",
+ " batch_size: 16\n",
+ " num_workers: 12\n",
+ " train_fraction: 0.8\n",
+ " augment: true\n",
+ "callbacks:\n",
+ "- type: ModelCheckpoint\n",
+ " args:\n",
+ " monitor: val_loss\n",
+ " mode: min\n",
+ " save_last: true\n",
+ "- type: StochasticWeightAveraging\n",
+ " args:\n",
+ " swa_epoch_start: 0.8\n",
+ " swa_lrs: 0.05\n",
+ " annealing_epochs: 10\n",
+ " annealing_strategy: cos\n",
+ " device: null\n",
+ "- type: LearningRateMonitor\n",
+ " args:\n",
+ " logging_interval: step\n",
+ "- type: EarlyStopping\n",
+ " args:\n",
+ " monitor: val_loss\n",
+ " mode: min\n",
+ " patience: 10\n",
+ "trainer:\n",
+ " desc: Configuration of the PyTorch Lightning Trainer.\n",
+ " args:\n",
+ " stochastic_weight_avg: true\n",
+ " auto_scale_batch_size: binsearch\n",
+ " gradient_clip_val: 0\n",
+ " fast_dev_run: false\n",
+ " gpus: 1\n",
+ " precision: 16\n",
+ " max_epochs: 512\n",
+ " terminate_on_nan: true\n",
+ " weights_summary: true\n",
+ "load_checkpoint: null\n",
+ "\n"
+ ]
}
],
"source": [
- "xxx.transpose(1, 2).view(2, 32, 36, 40).shape"
+ "print(OmegaConf.to_yaml(conf))"
]
},
{
"cell_type": "code",
- "execution_count": 18,
- "metadata": {
- "scrolled": true
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "72.0"
- ]
- },
- "execution_count": 18,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [],
"source": [
- "576 / 8"
+ "from text_recognizer.networks.cnn_transformer import CNNTransformer"
]
},
{
"cell_type": "code",
- "execution_count": 19,
+ "execution_count": 9,
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "80.0"
- ]
- },
- "execution_count": 19,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
- "640 / 8"
+ "t = CNNTransformer(input_shape=(1, 576, 640), output_shape=(682, 1), **conf.network.args)"
]
},
{
"cell_type": "code",
- "execution_count": 26,
+ "execution_count": null,
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "torch.Size([2, 1, 576, 640])"
- ]
- },
- "execution_count": 26,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
- "datum.shape"
+ "t.encode(datum).shape"
]
},
{
"cell_type": "code",
- "execution_count": 27,
+ "execution_count": null,
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "torch.Size([2, 128, 18, 20])"
- ]
- },
- "execution_count": 27,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
- "vae.encoder(datum)[0].shape"
+ "trg.shape"
]
},
{
"cell_type": "code",
- "execution_count": 87,
+ "execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "torch.Size([2, 1, 576, 640])"
+ "torch.Size([2, 682, 1004])"
]
},
- "execution_count": 87,
+ "execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "vae(datum)[0].shape"
+ "t(datum, trg).shape"
]
},
{