path: root/notebooks/00-testing-stuff-out.ipynb
author     Gustaf Rydholm <gustaf.rydholm@gmail.com>  2021-04-24 23:09:20 +0200
committer  Gustaf Rydholm <gustaf.rydholm@gmail.com>  2021-04-24 23:09:20 +0200
commit     4e60c836fb710baceba570c28c06437db3ad5c9b (patch)
tree       21caf6d1792bd83a47fb3d372ee7120211e83f18 /notebooks/00-testing-stuff-out.ipynb
parent     1ca8b0b9e0613c1e02f6a5d8b49e20c4d6916412 (diff)
Implementing CoaT transformer, continue tomorrow...
Diffstat (limited to 'notebooks/00-testing-stuff-out.ipynb')
-rw-r--r--  notebooks/00-testing-stuff-out.ipynb  403
1 file changed, 285 insertions(+), 118 deletions(-)
diff --git a/notebooks/00-testing-stuff-out.ipynb b/notebooks/00-testing-stuff-out.ipynb
index d4840ef..e6cf099 100644
--- a/notebooks/00-testing-stuff-out.ipynb
+++ b/notebooks/00-testing-stuff-out.ipynb
@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
@@ -25,7 +25,7 @@
},
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
@@ -34,7 +34,7 @@
},
{
"cell_type": "code",
- "execution_count": 5,
+ "execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
@@ -43,7 +43,7 @@
},
{
"cell_type": "code",
- "execution_count": 74,
+ "execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
@@ -52,7 +52,7 @@
},
{
"cell_type": "code",
- "execution_count": 75,
+ "execution_count": 19,
"metadata": {},
"outputs": [
{
@@ -62,31 +62,37 @@
"seed: 4711\n",
"network:\n",
" desc: Configuration of the PyTorch neural network.\n",
- " type: ImageTransformer\n",
+ " type: VQVAE\n",
" args:\n",
" in_channels: 1\n",
" channels:\n",
- " - 128\n",
- " - 64\n",
" - 32\n",
+ " - 64\n",
+ " - 96\n",
+ " - 96\n",
+ " - 128\n",
" kernel_sizes:\n",
" - 4\n",
" - 4\n",
" - 4\n",
+ " - 4\n",
+ " - 4\n",
" strides:\n",
" - 2\n",
" - 2\n",
" - 2\n",
- " num_residual_layers: 4\n",
+ " - 2\n",
+ " - 2\n",
+ " num_residual_layers: 2\n",
" embedding_dim: 128\n",
" num_embeddings: 1024\n",
" upsampling: null\n",
- " beta: 6.6\n",
+ " beta: 0.25\n",
" activation: leaky_relu\n",
- " dropout_rate: 0.25\n",
+ " dropout_rate: 0.1\n",
"model:\n",
" desc: Configuration of the PyTorch Lightning model.\n",
- " type: LitTransformerModel\n",
+ " type: LitVQVAEModel\n",
" args:\n",
" optimizer:\n",
" type: MADGRAD\n",
@@ -96,18 +102,16 @@
" weight_decay: 0\n",
" eps: 1.0e-06\n",
" lr_scheduler:\n",
- " type: OneCycle\n",
+ " type: OneCycleLR\n",
" args:\n",
" interval: step\n",
" max_lr: 0.001\n",
" three_phase: true\n",
- " epochs: 512\n",
- " steps_per_epoch: 1246\n",
+ " epochs: 1024\n",
+ " steps_per_epoch: 317\n",
" criterion:\n",
- " type: CrossEntropyLoss\n",
+ " type: MSELoss\n",
" args:\n",
- " weight: None\n",
- " ignore_index: -100\n",
" reduction: mean\n",
" monitor: val_loss\n",
" mapping: sentence_piece\n",
@@ -115,7 +119,7 @@
" desc: Configuration of the training/test data.\n",
" type: IAMExtendedParagraphs\n",
" args:\n",
- " batch_size: 16\n",
+ " batch_size: 64\n",
" num_workers: 12\n",
" train_fraction: 0.8\n",
" augment: true\n",
@@ -125,33 +129,21 @@
" monitor: val_loss\n",
" mode: min\n",
" save_last: true\n",
- "- type: StochasticWeightAveraging\n",
- " args:\n",
- " swa_epoch_start: 0.8\n",
- " swa_lrs: 0.05\n",
- " annealing_epochs: 10\n",
- " annealing_strategy: cos\n",
- " device: null\n",
"- type: LearningRateMonitor\n",
" args:\n",
" logging_interval: step\n",
- "- type: EarlyStopping\n",
- " args:\n",
- " monitor: val_loss\n",
- " mode: min\n",
- " patience: 10\n",
"trainer:\n",
" desc: Configuration of the PyTorch Lightning Trainer.\n",
" args:\n",
- " stochastic_weight_avg: true\n",
+ " stochastic_weight_avg: false\n",
" auto_scale_batch_size: binsearch\n",
" gradient_clip_val: 0\n",
" fast_dev_run: false\n",
" gpus: 1\n",
" precision: 16\n",
- " max_epochs: 512\n",
+ " max_epochs: 1024\n",
" terminate_on_nan: true\n",
- " weights_summary: true\n",
+ " weights_summary: full\n",
"load_checkpoint: null\n",
"\n"
]
@@ -163,7 +155,7 @@
},
{
"cell_type": "code",
- "execution_count": 76,
+ "execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
@@ -172,7 +164,7 @@
},
{
"cell_type": "code",
- "execution_count": 78,
+ "execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
@@ -181,7 +173,7 @@
},
{
"cell_type": "code",
- "execution_count": 79,
+ "execution_count": 22,
"metadata": {},
"outputs": [
{
@@ -194,50 +186,44 @@
" (0): Conv2d(1, 32, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))\n",
" (1): LeakyReLU(negative_slope=0.01, inplace=True)\n",
" )\n",
- " (1): Dropout(p=0.25, inplace=False)\n",
+ " (1): Dropout(p=0.1, inplace=False)\n",
" (2): Sequential(\n",
" (0): Conv2d(32, 64, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))\n",
" (1): LeakyReLU(negative_slope=0.01, inplace=True)\n",
" )\n",
- " (3): Dropout(p=0.25, inplace=False)\n",
+ " (3): Dropout(p=0.1, inplace=False)\n",
" (4): Sequential(\n",
- " (0): Conv2d(64, 128, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))\n",
+ " (0): Conv2d(64, 96, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))\n",
" (1): LeakyReLU(negative_slope=0.01, inplace=True)\n",
" )\n",
- " (5): Dropout(p=0.25, inplace=False)\n",
- " (6): _ResidualBlock(\n",
- " (block): Sequential(\n",
- " (0): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
- " (1): ReLU(inplace=True)\n",
- " (2): Conv2d(128, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
- " (3): Dropout(p=0.25, inplace=False)\n",
- " )\n",
+ " (5): Dropout(p=0.1, inplace=False)\n",
+ " (6): Sequential(\n",
+ " (0): Conv2d(96, 96, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))\n",
+ " (1): LeakyReLU(negative_slope=0.01, inplace=True)\n",
" )\n",
- " (7): _ResidualBlock(\n",
- " (block): Sequential(\n",
- " (0): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
- " (1): ReLU(inplace=True)\n",
- " (2): Conv2d(128, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
- " (3): Dropout(p=0.25, inplace=False)\n",
- " )\n",
+ " (7): Dropout(p=0.1, inplace=False)\n",
+ " (8): Sequential(\n",
+ " (0): Conv2d(96, 128, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))\n",
+ " (1): LeakyReLU(negative_slope=0.01, inplace=True)\n",
" )\n",
- " (8): _ResidualBlock(\n",
+ " (9): Dropout(p=0.1, inplace=False)\n",
+ " (10): _ResidualBlock(\n",
" (block): Sequential(\n",
" (0): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
" (1): ReLU(inplace=True)\n",
" (2): Conv2d(128, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
- " (3): Dropout(p=0.25, inplace=False)\n",
+ " (3): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
- " (9): _ResidualBlock(\n",
+ " (11): _ResidualBlock(\n",
" (block): Sequential(\n",
" (0): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
" (1): ReLU(inplace=True)\n",
" (2): Conv2d(128, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
- " (3): Dropout(p=0.25, inplace=False)\n",
+ " (3): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
- " (10): Conv2d(128, 128, kernel_size=(1, 1), stride=(1, 1))\n",
+ " (12): Conv2d(128, 128, kernel_size=(1, 1), stride=(1, 1))\n",
" )\n",
" (vector_quantizer): VectorQuantizer(\n",
" (embedding): Embedding(1024, 128)\n",
@@ -251,7 +237,7 @@
" (0): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
" (1): ReLU(inplace=True)\n",
" (2): Conv2d(128, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
- " (3): Dropout(p=0.25, inplace=False)\n",
+ " (3): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (2): _ResidualBlock(\n",
@@ -259,39 +245,33 @@
" (0): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
" (1): ReLU(inplace=True)\n",
" (2): Conv2d(128, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
- " (3): Dropout(p=0.25, inplace=False)\n",
- " )\n",
- " )\n",
- " (3): _ResidualBlock(\n",
- " (block): Sequential(\n",
- " (0): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
- " (1): ReLU(inplace=True)\n",
- " (2): Conv2d(128, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
- " (3): Dropout(p=0.25, inplace=False)\n",
- " )\n",
- " )\n",
- " (4): _ResidualBlock(\n",
- " (block): Sequential(\n",
- " (0): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
- " (1): ReLU(inplace=True)\n",
- " (2): Conv2d(128, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
- " (3): Dropout(p=0.25, inplace=False)\n",
+ " (3): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (upsampling_block): Sequential(\n",
" (0): Sequential(\n",
- " (0): ConvTranspose2d(128, 64, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))\n",
+ " (0): ConvTranspose2d(128, 96, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))\n",
" (1): LeakyReLU(negative_slope=0.01, inplace=True)\n",
" )\n",
- " (1): Dropout(p=0.25, inplace=False)\n",
+ " (1): Dropout(p=0.1, inplace=False)\n",
" (2): Sequential(\n",
+ " (0): ConvTranspose2d(96, 96, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))\n",
+ " (1): LeakyReLU(negative_slope=0.01, inplace=True)\n",
+ " )\n",
+ " (3): Dropout(p=0.1, inplace=False)\n",
+ " (4): Sequential(\n",
+ " (0): ConvTranspose2d(96, 64, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))\n",
+ " (1): LeakyReLU(negative_slope=0.01, inplace=True)\n",
+ " )\n",
+ " (5): Dropout(p=0.1, inplace=False)\n",
+ " (6): Sequential(\n",
" (0): ConvTranspose2d(64, 32, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))\n",
" (1): LeakyReLU(negative_slope=0.01, inplace=True)\n",
" )\n",
- " (3): Dropout(p=0.25, inplace=False)\n",
- " (4): ConvTranspose2d(32, 1, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))\n",
- " (5): Tanh()\n",
+ " (7): Dropout(p=0.1, inplace=False)\n",
+ " (8): ConvTranspose2d(32, 1, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))\n",
+ " (9): Tanh()\n",
" )\n",
" (decoder): Sequential(\n",
" (0): Sequential(\n",
@@ -301,7 +281,7 @@
" (0): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
" (1): ReLU(inplace=True)\n",
" (2): Conv2d(128, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
- " (3): Dropout(p=0.25, inplace=False)\n",
+ " (3): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (2): _ResidualBlock(\n",
@@ -309,46 +289,40 @@
" (0): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
" (1): ReLU(inplace=True)\n",
" (2): Conv2d(128, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
- " (3): Dropout(p=0.25, inplace=False)\n",
- " )\n",
- " )\n",
- " (3): _ResidualBlock(\n",
- " (block): Sequential(\n",
- " (0): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
- " (1): ReLU(inplace=True)\n",
- " (2): Conv2d(128, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
- " (3): Dropout(p=0.25, inplace=False)\n",
- " )\n",
- " )\n",
- " (4): _ResidualBlock(\n",
- " (block): Sequential(\n",
- " (0): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
- " (1): ReLU(inplace=True)\n",
- " (2): Conv2d(128, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
- " (3): Dropout(p=0.25, inplace=False)\n",
+ " (3): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (1): Sequential(\n",
" (0): Sequential(\n",
- " (0): ConvTranspose2d(128, 64, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))\n",
+ " (0): ConvTranspose2d(128, 96, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))\n",
" (1): LeakyReLU(negative_slope=0.01, inplace=True)\n",
" )\n",
- " (1): Dropout(p=0.25, inplace=False)\n",
+ " (1): Dropout(p=0.1, inplace=False)\n",
" (2): Sequential(\n",
+ " (0): ConvTranspose2d(96, 96, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))\n",
+ " (1): LeakyReLU(negative_slope=0.01, inplace=True)\n",
+ " )\n",
+ " (3): Dropout(p=0.1, inplace=False)\n",
+ " (4): Sequential(\n",
+ " (0): ConvTranspose2d(96, 64, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))\n",
+ " (1): LeakyReLU(negative_slope=0.01, inplace=True)\n",
+ " )\n",
+ " (5): Dropout(p=0.1, inplace=False)\n",
+ " (6): Sequential(\n",
" (0): ConvTranspose2d(64, 32, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))\n",
" (1): LeakyReLU(negative_slope=0.01, inplace=True)\n",
" )\n",
- " (3): Dropout(p=0.25, inplace=False)\n",
- " (4): ConvTranspose2d(32, 1, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))\n",
- " (5): Tanh()\n",
+ " (7): Dropout(p=0.1, inplace=False)\n",
+ " (8): ConvTranspose2d(32, 1, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))\n",
+ " (9): Tanh()\n",
" )\n",
" )\n",
" )\n",
")"
]
},
- "execution_count": 79,
+ "execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
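(Note: the summary printed above is the encoder of the new five-stage VQVAE: channels 32-64-96-96-128, kernel 4, stride 2, dropout 0.1. Below is a minimal sketch rebuilding just the downsampling stack with plain torch.nn modules, omitting the two shape-preserving residual blocks; illustrative, not the repo's VQVAE class.)

import torch
from torch import nn

def down_block(in_ch, out_ch):
    # Conv + LeakyReLU pair, as printed in the encoder summary.
    return nn.Sequential(
        nn.Conv2d(in_ch, out_ch, kernel_size=4, stride=2, padding=1),
        nn.LeakyReLU(negative_slope=0.01, inplace=True),
    )

channels = [1, 32, 64, 96, 96, 128]
layers = []
for in_ch, out_ch in zip(channels[:-1], channels[1:]):
    layers += [down_block(in_ch, out_ch), nn.Dropout(p=0.1)]
layers.append(nn.Conv2d(128, 128, kernel_size=1))  # projection to embedding_dim

encoder = nn.Sequential(*layers)
x = torch.randn(2, 1, 576, 640)
print(encoder(x).shape)  # torch.Size([2, 128, 18, 20]) -- five stride-2 stages: 576/32, 640/32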
@@ -359,36 +333,229 @@
},
{
"cell_type": "code",
- "execution_count": 80,
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "datum = torch.randn([2, 1, 576, 640])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "proj = nn.Conv2d(1, 32, kernel_size=16, stride=16)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "x = proj(datum)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "tensor([1.])"
+ "torch.Size([2, 32, 36, 40])"
]
},
- "execution_count": 80,
+ "execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "torch.Tensor([1])"
+ "x.shape"
]
},
{
"cell_type": "code",
- "execution_count": 81,
+ "execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
- "datum = torch.randn([2, 1, 576, 640])"
+ "xx = x.flatten(2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "torch.Size([2, 32, 1440])"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "xx.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "xxx = xx.transpose(1,2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "torch.Size([2, 1440, 32])"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "xxx.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from einops import rearrange"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "xxxx = rearrange(x, \"b c h w -> b ( h w ) c\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "torch.Size([2, 1440, 32])"
+ ]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "xxxx.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ " B, N, C = x.shape\n",
+ " H, W = size\n",
+ " assert N == 1 + H * W\n",
+ "\n",
+ " # Extract CLS token and image tokens.\n",
+ " cls_token, img_tokens = x[:, :1], x[:, 1:] # Shape: [B, 1, C], [B, H*W, C].\n",
+ " \n",
+ " # Depthwise convolution.\n",
+ " feat = img_tokens.transpose(1, 2).view(B, C, H, W)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "torch.Size([2, 32, 36, 40])"
+ ]
+ },
+ "execution_count": 22,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "xxx.transpose(1, 2).view(2, 32, 36, 40).shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "72.0"
+ ]
+ },
+ "execution_count": 18,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "576 / 8"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "80.0"
+ ]
+ },
+ "execution_count": 19,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "640 / 8"
]
},
{
"cell_type": "code",
- "execution_count": 82,
+ "execution_count": 26,
"metadata": {},
"outputs": [
{
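(Note: the cells added in the hunk above prototype a ViT-style patch embedding and the CoaT trick of splitting off a class token before reshaping image tokens back into a 2D feature map. A consolidated sketch of what those cells compute, using only torch and einops; names are illustrative.)

import torch
from torch import nn
from einops import rearrange

datum = torch.randn(2, 1, 576, 640)
proj = nn.Conv2d(1, 32, kernel_size=16, stride=16)    # 16x16 patches: 576/16=36, 640/16=40
feat = proj(datum)                                    # [2, 32, 36, 40]

tokens = feat.flatten(2).transpose(1, 2)              # [2, 1440, 32]; 36*40 = 1440 patch tokens
assert torch.equal(tokens, rearrange(feat, "b c h w -> b (h w) c"))  # same tokens via einops

# CoaT-style round trip: keep a class token aside and reshape the image tokens
# back into a 2D map (e.g. for a depthwise convolution), as in the pasted snippet.
cls_token = torch.zeros(2, 1, 32)                     # hypothetical class token
x = torch.cat([cls_token, tokens], dim=1)             # [2, 1 + 36*40, 32]
B, N, C = x.shape
H, W = 36, 40
assert N == 1 + H * W
img_tokens = x[:, 1:]                                 # drop the class token
feat_map = img_tokens.transpose(1, 2).view(B, C, H, W)
print(feat_map.shape)                                 # torch.Size([2, 32, 36, 40])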
@@ -397,7 +564,7 @@
"torch.Size([2, 1, 576, 640])"
]
},
- "execution_count": 82,
+ "execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
@@ -408,16 +575,16 @@
},
{
"cell_type": "code",
- "execution_count": 85,
+ "execution_count": 27,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "torch.Size([2, 128, 72, 80])"
+ "torch.Size([2, 128, 18, 20])"
]
},
- "execution_count": 85,
+ "execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}