summaryrefslogtreecommitdiff
path: root/notebooks
diff options
context:
space:
mode:
authorGustaf Rydholm <gustaf.rydholm@gmail.com>2021-08-04 05:03:51 +0200
committerGustaf Rydholm <gustaf.rydholm@gmail.com>2021-08-04 05:03:51 +0200
commitd3afa310f77f47553586eeee58e3d3345a754e2c (patch)
tree08b7de1daf2550852d0a1e4d4d75202f14bb03d4 /notebooks
parent65d5f6c694e73792e40ed693a1381a792da8d277 (diff)
New VQVAE
Diffstat (limited to 'notebooks')
-rw-r--r--notebooks/00-scratch-pad.ipynb220
-rw-r--r--notebooks/05c-test-model-end-to-end.ipynb367
2 files changed, 378 insertions, 209 deletions
diff --git a/notebooks/00-scratch-pad.ipynb b/notebooks/00-scratch-pad.ipynb
index a193107..9f056bc 100644
--- a/notebooks/00-scratch-pad.ipynb
+++ b/notebooks/00-scratch-pad.ipynb
@@ -29,6 +29,209 @@
},
{
"cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "t = torch.randint(0, 5, (4, 4))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "36"
+ ]
+ },
+ "execution_count": 19,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "576 // 16"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "40"
+ ]
+ },
+ "execution_count": 22,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "640 // 16"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "1440"
+ ]
+ },
+ "execution_count": 24,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "36 * 40"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "tensor([[0, 1, 2, 1],\n",
+ " [1, 2, 3, 3],\n",
+ " [2, 2, 3, 3],\n",
+ " [4, 0, 2, 4]])"
+ ]
+ },
+ "execution_count": 16,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "t"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "t = torch.randint(0, 5, (1, 4, 4, 4))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "tensor([[[[2, 3, 3, 3],\n",
+ " [3, 4, 4, 2],\n",
+ " [2, 3, 0, 0],\n",
+ " [4, 3, 4, 0]],\n",
+ "\n",
+ " [[3, 0, 3, 0],\n",
+ " [1, 4, 1, 3],\n",
+ " [2, 3, 3, 3],\n",
+ " [2, 3, 3, 1]],\n",
+ "\n",
+ " [[1, 1, 0, 3],\n",
+ " [1, 3, 0, 4],\n",
+ " [3, 1, 4, 2],\n",
+ " [3, 1, 4, 3]],\n",
+ "\n",
+ " [[3, 2, 3, 4],\n",
+ " [3, 2, 3, 3],\n",
+ " [0, 2, 2, 3],\n",
+ " [4, 0, 3, 4]]]])"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "t"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "torch.Size([1, 4, 16])"
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "t.flatten(start_dim=2).shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "tensor([[[2, 3, 3, 3, 3, 4, 4, 2, 2, 3, 0, 0, 4, 3, 4, 0],\n",
+ " [3, 0, 3, 0, 1, 4, 1, 3, 2, 3, 3, 3, 2, 3, 3, 1],\n",
+ " [1, 1, 0, 3, 1, 3, 0, 4, 3, 1, 4, 2, 3, 1, 4, 3],\n",
+ " [3, 2, 3, 4, 3, 2, 3, 3, 0, 2, 2, 3, 4, 0, 3, 4]]])"
+ ]
+ },
+ "execution_count": 14,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "t.flatten(start_dim=2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "ename": "TypeError",
+ "evalue": "__init__() got an unexpected keyword argument 'dim'",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
+ "\u001b[0;32m/tmp/ipykernel_6532/3641656095.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mflatten\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mFlatten\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdim\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
+ "\u001b[0;31mTypeError\u001b[0m: __init__() got an unexpected keyword argument 'dim'"
+ ]
+ }
+ ],
+ "source": [
+ "flatten = nn.Flatten(stdim=2)"
+ ]
+ },
+ {
+ "cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
@@ -561,9 +764,22 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 65,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "ename": "TypeError",
+ "evalue": "__init__() missing 4 required positional arguments: 'attn_fn', 'norm_fn', 'ff_fn', and 'rotary_emb'",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
+ "\u001b[0;32m/tmp/ipykernel_9275/689714588.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mdecoder\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mDecoder\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdim\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m128\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdepth\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnum_heads\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m8\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mff_kwargs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m{\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mattn_kwargs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m{\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcross_attend\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
+ "\u001b[0;32m~/projects/text-recognizer/text_recognizer/networks/transformer/layers.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, **kwargs)\u001b[0m\n\u001b[1;32m 104\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__init__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mAny\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 105\u001b[0m \u001b[0;32massert\u001b[0m \u001b[0;34m\"causal\"\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"Cannot set causality on decoder\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 106\u001b[0;31m \u001b[0msuper\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__init__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcausal\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
+ "\u001b[0;31mTypeError\u001b[0m: __init__() missing 4 required positional arguments: 'attn_fn', 'norm_fn', 'ff_fn', and 'rotary_emb'"
+ ]
+ }
+ ],
"source": [
"decoder = Decoder(dim=128, depth=2, num_heads=8, ff_kwargs={}, attn_kwargs={}, cross_attend=True)"
]
diff --git a/notebooks/05c-test-model-end-to-end.ipynb b/notebooks/05c-test-model-end-to-end.ipynb
index e3e92e2..850d205 100644
--- a/notebooks/05c-test-model-end-to-end.ipynb
+++ b/notebooks/05c-test-model-end-to-end.ipynb
@@ -2,19 +2,10 @@
"cells": [
{
"cell_type": "code",
- "execution_count": 4,
+ "execution_count": 1,
"id": "1e40a88b",
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "The autoreload extension is already loaded. To reload it, use:\n",
- " %reload_ext autoreload\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"%load_ext autoreload\n",
"%autoreload 2\n",
@@ -34,7 +25,7 @@
},
{
"cell_type": "code",
- "execution_count": 5,
+ "execution_count": 2,
"id": "d3a6146b-94b1-4618-a4e4-00f8e23ffdb0",
"metadata": {},
"outputs": [],
@@ -47,67 +38,8 @@
{
"cell_type": "code",
"execution_count": 3,
- "id": "6b722ca0-9c65-4f90-be4e-b7334ea81237",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "_target_: text_recognizer.models.transformer.TransformerLitModel\n",
- "interval: step\n",
- "monitor: val/loss\n",
- "start_token: <s>\n",
- "end_token: <e>\n",
- "pad_token: <p>\n",
- "\n",
- "{'_target_': 'text_recognizer.models.transformer.TransformerLitModel', 'interval': 'step', 'monitor': 'val/loss', 'start_token': '<s>', 'end_token': '<e>', 'pad_token': '<p>'}\n"
- ]
- }
- ],
- "source": [
- "# context initialization\n",
- "with initialize(config_path=\"../training/conf/model/\", job_name=\"test_app\"):\n",
- " cfg = compose(config_name=\"lit_transformer\")\n",
- " print(OmegaConf.to_yaml(cfg))\n",
- " print(cfg)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "5e6b49ce-7685-4491-bd0a-51487f06a237",
- "metadata": {},
- "outputs": [],
- "source": [
- "# context initialization\n",
- "with initialize(config_path=\"../training/conf/mapping/\", job_name=\"test_app\"):\n",
- " cfg = compose(config_name=\"word_piece\")\n",
- " print(OmegaConf.to_yaml(cfg))\n",
- " print(cfg)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "9c797159-845e-42c6-bd65-1c976ad627cd",
- "metadata": {},
- "outputs": [],
- "source": [
- "# context initialization\n",
- "with initialize(config_path=\"../training/conf/network/\", job_name=\"test_app\"):\n",
- " cfg = compose(config_name=\"conv_transformer\")\n",
- " print(OmegaConf.to_yaml(cfg))\n",
- " print(cfg)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
"id": "764c8736-7d68-4261-a57d-face10ebbf42",
- "metadata": {
- "tags": []
- },
+ "metadata": {},
"outputs": [
{
"name": "stdout",
@@ -122,8 +54,7 @@
" mode: min\n",
" verbose: false\n",
" dirpath: checkpoints/\n",
- " filename:\n",
- " epoch:02d: null\n",
+ " filename: '{epoch:02d}'\n",
" learning_rate_monitor:\n",
" _target_: pytorch_lightning.callbacks.LearningRateMonitor\n",
" logging_interval: step\n",
@@ -139,20 +70,20 @@
" _target_: callbacks.wandb_callbacks.UploadCheckpointsAsArtifact\n",
" ckpt_dir: checkpoints/\n",
" upload_best_only: true\n",
- " log_text_predictions:\n",
- " _target_: callbacks.wandb_callbacks.LogTextPredictions\n",
+ " log_image_reconstruction:\n",
+ " _target_: callbacks.wandb_callbacks.LogReconstuctedImages\n",
" num_samples: 8\n",
"criterion:\n",
- " _target_: text_recognizer.criterions.label_smoothing.LabelSmoothingLoss\n",
- " smoothing: 0.1\n",
- " ignore_index: 1002\n",
+ " _target_: torch.nn.MSELoss\n",
+ " reduction: mean\n",
"datamodule:\n",
" _target_: text_recognizer.data.iam_extended_paragraphs.IAMExtendedParagraphs\n",
- " batch_size: 8\n",
+ " batch_size: 32\n",
" num_workers: 12\n",
" train_fraction: 0.8\n",
" augment: true\n",
" pin_memory: false\n",
+ " word_pieces: true\n",
"logger:\n",
" wandb:\n",
" _target_: pytorch_lightning.loggers.wandb.WandbLogger\n",
@@ -170,8 +101,8 @@
" _target_: torch.optim.lr_scheduler.OneCycleLR\n",
" max_lr: 0.001\n",
" total_steps: null\n",
- " epochs: 512\n",
- " steps_per_epoch: 4992\n",
+ " epochs: 64\n",
+ " steps_per_epoch: 624\n",
" pct_start: 0.3\n",
" anneal_strategy: cos\n",
" cycle_momentum: true\n",
@@ -199,52 +130,21 @@
"\n",
" '\n",
"model:\n",
- " _target_: text_recognizer.models.transformer.TransformerLitModel\n",
+ " _target_: text_recognizer.models.vqvae.VQVAELitModel\n",
" interval: step\n",
" monitor: val/loss\n",
- " max_output_len: 451\n",
- " start_token: <s>\n",
- " end_token: <e>\n",
- " pad_token: <p>\n",
"network:\n",
- " encoder:\n",
- " _target_: text_recognizer.networks.encoders.efficientnet.EfficientNet\n",
- " arch: b0\n",
- " out_channels: 1280\n",
- " stochastic_dropout_rate: 0.2\n",
- " bn_momentum: 0.99\n",
- " bn_eps: 0.001\n",
- " decoder:\n",
- " _target_: text_recognizer.networks.transformer.Decoder\n",
- " dim: 96\n",
- " depth: 2\n",
- " num_heads: 8\n",
- " attn_fn: text_recognizer.networks.transformer.attention.Attention\n",
- " attn_kwargs:\n",
- " dim_head: 16\n",
- " dropout_rate: 0.2\n",
- " norm_fn: torch.nn.LayerNorm\n",
- " ff_fn: text_recognizer.networks.transformer.mlp.FeedForward\n",
- " ff_kwargs:\n",
- " dim_out: null\n",
- " expansion_factor: 4\n",
- " glu: true\n",
- " dropout_rate: 0.2\n",
- " cross_attend: true\n",
- " pre_norm: true\n",
- " rotary_emb: null\n",
- " _target_: text_recognizer.networks.conv_transformer.ConvTransformer\n",
- " input_dims:\n",
- " - 1\n",
- " - 576\n",
- " - 640\n",
- " hidden_dim: 96\n",
- " dropout_rate: 0.2\n",
- " num_classes: 1006\n",
- " pad_index: 1002\n",
+ " _target_: text_recognizer.networks.vqvae.VQVAE\n",
+ " in_channels: 1\n",
+ " res_channels: 32\n",
+ " num_residual_layers: 2\n",
+ " embedding_dim: 64\n",
+ " num_embeddings: 512\n",
+ " decay: 0.99\n",
+ " activation: mish\n",
"optimizer:\n",
" _target_: madgrad.MADGRAD\n",
- " lr: 0.001\n",
+ " lr: 0.01\n",
" momentum: 0.9\n",
" weight_decay: 0\n",
" eps: 1.0e-06\n",
@@ -257,7 +157,7 @@
" fast_dev_run: false\n",
" gpus: 1\n",
" precision: 16\n",
- " max_epochs: 512\n",
+ " max_epochs: 64\n",
" terminate_on_nan: true\n",
" weights_summary: top\n",
" limit_train_batches: 1.0\n",
@@ -269,91 +169,181 @@
"train: true\n",
"test: true\n",
"logging: INFO\n",
+ "work_dir: ${hydra:runtime.cwd}\n",
"debug: false\n",
+ "print_config: true\n",
+ "ignore_warnings: true\n",
"\n",
- "{'callbacks': {'model_checkpoint': {'_target_': 'pytorch_lightning.callbacks.ModelCheckpoint', 'monitor': 'val/loss', 'save_top_k': 1, 'save_last': True, 'mode': 'min', 'verbose': False, 'dirpath': 'checkpoints/', 'filename': {'epoch:02d': None}}, 'learning_rate_monitor': {'_target_': 'pytorch_lightning.callbacks.LearningRateMonitor', 'logging_interval': 'step', 'log_momentum': False}, 'watch_model': {'_target_': 'callbacks.wandb_callbacks.WatchModel', 'log': 'all', 'log_freq': 100}, 'upload_code_as_artifact': {'_target_': 'callbacks.wandb_callbacks.UploadCodeAsArtifact', 'project_dir': '${work_dir}/text_recognizer'}, 'upload_ckpts_as_artifact': {'_target_': 'callbacks.wandb_callbacks.UploadCheckpointsAsArtifact', 'ckpt_dir': 'checkpoints/', 'upload_best_only': True}, 'log_text_predictions': {'_target_': 'callbacks.wandb_callbacks.LogTextPredictions', 'num_samples': 8}}, 'criterion': {'_target_': 'text_recognizer.criterions.label_smoothing.LabelSmoothingLoss', 'smoothing': 0.1, 'ignore_index': 1002}, 'datamodule': {'_target_': 'text_recognizer.data.iam_extended_paragraphs.IAMExtendedParagraphs', 'batch_size': 8, 'num_workers': 12, 'train_fraction': 0.8, 'augment': True, 'pin_memory': False}, 'logger': {'wandb': {'_target_': 'pytorch_lightning.loggers.wandb.WandbLogger', 'project': 'text-recognizer', 'name': None, 'save_dir': '.', 'offline': False, 'id': None, 'log_model': False, 'prefix': '', 'job_type': 'train', 'group': '', 'tags': []}}, 'lr_scheduler': {'_target_': 'torch.optim.lr_scheduler.OneCycleLR', 'max_lr': 0.001, 'total_steps': None, 'epochs': 512, 'steps_per_epoch': 4992, 'pct_start': 0.3, 'anneal_strategy': 'cos', 'cycle_momentum': True, 'base_momentum': 0.85, 'max_momentum': 0.95, 'div_factor': 25.0, 'final_div_factor': 10000.0, 'three_phase': True, 'last_epoch': -1, 'verbose': False}, 'mapping': {'_target_': 'text_recognizer.data.word_piece_mapping.WordPieceMapping', 'num_features': 1000, 'tokens': 'iamdb_1kwp_tokens_1000.txt', 'lexicon': 'iamdb_1kwp_lex_1000.txt', 'data_dir': None, 'use_words': False, 'prepend_wordsep': False, 'special_tokens': ['<s>', '<e>', '<p>'], 'extra_symbols': ['\\n']}, 'model': {'_target_': 'text_recognizer.models.transformer.TransformerLitModel', 'interval': 'step', 'monitor': 'val/loss', 'max_output_len': 451, 'start_token': '<s>', 'end_token': '<e>', 'pad_token': '<p>'}, 'network': {'encoder': {'_target_': 'text_recognizer.networks.encoders.efficientnet.EfficientNet', 'arch': 'b0', 'out_channels': 1280, 'stochastic_dropout_rate': 0.2, 'bn_momentum': 0.99, 'bn_eps': 0.001}, 'decoder': {'_target_': 'text_recognizer.networks.transformer.Decoder', 'dim': 96, 'depth': 2, 'num_heads': 8, 'attn_fn': 'text_recognizer.networks.transformer.attention.Attention', 'attn_kwargs': {'dim_head': 16, 'dropout_rate': 0.2}, 'norm_fn': 'torch.nn.LayerNorm', 'ff_fn': 'text_recognizer.networks.transformer.mlp.FeedForward', 'ff_kwargs': {'dim_out': None, 'expansion_factor': 4, 'glu': True, 'dropout_rate': 0.2}, 'cross_attend': True, 'pre_norm': True, 'rotary_emb': None}, '_target_': 'text_recognizer.networks.conv_transformer.ConvTransformer', 'input_dims': [1, 576, 640], 'hidden_dim': 96, 'dropout_rate': 0.2, 'num_classes': 1006, 'pad_index': 1002}, 'optimizer': {'_target_': 'madgrad.MADGRAD', 'lr': 0.001, 'momentum': 0.9, 'weight_decay': 0, 'eps': 1e-06}, 'trainer': {'_target_': 'pytorch_lightning.Trainer', 'stochastic_weight_avg': False, 'auto_scale_batch_size': 'binsearch', 'auto_lr_find': False, 'gradient_clip_val': 0, 'fast_dev_run': False, 'gpus': 1, 'precision': 16, 'max_epochs': 512, 'terminate_on_nan': True, 'weights_summary': 'top', 'limit_train_batches': 1.0, 'limit_val_batches': 1.0, 'limit_test_batches': 1.0, 'resume_from_checkpoint': None}, 'seed': 4711, 'tune': False, 'train': True, 'test': True, 'logging': 'INFO', 'debug': False}\n"
+ "{'callbacks': {'model_checkpoint': {'_target_': 'pytorch_lightning.callbacks.ModelCheckpoint', 'monitor': 'val/loss', 'save_top_k': 1, 'save_last': True, 'mode': 'min', 'verbose': False, 'dirpath': 'checkpoints/', 'filename': '{epoch:02d}'}, 'learning_rate_monitor': {'_target_': 'pytorch_lightning.callbacks.LearningRateMonitor', 'logging_interval': 'step', 'log_momentum': False}, 'watch_model': {'_target_': 'callbacks.wandb_callbacks.WatchModel', 'log': 'all', 'log_freq': 100}, 'upload_code_as_artifact': {'_target_': 'callbacks.wandb_callbacks.UploadCodeAsArtifact', 'project_dir': '${work_dir}/text_recognizer'}, 'upload_ckpts_as_artifact': {'_target_': 'callbacks.wandb_callbacks.UploadCheckpointsAsArtifact', 'ckpt_dir': 'checkpoints/', 'upload_best_only': True}, 'log_image_reconstruction': {'_target_': 'callbacks.wandb_callbacks.LogReconstuctedImages', 'num_samples': 8}}, 'criterion': {'_target_': 'torch.nn.MSELoss', 'reduction': 'mean'}, 'datamodule': {'_target_': 'text_recognizer.data.iam_extended_paragraphs.IAMExtendedParagraphs', 'batch_size': 32, 'num_workers': 12, 'train_fraction': 0.8, 'augment': True, 'pin_memory': False, 'word_pieces': True}, 'logger': {'wandb': {'_target_': 'pytorch_lightning.loggers.wandb.WandbLogger', 'project': 'text-recognizer', 'name': None, 'save_dir': '.', 'offline': False, 'id': None, 'log_model': False, 'prefix': '', 'job_type': 'train', 'group': '', 'tags': []}}, 'lr_scheduler': {'_target_': 'torch.optim.lr_scheduler.OneCycleLR', 'max_lr': 0.001, 'total_steps': None, 'epochs': 64, 'steps_per_epoch': 624, 'pct_start': 0.3, 'anneal_strategy': 'cos', 'cycle_momentum': True, 'base_momentum': 0.85, 'max_momentum': 0.95, 'div_factor': 25.0, 'final_div_factor': 10000.0, 'three_phase': True, 'last_epoch': -1, 'verbose': False}, 'mapping': {'_target_': 'text_recognizer.data.word_piece_mapping.WordPieceMapping', 'num_features': 1000, 'tokens': 'iamdb_1kwp_tokens_1000.txt', 'lexicon': 'iamdb_1kwp_lex_1000.txt', 'data_dir': None, 'use_words': False, 'prepend_wordsep': False, 'special_tokens': ['<s>', '<e>', '<p>'], 'extra_symbols': ['\\n']}, 'model': {'_target_': 'text_recognizer.models.vqvae.VQVAELitModel', 'interval': 'step', 'monitor': 'val/loss'}, 'network': {'_target_': 'text_recognizer.networks.vqvae.VQVAE', 'in_channels': 1, 'res_channels': 32, 'num_residual_layers': 2, 'embedding_dim': 64, 'num_embeddings': 512, 'decay': 0.99, 'activation': 'mish'}, 'optimizer': {'_target_': 'madgrad.MADGRAD', 'lr': 0.01, 'momentum': 0.9, 'weight_decay': 0, 'eps': 1e-06}, 'trainer': {'_target_': 'pytorch_lightning.Trainer', 'stochastic_weight_avg': False, 'auto_scale_batch_size': 'binsearch', 'auto_lr_find': False, 'gradient_clip_val': 0, 'fast_dev_run': False, 'gpus': 1, 'precision': 16, 'max_epochs': 64, 'terminate_on_nan': True, 'weights_summary': 'top', 'limit_train_batches': 1.0, 'limit_val_batches': 1.0, 'limit_test_batches': 1.0, 'resume_from_checkpoint': None}, 'seed': 4711, 'tune': False, 'train': True, 'test': True, 'logging': 'INFO', 'work_dir': '${hydra:runtime.cwd}', 'debug': False, 'print_config': True, 'ignore_warnings': True}\n"
]
}
],
"source": [
"# context initialization\n",
"with initialize(config_path=\"../training/conf/\", job_name=\"test_app\"):\n",
- " cfg = compose(config_name=\"config\")\n",
+ " cfg = compose(config_name=\"config\", overrides=[\"+experiment=vqvae\"])\n",
" print(OmegaConf.to_yaml(cfg))\n",
" print(cfg)"
]
},
{
"cell_type": "code",
- "execution_count": 10,
- "id": "9382f0ab-8760-4d59-b0b5-b8b65dd1ea31",
+ "execution_count": 4,
+ "id": "c1a9aa6b-6405-4ffe-b065-02340762476a",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2021-08-04 04:49:04.188 | DEBUG | text_recognizer.data.word_piece_mapping:__init__:37 - Using data dir: /home/aktersnurra/projects/text-recognizer/data/downloaded/iam/iamdb\n"
+ ]
+ }
+ ],
+ "source": [
+ "mapping = instantiate(cfg.mapping)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 35,
+ "id": "969ba3be-d78f-4b1e-b522-ea8a42669e86",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "network = instantiate(cfg.network)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 36,
+ "id": "6147cd3e-0ad1-490f-917d-21be9bb8ce1c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "x = torch.rand(1, 1, 576, 640)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 37,
+ "id": "a0ecea0c-abaf-4d5d-a13d-c085c1e4d282",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "{'model_checkpoint': {'_target_': 'pytorch_lightning.callbacks.ModelCheckpoint', 'monitor': 'val/loss', 'save_top_k': 1, 'save_last': True, 'mode': 'min', 'verbose': False, 'dirpath': 'checkpoints/', 'filename': {'epoch:02d': None}}, 'learning_rate_monitor': {'_target_': 'pytorch_lightning.callbacks.LearningRateMonitor', 'logging_interval': 'step', 'log_momentum': False}, 'watch_model': {'_target_': 'callbacks.wandb_callbacks.WatchModel', 'log': 'all', 'log_freq': 100}, 'upload_code_as_artifact': {'_target_': 'callbacks.wandb_callbacks.UploadCodeAsArtifact', 'project_dir': '${work_dir}/text_recognizer'}, 'upload_ckpts_as_artifact': {'_target_': 'callbacks.wandb_callbacks.UploadCheckpointsAsArtifact', 'ckpt_dir': 'checkpoints/', 'upload_best_only': True}, 'log_text_predictions': {'_target_': 'callbacks.wandb_callbacks.LogTextPredictions', 'num_samples': 8}}"
+ "torch.Size([1, 64, 144, 160])"
]
},
- "execution_count": 10,
+ "execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "cfg.get(\"callbacks\")"
+ "network.encode(x)[0].shape"
]
},
{
"cell_type": "code",
- "execution_count": 12,
- "id": "216d5680-66bf-4190-9401-1a59dbbc43af",
+ "execution_count": 38,
+ "id": "a7b9f249-7e5e-4f31-bbe1-cfd6d3701cf0",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "pytorch_lightning.callbacks.ModelCheckpoint\n",
- "pytorch_lightning.callbacks.LearningRateMonitor\n",
- "callbacks.wandb_callbacks.WatchModel\n",
- "callbacks.wandb_callbacks.UploadCodeAsArtifact\n",
- "callbacks.wandb_callbacks.UploadCheckpointsAsArtifact\n",
- "callbacks.wandb_callbacks.LogTextPredictions\n"
+ "torch.Size([512])\n",
+ "torch.Size([512])\n",
+ "torch.Size([512])\n",
+ "torch.Size([512])\n"
]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "torch.Size([1, 1, 576, 640])"
+ ]
+ },
+ "execution_count": 38,
+ "metadata": {},
+ "output_type": "execute_result"
}
],
"source": [
- "for l in cfg.callbacks.values():\n",
- " print(l.get(\"_target_\"))"
+ "network(x)[0].shape"
]
},
{
"cell_type": "code",
- "execution_count": 4,
- "id": "c1a9aa6b-6405-4ffe-b065-02340762476a",
+ "execution_count": null,
+ "id": "23c9d90c-042b-423e-ab85-18449e29ded4",
"metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "2021-08-03 15:27:02.069 | DEBUG | text_recognizer.data.word_piece_mapping:__init__:37 - Using data dir: /home/aktersnurra/projects/text-recognizer/data/downloaded/iam/iamdb\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
- "mapping = instantiate(cfg.mapping)"
+ "576 / 4"
]
},
{
"cell_type": "code",
- "execution_count": 5,
- "id": "969ba3be-d78f-4b1e-b522-ea8a42669e86",
+ "execution_count": null,
+ "id": "047ebc09-1c74-44a7-a314-1099f09722fe",
"metadata": {},
"outputs": [],
"source": [
- "network = instantiate(cfg.network)"
+ "t = torch.randint(0, 1006, (1, 451)).cuda()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "87372dde-2b1a-432b-ab79-0b116124c724",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "z = torch.rand((1, 36 * 40, 128)).cuda()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "cf7ca9bf-cafa-4128-9db7-046c16933a52",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "network = network.cuda()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "dfceaa5f-9ad8-4d33-addb-c56e8da48356",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "network.decode(z, t).shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9105fbbb-4363-4d3e-a01e-bc519c3b9c3a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "decoder = decoder.cuda()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c5797ec4-7a6a-46fd-8adc-265df44d0341",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "decoder(z, t).shape"
]
},
{
@@ -368,11 +358,9 @@
},
{
"cell_type": "code",
- "execution_count": 8,
+ "execution_count": null,
"id": "a6fae1fa-492d-4648-80fd-1c0dac659b02",
- "metadata": {
- "tags": []
- },
+ "metadata": {},
"outputs": [],
"source": [
"datamodule = instantiate(cfg.datamodule, mapping=mapping)"
@@ -380,19 +368,10 @@
},
{
"cell_type": "code",
- "execution_count": 9,
+ "execution_count": null,
"id": "514053ef-fcac-4f3c-a7c8-72c6927d6798",
"metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "2021-08-03 15:28:22.541 | INFO | text_recognizer.data.iam_paragraphs:setup:95 - Loading IAM paragraph regions and lines for None...\n",
- "2021-08-03 15:28:45.280 | INFO | text_recognizer.data.iam_synthetic_paragraphs:setup:68 - IAM Synthetic dataset steup for stage None...\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"datamodule.prepare_data()\n",
"datamodule.setup()"
@@ -400,21 +379,10 @@
},
{
"cell_type": "code",
- "execution_count": 11,
+ "execution_count": null,
"id": "4bad950b-a197-4c60-ad89-903124659a98",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "4992"
- ]
- },
- "execution_count": 11,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
"len(datamodule.train_dataloader())"
]
@@ -431,7 +399,7 @@
},
{
"cell_type": "code",
- "execution_count": 7,
+ "execution_count": null,
"id": "f6e01c15-9a1b-4036-87ae-78716c592264",
"metadata": {},
"outputs": [],
@@ -441,7 +409,7 @@
},
{
"cell_type": "code",
- "execution_count": 8,
+ "execution_count": null,
"id": "4dc475fc-31f4-487e-88c8-b0f445131f5b",
"metadata": {},
"outputs": [],
@@ -451,7 +419,7 @@
},
{
"cell_type": "code",
- "execution_count": 9,
+ "execution_count": null,
"id": "c5c8ed64-d98c-47b5-baf2-1ba57a6c882f",
"metadata": {},
"outputs": [],
@@ -461,11 +429,9 @@
},
{
"cell_type": "code",
- "execution_count": 12,
+ "execution_count": null,
"id": "b5ff5b24-f804-402b-a8ab-f366443025ca",
- "metadata": {
- "tags": []
- },
+ "metadata": {},
"outputs": [],
"source": [
" model = hydra.utils.instantiate(\n",
@@ -481,21 +447,10 @@
},
{
"cell_type": "code",
- "execution_count": 11,
+ "execution_count": null,
"id": "99f8a39f-8b10-4f7d-8bff-52794fd48717",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "<bound method WordPieceMapping.get_index of <text_recognizer.data.word_piece_mapping.WordPieceMapping object at 0x7fae3b489610>>"
- ]
- },
- "execution_count": 11,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
"mapping.get_index"
]
@@ -514,9 +469,7 @@
"cell_type": "code",
"execution_count": null,
"id": "8f0742ad-5e2f-42d5-83e7-6e46398b4f0f",
- "metadata": {
- "tags": []
- },
+ "metadata": {},
"outputs": [],
"source": [
"net"