20 files changed, 786 insertions, 568 deletions
diff --git a/notebooks/00-scratch-pad.ipynb b/notebooks/00-scratch-pad.ipynb
index a193107..9f056bc 100644
--- a/notebooks/00-scratch-pad.ipynb
+++ b/notebooks/00-scratch-pad.ipynb
@@ -29,6 +29,209 @@
   },
   {
    "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "t = torch.randint(0, 5, (4, 4))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "36"
+      ]
+     },
+     "execution_count": 19,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "576 // 16"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "40"
+      ]
+     },
+     "execution_count": 22,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "640 // 16"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "1440"
+      ]
+     },
+     "execution_count": 24,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "36 * 40"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "tensor([[0, 1, 2, 1],\n",
+       "        [1, 2, 3, 3],\n",
+       "        [2, 2, 3, 3],\n",
+       "        [4, 0, 2, 4]])"
+      ]
+     },
+     "execution_count": 16,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "t"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "t = torch.randint(0, 5, (1, 4, 4, 4))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "tensor([[[[2, 3, 3, 3],\n",
+       "          [3, 4, 4, 2],\n",
+       "          [2, 3, 0, 0],\n",
+       "          [4, 3, 4, 0]],\n",
+       "\n",
+       "         [[3, 0, 3, 0],\n",
+       "          [1, 4, 1, 3],\n",
+       "          [2, 3, 3, 3],\n",
+       "          [2, 3, 3, 1]],\n",
+       "\n",
+       "         [[1, 1, 0, 3],\n",
+       "          [1, 3, 0, 4],\n",
+       "          [3, 1, 4, 2],\n",
+       "          [3, 1, 4, 3]],\n",
+       "\n",
+       "         [[3, 2, 3, 4],\n",
+       "          [3, 2, 3, 3],\n",
+       "          [0, 2, 2, 3],\n",
+       "          [4, 0, 3, 4]]]])"
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "t"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "torch.Size([1, 4, 16])"
+      ]
+     },
+     "execution_count": 13,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "t.flatten(start_dim=2).shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "tensor([[[2, 3, 3, 3, 3, 4, 4, 2, 2, 3, 0, 0, 4, 3, 4, 0],\n",
+       "         [3, 0, 3, 0, 1, 4, 1, 3, 2, 3, 3, 3, 2, 3, 3, 1],\n",
+       "         [1, 1, 0, 3, 1, 3, 0, 4, 3, 1, 4, 2, 3, 1, 4, 3],\n",
+       "         [3, 2, 3, 4, 3, 2, 3, 3, 0, 2, 2, 3, 4, 0, 3, 4]]])"
+      ]
+     },
+     "execution_count": 14,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "t.flatten(start_dim=2)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "TypeError",
+     "evalue": "__init__() got an unexpected keyword argument 'dim'",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mTypeError\u001b[0m                                 Traceback (most recent call last)",
+      "\u001b[0;32m/tmp/ipykernel_6532/3641656095.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mflatten\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mFlatten\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdim\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
+      "\u001b[0;31mTypeError\u001b[0m: __init__() got an unexpected keyword argument 'dim'"
+     ]
+    }
+   ],
+   "source": [
+    "flatten = nn.Flatten(stdim=2)"
+   ]
+  },
+  {
+   "cell_type": "code",
    "execution_count": 2,
    "metadata": {},
    "outputs": [
@@ -561,9 +764,22 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 65,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "ename": "TypeError",
+     "evalue": "__init__() missing 4 required positional arguments: 'attn_fn', 'norm_fn', 'ff_fn', and 'rotary_emb'",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mTypeError\u001b[0m                                 Traceback (most recent call last)",
+      "\u001b[0;32m/tmp/ipykernel_9275/689714588.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mdecoder\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mDecoder\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdim\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m128\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdepth\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnum_heads\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m8\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mff_kwargs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m{\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mattn_kwargs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m{\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcross_attend\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
+      "\u001b[0;32m~/projects/text-recognizer/text_recognizer/networks/transformer/layers.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, **kwargs)\u001b[0m\n\u001b[1;32m    104\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0m__init__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mAny\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    105\u001b[0m         \u001b[0;32massert\u001b[0m \u001b[0;34m\"causal\"\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"Cannot set causality on decoder\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 106\u001b[0;31m         \u001b[0msuper\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__init__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcausal\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
+      "\u001b[0;31mTypeError\u001b[0m: __init__() missing 4 required positional arguments: 'attn_fn', 'norm_fn', 'ff_fn', and 'rotary_emb'"
+     ]
+    }
+   ],
    "source": [
     "decoder = Decoder(dim=128, depth=2, num_heads=8, ff_kwargs={}, attn_kwargs={}, cross_attend=True)"
    ]
diff --git a/notebooks/05c-test-model-end-to-end.ipynb b/notebooks/05c-test-model-end-to-end.ipynb
index e3e92e2..850d205 100644
--- a/notebooks/05c-test-model-end-to-end.ipynb
+++ b/notebooks/05c-test-model-end-to-end.ipynb
@@ -2,19 +2,10 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 1,
    "id": "1e40a88b",
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "The autoreload extension is already loaded. To reload it, use:\n",
-      "  %reload_ext autoreload\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "%load_ext autoreload\n",
     "%autoreload 2\n",
@@ -34,7 +25,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 2,
    "id": "d3a6146b-94b1-4618-a4e4-00f8e23ffdb0",
    "metadata": {},
    "outputs": [],
@@ -47,67 +38,8 @@
   {
    "cell_type": "code",
    "execution_count": 3,
-   "id": "6b722ca0-9c65-4f90-be4e-b7334ea81237",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "_target_: text_recognizer.models.transformer.TransformerLitModel\n",
-      "interval: step\n",
-      "monitor: val/loss\n",
-      "start_token: <s>\n",
-      "end_token: <e>\n",
-      "pad_token: <p>\n",
-      "\n",
-      "{'_target_': 'text_recognizer.models.transformer.TransformerLitModel', 'interval': 'step', 'monitor': 'val/loss', 'start_token': '<s>', 'end_token': '<e>', 'pad_token': '<p>'}\n"
-     ]
-    }
-   ],
-   "source": [
-    "# context initialization\n",
-    "with initialize(config_path=\"../training/conf/model/\", job_name=\"test_app\"):\n",
-    "    cfg = compose(config_name=\"lit_transformer\")\n",
-    "    print(OmegaConf.to_yaml(cfg))\n",
-    "    print(cfg)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "5e6b49ce-7685-4491-bd0a-51487f06a237",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# context initialization\n",
-    "with initialize(config_path=\"../training/conf/mapping/\", job_name=\"test_app\"):\n",
-    "    cfg = compose(config_name=\"word_piece\")\n",
-    "    print(OmegaConf.to_yaml(cfg))\n",
-    "    print(cfg)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "9c797159-845e-42c6-bd65-1c976ad627cd",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# context initialization\n",
-    "with initialize(config_path=\"../training/conf/network/\", job_name=\"test_app\"):\n",
-    "    cfg = compose(config_name=\"conv_transformer\")\n",
-    "    print(OmegaConf.to_yaml(cfg))\n",
-    "    print(cfg)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
    "id": "764c8736-7d68-4261-a57d-face10ebbf42",
-   "metadata": {
-    "tags": []
-   },
+   "metadata": {},
    "outputs": [
     {
      "name": "stdout",
@@ -122,8 +54,7 @@
       "    mode: min\n",
       "    verbose: false\n",
       "    dirpath: checkpoints/\n",
-      "    filename:\n",
-      "      epoch:02d: null\n",
+      "    filename: '{epoch:02d}'\n",
       "  learning_rate_monitor:\n",
       "    _target_: pytorch_lightning.callbacks.LearningRateMonitor\n",
       "    logging_interval: step\n",
@@ -139,20 +70,20 @@
       "    _target_: callbacks.wandb_callbacks.UploadCheckpointsAsArtifact\n",
       "    ckpt_dir: checkpoints/\n",
       "    upload_best_only: true\n",
-      "  log_text_predictions:\n",
-      "    _target_: callbacks.wandb_callbacks.LogTextPredictions\n",
+      "  log_image_reconstruction:\n",
+      "    _target_: callbacks.wandb_callbacks.LogReconstuctedImages\n",
       "    num_samples: 8\n",
       "criterion:\n",
-      "  _target_: text_recognizer.criterions.label_smoothing.LabelSmoothingLoss\n",
-      "  smoothing: 0.1\n",
-      "  ignore_index: 1002\n",
+      "  _target_: torch.nn.MSELoss\n",
+      "  reduction: mean\n",
       "datamodule:\n",
       "  _target_: text_recognizer.data.iam_extended_paragraphs.IAMExtendedParagraphs\n",
-      "  batch_size: 8\n",
+      "  batch_size: 32\n",
       "  num_workers: 12\n",
       "  train_fraction: 0.8\n",
       "  augment: true\n",
       "  pin_memory: false\n",
+      "  word_pieces: true\n",
       "logger:\n",
       "  wandb:\n",
       "    _target_: pytorch_lightning.loggers.wandb.WandbLogger\n",
@@ -170,8 +101,8 @@
       "  _target_: torch.optim.lr_scheduler.OneCycleLR\n",
       "  max_lr: 0.001\n",
       "  total_steps: null\n",
-      "  epochs: 512\n",
-      "  steps_per_epoch: 4992\n",
+      "  epochs: 64\n",
+      "  steps_per_epoch: 624\n",
       "  pct_start: 0.3\n",
       "  anneal_strategy: cos\n",
       "  cycle_momentum: true\n",
@@ -199,52 +130,21 @@
       "\n",
       "    '\n",
       "model:\n",
-      "  _target_: text_recognizer.models.transformer.TransformerLitModel\n",
+      "  _target_: text_recognizer.models.vqvae.VQVAELitModel\n",
       "  interval: step\n",
       "  monitor: val/loss\n",
-      "  max_output_len: 451\n",
-      "  start_token: <s>\n",
-      "  end_token: <e>\n",
-      "  pad_token: <p>\n",
       "network:\n",
-      "  encoder:\n",
-      "    _target_: text_recognizer.networks.encoders.efficientnet.EfficientNet\n",
-      "    arch: b0\n",
-      "    out_channels: 1280\n",
-      "    stochastic_dropout_rate: 0.2\n",
-      "    bn_momentum: 0.99\n",
-      "    bn_eps: 0.001\n",
-      "  decoder:\n",
-      "    _target_: text_recognizer.networks.transformer.Decoder\n",
-      "    dim: 96\n",
-      "    depth: 2\n",
-      "    num_heads: 8\n",
-      "    attn_fn: text_recognizer.networks.transformer.attention.Attention\n",
-      "    attn_kwargs:\n",
-      "      dim_head: 16\n",
-      "      dropout_rate: 0.2\n",
-      "    norm_fn: torch.nn.LayerNorm\n",
-      "    ff_fn: text_recognizer.networks.transformer.mlp.FeedForward\n",
-      "    ff_kwargs:\n",
-      "      dim_out: null\n",
-      "      expansion_factor: 4\n",
-      "      glu: true\n",
-      "      dropout_rate: 0.2\n",
-      "    cross_attend: true\n",
-      "    pre_norm: true\n",
-      "    rotary_emb: null\n",
-      "  _target_: text_recognizer.networks.conv_transformer.ConvTransformer\n",
-      "  input_dims:\n",
-      "  - 1\n",
-      "  - 576\n",
-      "  - 640\n",
-      "  hidden_dim: 96\n",
-      "  dropout_rate: 0.2\n",
-      "  num_classes: 1006\n",
-      "  pad_index: 1002\n",
+      "  _target_: text_recognizer.networks.vqvae.VQVAE\n",
+      "  in_channels: 1\n",
+      "  res_channels: 32\n",
+      "  num_residual_layers: 2\n",
+      "  embedding_dim: 64\n",
+      "  num_embeddings: 512\n",
+      "  decay: 0.99\n",
+      "  activation: mish\n",
       "optimizer:\n",
       "  _target_: madgrad.MADGRAD\n",
-      "  lr: 0.001\n",
+      "  lr: 0.01\n",
       "  momentum: 0.9\n",
       "  weight_decay: 0\n",
       "  eps: 1.0e-06\n",
@@ -257,7 +157,7 @@
       "  fast_dev_run: false\n",
       "  gpus: 1\n",
       "  precision: 16\n",
-      "  max_epochs: 512\n",
+      "  max_epochs: 64\n",
       "  terminate_on_nan: true\n",
       "  weights_summary: top\n",
       "  limit_train_batches: 1.0\n",
@@ -269,91 +169,181 @@
       "train: true\n",
       "test: true\n",
       "logging: INFO\n",
+      "work_dir: ${hydra:runtime.cwd}\n",
       "debug: false\n",
+      "print_config: true\n",
+      "ignore_warnings: true\n",
       "\n",
-      "{'callbacks': {'model_checkpoint': {'_target_': 'pytorch_lightning.callbacks.ModelCheckpoint', 'monitor': 'val/loss', 'save_top_k': 1, 'save_last': True, 'mode': 'min', 'verbose': False, 'dirpath': 'checkpoints/', 'filename': {'epoch:02d': None}}, 'learning_rate_monitor': {'_target_': 'pytorch_lightning.callbacks.LearningRateMonitor', 'logging_interval': 'step', 'log_momentum': False}, 'watch_model': {'_target_': 'callbacks.wandb_callbacks.WatchModel', 'log': 'all', 'log_freq': 100}, 'upload_code_as_artifact': {'_target_': 'callbacks.wandb_callbacks.UploadCodeAsArtifact', 'project_dir': '${work_dir}/text_recognizer'}, 'upload_ckpts_as_artifact': {'_target_': 'callbacks.wandb_callbacks.UploadCheckpointsAsArtifact', 'ckpt_dir': 'checkpoints/', 'upload_best_only': True}, 'log_text_predictions': {'_target_': 'callbacks.wandb_callbacks.LogTextPredictions', 'num_samples': 8}}, 'criterion': {'_target_': 'text_recognizer.criterions.label_smoothing.LabelSmoothingLoss', 'smoothing': 0.1, 'ignore_index': 1002}, 'datamodule': {'_target_': 'text_recognizer.data.iam_extended_paragraphs.IAMExtendedParagraphs', 'batch_size': 8, 'num_workers': 12, 'train_fraction': 0.8, 'augment': True, 'pin_memory': False}, 'logger': {'wandb': {'_target_': 'pytorch_lightning.loggers.wandb.WandbLogger', 'project': 'text-recognizer', 'name': None, 'save_dir': '.', 'offline': False, 'id': None, 'log_model': False, 'prefix': '', 'job_type': 'train', 'group': '', 'tags': []}}, 'lr_scheduler': {'_target_': 'torch.optim.lr_scheduler.OneCycleLR', 'max_lr': 0.001, 'total_steps': None, 'epochs': 512, 'steps_per_epoch': 4992, 'pct_start': 0.3, 'anneal_strategy': 'cos', 'cycle_momentum': True, 'base_momentum': 0.85, 'max_momentum': 0.95, 'div_factor': 25.0, 'final_div_factor': 10000.0, 'three_phase': True, 'last_epoch': -1, 'verbose': False}, 'mapping': {'_target_': 'text_recognizer.data.word_piece_mapping.WordPieceMapping', 'num_features': 1000, 'tokens': 'iamdb_1kwp_tokens_1000.txt', 'lexicon': 
'iamdb_1kwp_lex_1000.txt', 'data_dir': None, 'use_words': False, 'prepend_wordsep': False, 'special_tokens': ['<s>', '<e>', '<p>'], 'extra_symbols': ['\\n']}, 'model': {'_target_': 'text_recognizer.models.transformer.TransformerLitModel', 'interval': 'step', 'monitor': 'val/loss', 'max_output_len': 451, 'start_token': '<s>', 'end_token': '<e>', 'pad_token': '<p>'}, 'network': {'encoder': {'_target_': 'text_recognizer.networks.encoders.efficientnet.EfficientNet', 'arch': 'b0', 'out_channels': 1280, 'stochastic_dropout_rate': 0.2, 'bn_momentum': 0.99, 'bn_eps': 0.001}, 'decoder': {'_target_': 'text_recognizer.networks.transformer.Decoder', 'dim': 96, 'depth': 2, 'num_heads': 8, 'attn_fn': 'text_recognizer.networks.transformer.attention.Attention', 'attn_kwargs': {'dim_head': 16, 'dropout_rate': 0.2}, 'norm_fn': 'torch.nn.LayerNorm', 'ff_fn': 'text_recognizer.networks.transformer.mlp.FeedForward', 'ff_kwargs': {'dim_out': None, 'expansion_factor': 4, 'glu': True, 'dropout_rate': 0.2}, 'cross_attend': True, 'pre_norm': True, 'rotary_emb': None}, '_target_': 'text_recognizer.networks.conv_transformer.ConvTransformer', 'input_dims': [1, 576, 640], 'hidden_dim': 96, 'dropout_rate': 0.2, 'num_classes': 1006, 'pad_index': 1002}, 'optimizer': {'_target_': 'madgrad.MADGRAD', 'lr': 0.001, 'momentum': 0.9, 'weight_decay': 0, 'eps': 1e-06}, 'trainer': {'_target_': 'pytorch_lightning.Trainer', 'stochastic_weight_avg': False, 'auto_scale_batch_size': 'binsearch', 'auto_lr_find': False, 'gradient_clip_val': 0, 'fast_dev_run': False, 'gpus': 1, 'precision': 16, 'max_epochs': 512, 'terminate_on_nan': True, 'weights_summary': 'top', 'limit_train_batches': 1.0, 'limit_val_batches': 1.0, 'limit_test_batches': 1.0, 'resume_from_checkpoint': None}, 'seed': 4711, 'tune': False, 'train': True, 'test': True, 'logging': 'INFO', 'debug': False}\n"
+      "{'callbacks': {'model_checkpoint': {'_target_': 'pytorch_lightning.callbacks.ModelCheckpoint', 'monitor': 'val/loss', 'save_top_k': 1, 'save_last': True, 'mode': 'min', 'verbose': False, 'dirpath': 'checkpoints/', 'filename': '{epoch:02d}'}, 'learning_rate_monitor': {'_target_': 'pytorch_lightning.callbacks.LearningRateMonitor', 'logging_interval': 'step', 'log_momentum': False}, 'watch_model': {'_target_': 'callbacks.wandb_callbacks.WatchModel', 'log': 'all', 'log_freq': 100}, 'upload_code_as_artifact': {'_target_': 'callbacks.wandb_callbacks.UploadCodeAsArtifact', 'project_dir': '${work_dir}/text_recognizer'}, 'upload_ckpts_as_artifact': {'_target_': 'callbacks.wandb_callbacks.UploadCheckpointsAsArtifact', 'ckpt_dir': 'checkpoints/', 'upload_best_only': True}, 'log_image_reconstruction': {'_target_': 'callbacks.wandb_callbacks.LogReconstuctedImages', 'num_samples': 8}}, 'criterion': {'_target_': 'torch.nn.MSELoss', 'reduction': 'mean'}, 'datamodule': {'_target_': 'text_recognizer.data.iam_extended_paragraphs.IAMExtendedParagraphs', 'batch_size': 32, 'num_workers': 12, 'train_fraction': 0.8, 'augment': True, 'pin_memory': False, 'word_pieces': True}, 'logger': {'wandb': {'_target_': 'pytorch_lightning.loggers.wandb.WandbLogger', 'project': 'text-recognizer', 'name': None, 'save_dir': '.', 'offline': False, 'id': None, 'log_model': False, 'prefix': '', 'job_type': 'train', 'group': '', 'tags': []}}, 'lr_scheduler': {'_target_': 'torch.optim.lr_scheduler.OneCycleLR', 'max_lr': 0.001, 'total_steps': None, 'epochs': 64, 'steps_per_epoch': 624, 'pct_start': 0.3, 'anneal_strategy': 'cos', 'cycle_momentum': True, 'base_momentum': 0.85, 'max_momentum': 0.95, 'div_factor': 25.0, 'final_div_factor': 10000.0, 'three_phase': True, 'last_epoch': -1, 'verbose': False}, 'mapping': {'_target_': 'text_recognizer.data.word_piece_mapping.WordPieceMapping', 'num_features': 1000, 'tokens': 'iamdb_1kwp_tokens_1000.txt', 'lexicon': 'iamdb_1kwp_lex_1000.txt', 'data_dir': None, 
'use_words': False, 'prepend_wordsep': False, 'special_tokens': ['<s>', '<e>', '<p>'], 'extra_symbols': ['\\n']}, 'model': {'_target_': 'text_recognizer.models.vqvae.VQVAELitModel', 'interval': 'step', 'monitor': 'val/loss'}, 'network': {'_target_': 'text_recognizer.networks.vqvae.VQVAE', 'in_channels': 1, 'res_channels': 32, 'num_residual_layers': 2, 'embedding_dim': 64, 'num_embeddings': 512, 'decay': 0.99, 'activation': 'mish'}, 'optimizer': {'_target_': 'madgrad.MADGRAD', 'lr': 0.01, 'momentum': 0.9, 'weight_decay': 0, 'eps': 1e-06}, 'trainer': {'_target_': 'pytorch_lightning.Trainer', 'stochastic_weight_avg': False, 'auto_scale_batch_size': 'binsearch', 'auto_lr_find': False, 'gradient_clip_val': 0, 'fast_dev_run': False, 'gpus': 1, 'precision': 16, 'max_epochs': 64, 'terminate_on_nan': True, 'weights_summary': 'top', 'limit_train_batches': 1.0, 'limit_val_batches': 1.0, 'limit_test_batches': 1.0, 'resume_from_checkpoint': None}, 'seed': 4711, 'tune': False, 'train': True, 'test': True, 'logging': 'INFO', 'work_dir': '${hydra:runtime.cwd}', 'debug': False, 'print_config': True, 'ignore_warnings': True}\n"
      ]
     }
    ],
    "source": [
     "# context initialization\n",
     "with initialize(config_path=\"../training/conf/\", job_name=\"test_app\"):\n",
-    "    cfg = compose(config_name=\"config\")\n",
+    "    cfg = compose(config_name=\"config\", overrides=[\"+experiment=vqvae\"])\n",
     "    print(OmegaConf.to_yaml(cfg))\n",
     "    print(cfg)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
-   "id": "9382f0ab-8760-4d59-b0b5-b8b65dd1ea31",
+   "execution_count": 4,
+   "id": "c1a9aa6b-6405-4ffe-b065-02340762476a",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2021-08-04 04:49:04.188 | DEBUG    | text_recognizer.data.word_piece_mapping:__init__:37 - Using data dir: /home/aktersnurra/projects/text-recognizer/data/downloaded/iam/iamdb\n"
+     ]
+    }
+   ],
+   "source": [
+    "mapping = instantiate(cfg.mapping)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 35,
+   "id": "969ba3be-d78f-4b1e-b522-ea8a42669e86",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "network = instantiate(cfg.network)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 36,
+   "id": "6147cd3e-0ad1-490f-917d-21be9bb8ce1c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "x = torch.rand(1, 1, 576, 640)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 37,
+   "id": "a0ecea0c-abaf-4d5d-a13d-c085c1e4d282",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "{'model_checkpoint': {'_target_': 'pytorch_lightning.callbacks.ModelCheckpoint', 'monitor': 'val/loss', 'save_top_k': 1, 'save_last': True, 'mode': 'min', 'verbose': False, 'dirpath': 'checkpoints/', 'filename': {'epoch:02d': None}}, 'learning_rate_monitor': {'_target_': 'pytorch_lightning.callbacks.LearningRateMonitor', 'logging_interval': 'step', 'log_momentum': False}, 'watch_model': {'_target_': 'callbacks.wandb_callbacks.WatchModel', 'log': 'all', 'log_freq': 100}, 'upload_code_as_artifact': {'_target_': 'callbacks.wandb_callbacks.UploadCodeAsArtifact', 'project_dir': '${work_dir}/text_recognizer'}, 'upload_ckpts_as_artifact': {'_target_': 'callbacks.wandb_callbacks.UploadCheckpointsAsArtifact', 'ckpt_dir': 'checkpoints/', 'upload_best_only': True}, 'log_text_predictions': {'_target_': 'callbacks.wandb_callbacks.LogTextPredictions', 'num_samples': 8}}"
+       "torch.Size([1, 64, 144, 160])"
       ]
      },
-     "execution_count": 10,
+     "execution_count": 37,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "cfg.get(\"callbacks\")"
+    "network.encode(x)[0].shape"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
-   "id": "216d5680-66bf-4190-9401-1a59dbbc43af",
+   "execution_count": 38,
+   "id": "a7b9f249-7e5e-4f31-bbe1-cfd6d3701cf0",
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "pytorch_lightning.callbacks.ModelCheckpoint\n",
-      "pytorch_lightning.callbacks.LearningRateMonitor\n",
-      "callbacks.wandb_callbacks.WatchModel\n",
-      "callbacks.wandb_callbacks.UploadCodeAsArtifact\n",
-      "callbacks.wandb_callbacks.UploadCheckpointsAsArtifact\n",
-      "callbacks.wandb_callbacks.LogTextPredictions\n"
+      "torch.Size([512])\n",
+      "torch.Size([512])\n",
+      "torch.Size([512])\n",
+      "torch.Size([512])\n"
      ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "torch.Size([1, 1, 576, 640])"
+      ]
+     },
+     "execution_count": 38,
+     "metadata": {},
+     "output_type": "execute_result"
     }
    ],
    "source": [
-    "for l in cfg.callbacks.values():\n",
-    "    print(l.get(\"_target_\"))"
+    "network(x)[0].shape"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
-   "id": "c1a9aa6b-6405-4ffe-b065-02340762476a",
+   "execution_count": null,
+   "id": "23c9d90c-042b-423e-ab85-18449e29ded4",
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "2021-08-03 15:27:02.069 | DEBUG    | text_recognizer.data.word_piece_mapping:__init__:37 - Using data dir: /home/aktersnurra/projects/text-recognizer/data/downloaded/iam/iamdb\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
-    "mapping = instantiate(cfg.mapping)"
+    "576 / 4"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
-   "id": "969ba3be-d78f-4b1e-b522-ea8a42669e86",
+   "execution_count": null,
+   "id": "047ebc09-1c74-44a7-a314-1099f09722fe",
    "metadata": {},
    "outputs": [],
    "source": [
-    "network = instantiate(cfg.network)"
+    "t = torch.randint(0, 1006, (1, 451)).cuda()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "87372dde-2b1a-432b-ab79-0b116124c724",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "z = torch.rand((1, 36 * 40, 128)).cuda()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cf7ca9bf-cafa-4128-9db7-046c16933a52",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "network = network.cuda()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "dfceaa5f-9ad8-4d33-addb-c56e8da48356",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "network.decode(z, t).shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9105fbbb-4363-4d3e-a01e-bc519c3b9c3a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "decoder = decoder.cuda()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c5797ec4-7a6a-46fd-8adc-265df44d0341",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "decoder(z, t).shape"
    ]
   },
   {
@@ -368,11 +358,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": null,
    "id": "a6fae1fa-492d-4648-80fd-1c0dac659b02",
-   "metadata": {
-    "tags": []
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "datamodule = instantiate(cfg.datamodule, mapping=mapping)"
@@ -380,19 +368,10 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": null,
    "id": "514053ef-fcac-4f3c-a7c8-72c6927d6798",
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "2021-08-03 15:28:22.541 | INFO     | text_recognizer.data.iam_paragraphs:setup:95 - Loading IAM paragraph regions and lines for None...\n",
-      "2021-08-03 15:28:45.280 | INFO     | text_recognizer.data.iam_synthetic_paragraphs:setup:68 - IAM Synthetic dataset steup for stage None...\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "datamodule.prepare_data()\n",
     "datamodule.setup()"
@@ -400,21 +379,10 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": null,
    "id": "4bad950b-a197-4c60-ad89-903124659a98",
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "4992"
-      ]
-     },
-     "execution_count": 11,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "len(datamodule.train_dataloader())"
    ]
@@ -431,7 +399,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": null,
    "id": "f6e01c15-9a1b-4036-87ae-78716c592264",
    "metadata": {},
    "outputs": [],
@@ -441,7 +409,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": null,
    "id": "4dc475fc-31f4-487e-88c8-b0f445131f5b",
    "metadata": {},
    "outputs": [],
@@ -451,7 +419,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": null,
    "id": "c5c8ed64-d98c-47b5-baf2-1ba57a6c882f",
    "metadata": {},
    "outputs": [],
@@ -461,11 +429,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": null,
    "id": "b5ff5b24-f804-402b-a8ab-f366443025ca",
-   "metadata": {
-    "tags": []
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "    model = hydra.utils.instantiate(\n",
@@ -481,21 +447,10 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": null,
    "id": "99f8a39f-8b10-4f7d-8bff-52794fd48717",
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "<bound method WordPieceMapping.get_index of <text_recognizer.data.word_piece_mapping.WordPieceMapping object at 0x7fae3b489610>>"
-      ]
-     },
-     "execution_count": 11,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "mapping.get_index"
    ]
@@ -514,9 +469,7 @@
    "cell_type": "code",
    "execution_count": null,
    "id": "8f0742ad-5e2f-42d5-83e7-6e46398b4f0f",
-   "metadata": {
-    "tags": []
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "net"
diff --git a/text_recognizer/models/vqvae.py b/text_recognizer/models/vqvae.py
index 22da018..5890fd9 100644
--- a/text_recognizer/models/vqvae.py
+++ b/text_recognizer/models/vqvae.py
@@ -14,31 +14,33 @@ from text_recognizer.models.base import BaseLitModel
 class VQVAELitModel(BaseLitModel):
     """A PyTorch Lightning model for transformer networks."""
 
+    latent_loss_weight: float = attr.ib(default=0.25)
+
     def forward(self, data: Tensor) -> Tensor:
         """Forward pass with the transformer network."""
-        return self.network.predict(data)
+        return self.network(data)
 
     def training_step(self, batch: Tuple[Tensor, Tensor], batch_idx: int) -> Tensor:
         """Training step."""
         data, _ = batch
-        reconstructions, vq_loss = self.network(data)
+        reconstructions, vq_loss = self(data)
         loss = self.loss_fn(reconstructions, data)
-        loss += vq_loss
+        loss += self.latent_loss_weight * vq_loss
         self.log("train/loss", loss)
         return loss
 
     def validation_step(self, batch: Tuple[Tensor, Tensor], batch_idx: int) -> None:
         """Validation step."""
         data, _ = batch
-        reconstructions, vq_loss = self.network(data)
+        reconstructions, vq_loss = self(data)
         loss = self.loss_fn(reconstructions, data)
-        loss += vq_loss
+        loss += self.latent_loss_weight * vq_loss
         self.log("val/loss", loss, prog_bar=True)
 
     def test_step(self, batch: Tuple[Tensor, Tensor], batch_idx: int) -> None:
         """Test step."""
         data, _ = batch
-        reconstructions, vq_loss = self.network(data)
+        reconstructions, vq_loss = self(data)
         loss = self.loss_fn(reconstructions, data)
-        loss += vq_loss
+        loss += self.latent_loss_weight * vq_loss
         self.log("test/loss", loss)
diff --git a/text_recognizer/networks/vq_transformer.py b/text_recognizer/networks/vq_transformer.py
new file mode 100644
index 0000000..a972565
--- /dev/null
+++ b/text_recognizer/networks/vq_transformer.py
@@ -0,0 +1,77 @@
+"""Vector quantized encoder, transformer decoder."""
+import math
+from typing import Tuple
+
+from torch import nn, Tensor
+
+from text_recognizer.networks.encoders.efficientnet import EfficientNet
+from text_recognizer.networks.conv_transformer import ConvTransformer
+from text_recognizer.networks.transformer.layers import Decoder
+from text_recognizer.networks.transformer.positional_encodings import (
+    PositionalEncoding,
+    PositionalEncoding2D,
+)
+
+
+class VqTransformer(ConvTransformer):
+    """Convolutional encoder and transformer decoder network."""
+
+    def __init__(
+        self,
+        input_dims: Tuple[int, int, int],
+        hidden_dim: int,
+        dropout_rate: float,
+        num_classes: int,
+        pad_index: Tensor,
+        encoder: EfficientNet,
+        decoder: Decoder,
+    ) -> None:
+        # TODO: Load pretrained vqvae encoder.
+        super().__init__(
+            input_dims=input_dims,
+            hidden_dim=hidden_dim,
+            dropout_rate=dropout_rate,
+            num_classes=num_classes,
+            pad_index=pad_index,
+            encoder=encoder,
+            decoder=decoder,
+        )
+        # Latent projector for down sampling number of filters and 2d
+        # positional encoding.
+        self.latent_encoder = nn.Sequential(
+            nn.Conv2d(
+                in_channels=self.encoder.out_channels,
+                out_channels=self.hidden_dim,
+                kernel_size=1,
+            ),
+            PositionalEncoding2D(
+                hidden_dim=self.hidden_dim,
+                max_h=self.input_dims[1],
+                max_w=self.input_dims[2],
+            ),
+            nn.Flatten(start_dim=2),
+        )
+
+    def encode(self, x: Tensor) -> Tensor:
+        """Encodes an image into a latent feature vector.
+
+        Args:
+            x (Tensor): Image tensor.
+
+        Shape:
+            - x: :math: `(B, C, H, W)`
+            - z: :math: `(B, Sx, E)`
+
+            where Sx is the length of the flattened feature maps projected from
+            the encoder, and E is the latent dimension of each pixel in the
+            projected feature maps.
+
+        Returns:
+            Tensor: A Latent embedding of the image.
+        """
+        z = self.encoder(x)
+        z = self.latent_encoder(z)
+
+        # Permute tensor from [B, E, Ho * Wo] to [B, Sx, E]
+        z = z.permute(0, 2, 1)
+        return z
diff --git a/text_recognizer/networks/vqvae/__init__.py b/text_recognizer/networks/vqvae/__init__.py
index 763953c..7d56bdb 100644
--- a/text_recognizer/networks/vqvae/__init__.py
+++ b/text_recognizer/networks/vqvae/__init__.py
@@ -1,5 +1,2 @@
 """VQ-VAE module."""
-from .decoder import Decoder
-from .encoder import Encoder
-from .vector_quantizer import VectorQuantizer
 from .vqvae import VQVAE
diff --git a/text_recognizer/networks/vqvae/decoder.py b/text_recognizer/networks/vqvae/decoder.py
index 32de912..3f59f0d 100644
--- a/text_recognizer/networks/vqvae/decoder.py
+++ b/text_recognizer/networks/vqvae/decoder.py
@@ -1,133 +1,65 @@
 """CNN decoder for the VQ-VAE."""
-
-from typing import List, Optional, Tuple, Type
-
-import torch
+import attr
 from torch import nn
 from torch import Tensor
 
 from text_recognizer.networks.util import activation_function
-from text_recognizer.networks.vqvae.encoder import _ResidualBlock
+from text_recognizer.networks.vqvae.residual import Residual
 
 
+@attr.s(eq=False)
 class Decoder(nn.Module):
     """A CNN encoder network."""
 
-    def __init__(
-        self,
-        channels: List[int],
-        kernel_sizes: List[int],
-        strides: List[int],
-        num_residual_layers: int,
-        embedding_dim: int,
-        upsampling: Optional[List[List[int]]] = None,
-        activation: str = "leaky_relu",
-        dropout_rate: float = 0.0,
-    ) -> None:
-        super().__init__()
-
-        if dropout_rate:
-            if activation == "selu":
-                dropout = nn.AlphaDropout(p=dropout_rate)
-            else:
-                dropout = nn.Dropout(p=dropout_rate)
-        else:
-            dropout = None
-
-        self.upsampling = upsampling
-
-        self.res_block = nn.ModuleList([])
-        self.upsampling_block = nn.ModuleList([])
-
-        self.embedding_dim = embedding_dim
-        activation = activation_function(activation)
-
-        # Configure encoder.
-        self.decoder = self._build_decoder(
-            channels, kernel_sizes, strides, num_residual_layers, activation, dropout,
-        )
-
-    def _build_decompression_block(
-        self,
-        in_channels: int,
-        channels: int,
-        kernel_sizes: List[int],
-        strides: List[int],
-        activation: Type[nn.Module],
-        dropout: Optional[Type[nn.Module]],
-    ) -> nn.ModuleList:
-        modules = nn.ModuleList([])
-        configuration = zip(channels, kernel_sizes, strides)
-        for i, (out_channels, kernel_size, stride) in enumerate(configuration):
-            modules.append(
-                nn.Sequential(
-                    nn.ConvTranspose2d(
-                        in_channels,
-                        out_channels,
-                        kernel_size,
-                        stride=stride,
-                        padding=1,
-                    ),
-                    activation,
-                )
-            )
-
-            if self.upsampling and i < len(self.upsampling):
-                modules.append(nn.Upsample(size=self.upsampling[i]),)
-
-            if dropout is not None:
-                modules.append(dropout)
-
-            in_channels = out_channels
-
-        modules.extend(
-            nn.Sequential(
-                nn.ConvTranspose2d(
-                    in_channels, 1, kernel_size=kernel_size, stride=stride, padding=1
-                ),
-                nn.Tanh(),
-            )
-        )
-
-        return modules
+    in_channels: int = attr.ib()
+    embedding_dim: int = attr.ib()
+    out_channels: int = attr.ib()
+    res_channels: int = attr.ib()
+    num_residual_layers: int = attr.ib()
+    activation: str = attr.ib()
+    decoder: nn.Sequential = attr.ib(init=False)
 
-    def _build_decoder(
-        self,
-        channels: int,
-        kernel_sizes: List[int],
-        strides: List[int],
-        num_residual_layers: int,
-        activation: Type[nn.Module],
-        dropout: Optional[Type[nn.Module]],
-    ) -> nn.Sequential:
+    def __attrs_pre_init__(self) -> None:
+        super().__init__()
 
-        self.res_block.append(
-            nn.Conv2d(self.embedding_dim, channels[0], kernel_size=1, stride=1,)
-        )
+    def __attrs_post_init__(self) -> None:
+        """Post init configuration."""
+        self.decoder = self._build_decompression_block()
 
-        # Bottleneck module.
-        self.res_block.extend(
-            nn.ModuleList(
-                [
-                    _ResidualBlock(channels[0], channels[0], dropout)
-                    for i in range(num_residual_layers)
-                ]
+    def _build_decompression_block(self,) -> nn.Sequential:
+        activation_fn = activation_function(self.activation)
+        blocks = [
+            nn.Conv2d(
+                in_channels=self.in_channels,
+                out_channels=self.embedding_dim,
+                kernel_size=3,
+                padding=1,
             )
-        )
-
-        # Decompression module
-        self.upsampling_block.extend(
-            self._build_decompression_block(
-                channels[0], channels[1:], kernel_sizes, strides, activation, dropout
+        ]
+        for _ in range(self.num_residual_layers):
+            blocks.append(
+                Residual(in_channels=self.embedding_dim, out_channels=self.res_channels)
             )
-        )
-
-        self.res_block = nn.Sequential(*self.res_block)
-        self.upsampling_block = nn.Sequential(*self.upsampling_block)
-
-        return nn.Sequential(self.res_block, self.upsampling_block)
+        blocks.append(activation_fn)
+        blocks += [
+            nn.ConvTranspose2d(
+                in_channels=self.embedding_dim,
+                out_channels=self.embedding_dim // 2,
+                kernel_size=4,
+                stride=2,
+                padding=1,
+            ),
+            activation_fn,
+            nn.ConvTranspose2d(
+                in_channels=self.embedding_dim // 2,
+                out_channels=self.out_channels,
+                kernel_size=4,
+                stride=2,
+                padding=1,
+            ),
+        ]
+        return nn.Sequential(*blocks)
 
     def forward(self, z_q: Tensor) -> Tensor:
         """Reconstruct input from given codes."""
-        x_reconstruction = self.decoder(z_q)
-        return x_reconstruction
+        return self.decoder(z_q)
diff --git a/text_recognizer/networks/vqvae/encoder.py b/text_recognizer/networks/vqvae/encoder.py
index 65801df..e480545 100644
--- a/text_recognizer/networks/vqvae/encoder.py
+++ b/text_recognizer/networks/vqvae/encoder.py
@@ -1,147 +1,75 @@
 """CNN encoder for the VQ-VAE."""
 from typing import Sequence, Optional, Tuple, Type
 
-import torch
+import attr
 from torch import nn
 from torch import Tensor
 
 from text_recognizer.networks.util import activation_function
-from text_recognizer.networks.vqvae.vector_quantizer import VectorQuantizer
-
-
-class _ResidualBlock(nn.Module):
-    def __init__(
-        self, in_channels: int, out_channels: int, dropout: Optional[Type[nn.Module]],
-    ) -> None:
-        super().__init__()
-        self.block = [
-            nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1, bias=False),
-            nn.ReLU(inplace=True),
-            nn.Conv2d(out_channels, out_channels, kernel_size=1, bias=False),
-        ]
-
-        if dropout is not None:
-            self.block.append(dropout)
-
-        self.block = nn.Sequential(*self.block)
-
-    def forward(self, x: Tensor) -> Tensor:
-        """Apply the residual forward pass."""
-        return x + self.block(x)
+from text_recognizer.networks.vqvae.residual import Residual
 
 
+@attr.s(eq=False)
 class Encoder(nn.Module):
     """A CNN encoder network."""
 
-    def __init__(
-        self,
-        in_channels: int,
-        channels: Sequence[int],
-        kernel_sizes: Sequence[int],
-        strides: Sequence[int],
-        num_residual_layers: int,
-        embedding_dim: int,
-        num_embeddings: int,
-        beta: float = 0.25,
-        activation: str = "leaky_relu",
-        dropout_rate: float = 0.0,
-    ) -> None:
-        super().__init__()
-
-        if dropout_rate:
-            if activation == "selu":
-                dropout = nn.AlphaDropout(p=dropout_rate)
-            else:
-                dropout = nn.Dropout(p=dropout_rate)
-        else:
-            dropout = None
-
-        self.embedding_dim = embedding_dim
-        self.num_embeddings = num_embeddings
-        self.beta = beta
-        activation = activation_function(activation)
-
-        # Configure encoder.
-        self.encoder = self._build_encoder(
-            in_channels,
-            channels,
-            kernel_sizes,
-            strides,
-            num_residual_layers,
-            activation,
-            dropout,
-        )
-
-        # Configure Vector Quantizer.
-        self.vector_quantizer = VectorQuantizer(
-            self.num_embeddings, self.embedding_dim, self.beta
-        )
-
-    @staticmethod
-    def _build_compression_block(
-        in_channels: int,
-        channels: int,
-        kernel_sizes: Sequence[int],
-        strides: Sequence[int],
-        activation: Type[nn.Module],
-        dropout: Optional[Type[nn.Module]],
-    ) -> nn.ModuleList:
-        modules = nn.ModuleList([])
-        configuration = zip(channels, kernel_sizes, strides)
-        for out_channels, kernel_size, stride in configuration:
-            modules.append(
-                nn.Sequential(
-                    nn.Conv2d(
-                        in_channels, out_channels, kernel_size, stride=stride, padding=1
-                    ),
-                    activation,
-                )
-            )
-
-            if dropout is not None:
-                modules.append(dropout)
+    in_channels: int = attr.ib()
+    out_channels: int = attr.ib()
+    res_channels: int = attr.ib()
+    num_residual_layers: int = attr.ib()
+    embedding_dim: int = attr.ib()
+    activation: str = attr.ib()
+    encoder: nn.Sequential = attr.ib(init=False)
 
-            in_channels = out_channels
+    def __attrs_pre_init__(self) -> None:
+        super().__init__()
 
-        return modules
+    def __attrs_post_init__(self) -> None:
+        """Post init configuration."""
+        self.encoder = self._build_compression_block()
 
-    def _build_encoder(
-        self,
-        in_channels: int,
-        channels: int,
-        kernel_sizes: Sequence[int],
-        strides: Sequence[int],
-        num_residual_layers: int,
-        activation: Type[nn.Module],
-        dropout: Optional[Type[nn.Module]],
-    ) -> nn.Sequential:
-        encoder = nn.ModuleList([])
+    def _build_compression_block(self) -> nn.Sequential:
+        activation_fn = activation_function(self.activation)
+        block = [
+            nn.Conv2d(
+                in_channels=self.in_channels,
+                out_channels=self.out_channels // 2,
+                kernel_size=4,
+                stride=2,
+                padding=1,
+            ),
+            activation_fn,
+            nn.Conv2d(
+                in_channels=self.out_channels // 2,
+                out_channels=self.out_channels,
+                kernel_size=4,
+                stride=2,
+                padding=1,
+            ),
+            activation_fn,
+            nn.Conv2d(
+                in_channels=self.out_channels,
+                out_channels=self.out_channels,
+                kernel_size=3,
+                padding=1,
+            ),
+        ]
 
-        # compression module
-        encoder.extend(
-            self._build_compression_block(
-                in_channels, channels, kernel_sizes, strides, activation, dropout
+        for _ in range(self.num_residual_layers):
+            block.append(
+                Residual(in_channels=self.out_channels, out_channels=self.res_channels)
             )
-        )
 
-        # Bottleneck module.
-        encoder.extend(
-            nn.ModuleList(
-                [
-                    _ResidualBlock(channels[-1], channels[-1], dropout)
-                    for i in range(num_residual_layers)
-                ]
+        block.append(
+            nn.Conv2d(
+                in_channels=self.out_channels,
+                out_channels=self.embedding_dim,
+                kernel_size=1,
             )
         )
 
-        encoder.append(
-            nn.Conv2d(channels[-1], self.embedding_dim, kernel_size=1, stride=1,)
-        )
-
-        return nn.Sequential(*encoder)
+        return nn.Sequential(*block)
 
     def forward(self, x: Tensor) -> Tuple[Tensor, Tensor]:
         """Encodes input into a discrete representation."""
-        z_e = self.encoder(x)
-        z_q, vq_loss = self.vector_quantizer(z_e)
-        return z_q, vq_loss
+        return self.encoder(x)
diff --git a/text_recognizer/networks/vqvae/vector_quantizer.py b/text_recognizer/networks/vqvae/quantizer.py
index f92c7ee..5e0b602 100644
--- a/text_recognizer/networks/vqvae/vector_quantizer.py
+++ b/text_recognizer/networks/vqvae/quantizer.py
@@ -2,9 +2,7 @@
 
 Reference:
 https://github.com/AntixK/PyTorch-VAE/blob/master/models/vq_vae.py
-
 """
-
 from einops import rearrange
 import torch
 from torch import nn
@@ -12,21 +10,27 @@ from torch import Tensor
 from torch.nn import functional as F
 
 
+class EmbeddingEMA(nn.Module):
+    def __init__(self, num_embeddings: int, embedding_dim: int) -> None:
+        super().__init__()
+        weight = torch.zeros(num_embeddings, embedding_dim)
+        nn.init.kaiming_uniform_(weight, nonlinearity="linear")
+        self.register_buffer("weight", weight)  # the codebook vectors
+        self.register_buffer("_cluster_size", torch.zeros(num_embeddings))
+        self.register_buffer("_weight_avg", weight.clone())  # not an alias of weight
+
+
 class VectorQuantizer(nn.Module):
     """The codebook that contains quantized vectors."""
 
     def __init__(
-        self, num_embeddings: int, embedding_dim: int, beta: float = 0.25
+        self, num_embeddings: int, embedding_dim: int, decay: float = 0.99
     ) -> None:
         super().__init__()
-        self.K = num_embeddings
-        self.D = embedding_dim
-        self.beta = beta
-
-        self.embedding = nn.Embedding(self.K, self.D)
-
-        # Initialize the codebook.
-        nn.init.uniform_(self.embedding.weight, -1 / self.K, 1 / self.K)
+        self.num_embeddings = num_embeddings
+        self.embedding_dim = embedding_dim
+        self.decay = decay
+        self.embedding = EmbeddingEMA(self.num_embeddings, self.embedding_dim)
 
     def discretization_bottleneck(self, latent: Tensor) -> Tensor:
         """Computes the code nearest to the latent representation.
@@ -62,7 +66,7 @@ class VectorQuantizer(nn.Module):
 
         # Convert to one-hot encodings, aka discrete bottleneck.
         one_hot_encoding = torch.zeros(
-            encoding_indices.shape[0], self.K, device=latent.device
+            encoding_indices.shape[0], self.num_embeddings, device=latent.device
         )
         one_hot_encoding.scatter_(1, encoding_indices, 1)  # [BHW x K]
 
@@ -71,9 +75,27 @@ class VectorQuantizer(nn.Module):
         quantized_latent = rearrange(
             quantized_latent, "(b h w) d -> b h w d", b=b, h=h, w=w
         )
+        if self.training:
+            self.compute_ema(one_hot_encoding=one_hot_encoding, latent=latent)
 
         return quantized_latent
 
+    def compute_ema(self, one_hot_encoding: Tensor, latent: Tensor) -> None:
+        """Updates the codebook with an exponential moving average."""
+        batch_cluster_size = one_hot_encoding.sum(axis=0)
+        batch_embedding_avg = (latent.t() @ one_hot_encoding).t()
+        # Decay the running statistics, then blend in this batch's contribution.
+        self.embedding._cluster_size.data.mul_(self.decay).add_(
+            batch_cluster_size, alpha=1 - self.decay
+        )
+        self.embedding._weight_avg.data.mul_(self.decay).add_(
+            batch_embedding_avg, alpha=1 - self.decay
+        )
+        new_embedding = self.embedding._weight_avg / (
+            self.embedding._cluster_size + 1.0e-5
+        ).unsqueeze(1)
+        self.embedding.weight.data.copy_(new_embedding)
+
     def vq_loss(self, latent: Tensor, quantized_latent: Tensor) -> Tensor:
         """Vector Quantization loss.
 
@@ -96,9 +118,10 @@ class VectorQuantizer(nn.Module):
             Tensor: The combinded VQ loss.
 
         """
-        embedding_loss = F.mse_loss(quantized_latent, latent.detach())
         commitment_loss = F.mse_loss(quantized_latent.detach(), latent)
-        return embedding_loss + self.beta * commitment_loss
+        # embedding_loss = F.mse_loss(quantized_latent, latent.detach())
+        # return embedding_loss + self.beta * commitment_loss
+        return commitment_loss
 
     def forward(self, latent: Tensor) -> Tensor:
         """Forward pass that returns the quantized vector and the vq loss."""
diff --git a/text_recognizer/networks/vqvae/residual.py b/text_recognizer/networks/vqvae/residual.py
new file mode 100644
index 0000000..98109b8
--- /dev/null
+++ b/text_recognizer/networks/vqvae/residual.py
@@ -0,0 +1,18 @@
+"""Residual block."""
+from torch import nn
+from torch import Tensor
+
+
+class Residual(nn.Module):
+    def __init__(self, in_channels: int, out_channels: int,) -> None:
+        super().__init__()
+        self.block = nn.Sequential(
+            nn.Mish(),  # not in-place: forward() still needs the unmodified x
+            nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1, bias=False),
+            nn.Mish(inplace=True),
+            nn.Conv2d(out_channels, in_channels, kernel_size=1, bias=False),
+        )
+
+    def forward(self, x: Tensor) -> Tensor:
+        """Apply the residual forward pass."""
+        return x + self.block(x)
diff --git a/text_recognizer/networks/vqvae/vqvae.py b/text_recognizer/networks/vqvae/vqvae.py
index 5aa929b..1585d40 100644
--- a/text_recognizer/networks/vqvae/vqvae.py
+++ b/text_recognizer/networks/vqvae/vqvae.py
@@ -1,10 +1,14 @@
 """The VQ-VAE."""
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Tuple
 
+import torch
 from torch import nn
 from torch import Tensor
+import torch.nn.functional as F
 
-from text_recognizer.networks.vqvae import Decoder, Encoder
+from text_recognizer.networks.vqvae.decoder import Decoder
+from text_recognizer.networks.vqvae.encoder import Encoder
+from text_recognizer.networks.vqvae.quantizer import VectorQuantizer
 
 
 class VQVAE(nn.Module):
@@ -13,62 +17,92 @@ class VQVAE(nn.Module):
     def __init__(
         self,
         in_channels: int,
-        channels: List[int],
-        kernel_sizes: List[int],
-        strides: List[int],
+        res_channels: int,
         num_residual_layers: int,
         embedding_dim: int,
         num_embeddings: int,
-        upsampling: Optional[List[List[int]]] = None,
-        beta: float = 0.25,
-        activation: str = "leaky_relu",
-        dropout_rate: float = 0.0,
-        *args: Any,
-        **kwargs: Dict,
+        decay: float = 0.99,
+        activation: str = "mish",
     ) -> None:
         super().__init__()
+        # Encoders
+        self.btm_encoder = Encoder(
+            in_channels=in_channels,  # honor the constructor argument
+            out_channels=embedding_dim,
+            res_channels=res_channels,
+            num_residual_layers=num_residual_layers,
+            embedding_dim=embedding_dim,
+            activation=activation,
+        )
+
+        self.top_encoder = Encoder(
+            in_channels=embedding_dim,
+            out_channels=embedding_dim,
+            res_channels=res_channels,
+            num_residual_layers=num_residual_layers,
+            embedding_dim=embedding_dim,
+            activation=activation,
+        )
+
+        # Quantizers
+        self.btm_quantizer = VectorQuantizer(
+            num_embeddings=num_embeddings, embedding_dim=embedding_dim, decay=decay,
+        )
 
-        # configure encoder.
-        self.encoder = Encoder(
-            in_channels,
-            channels,
-            kernel_sizes,
-            strides,
-            num_residual_layers,
-            embedding_dim,
-            num_embeddings,
-            beta,
-            activation,
-            dropout_rate,
+        self.top_quantizer = VectorQuantizer(
+            num_embeddings=num_embeddings, embedding_dim=embedding_dim, decay=decay,
         )
 
-        # Configure decoder.
-        channels.reverse()
-        kernel_sizes.reverse()
-        strides.reverse()
-        self.decoder = Decoder(
-            channels,
-            kernel_sizes,
-            strides,
-            num_residual_layers,
-            embedding_dim,
-            upsampling,
-            activation,
-            dropout_rate,
+        # Decoders
+        self.top_decoder = Decoder(
+            in_channels=embedding_dim,
+            out_channels=embedding_dim,
+            embedding_dim=embedding_dim,
+            res_channels=res_channels,
+            num_residual_layers=num_residual_layers,
+            activation=activation,
+        )
+
+        self.btm_decoder = Decoder(
+            in_channels=2 * embedding_dim,
+            out_channels=in_channels,
+            embedding_dim=embedding_dim,
+            res_channels=res_channels,
+            num_residual_layers=num_residual_layers,
+            activation=activation,
         )
 
     def encode(self, x: Tensor) -> Tuple[Tensor, Tensor]:
         """Encodes input to a latent code."""
-        return self.encoder(x)
+        z_btm = self.btm_encoder(x)
+        z_top = self.top_encoder(z_btm)
+        return z_btm, z_top
+
+    def quantize(
+        self, z_btm: Tensor, z_top: Tensor
+    ) -> Tuple[Tensor, Tensor, Tensor, Tensor]:
+        q_btm, vq_btm_loss = self.btm_quantizer(z_btm)  # bottom codebook
+        q_top, vq_top_loss = self.top_quantizer(z_top)  # top codebook
+        return q_btm, vq_btm_loss, q_top, vq_top_loss
 
-    def decode(self, z_q: Tensor) -> Tensor:
+    def decode(self, q_btm: Tensor, q_top: Tensor) -> Tuple[Tensor, Tensor]:
         """Reconstructs input from latent codes."""
-        return self.decoder(z_q)
+        d_top = self.top_decoder(q_top)
+        x_hat = self.btm_decoder(torch.cat((d_top, q_btm), dim=1))
+        return d_top, x_hat
+
+    def loss_fn(
+        self, vq_btm_loss: Tensor, vq_top_loss: Tensor, d_top: Tensor, z_btm: Tensor
+    ) -> Tensor:
+        """Calculates the latent loss."""
+        return 0.5 * (vq_top_loss + vq_btm_loss) + F.mse_loss(d_top, z_btm)
 
     def forward(self, x: Tensor) -> Tuple[Tensor, Tensor]:
         """Compresses and decompresses input."""
-        if len(x.shape) < 4:
-            x = x[(None,) * (4 - len(x.shape))]
-        z_q, vq_loss = self.encode(x)
-        x_reconstruction = self.decode(z_q)
-        return x_reconstruction, vq_loss
+        z_btm, z_top = self.encode(x)
+        q_btm, vq_btm_loss, q_top, vq_top_loss = self.quantize(z_btm=z_btm, z_top=z_top)
+        d_top, x_hat = self.decode(q_btm=q_btm, q_top=q_top)
+        vq_loss = self.loss_fn(
+            vq_btm_loss=vq_btm_loss, vq_top_loss=vq_top_loss, d_top=d_top, z_btm=z_btm
+        )
+        return x_hat, vq_loss
diff --git a/training/callbacks/wandb_callbacks.py b/training/callbacks/wandb_callbacks.py
index 906531f..c750e4b 100644
--- a/training/callbacks/wandb_callbacks.py
+++ b/training/callbacks/wandb_callbacks.py
@@ -5,6 +5,7 @@ import wandb
 from pytorch_lightning import Callback, LightningModule, Trainer
 from pytorch_lightning.loggers import LoggerCollection, WandbLogger
 from pytorch_lightning.utilities import rank_zero_only
+from torch.utils.data import DataLoader
 
 
 def get_wandb_logger(trainer: Trainer) -> WandbLogger:
@@ -86,7 +87,11 @@ class LogTextPredictions(Callback):
         self.ready = False
 
     def _log_predictions(
-        self, stage: str, trainer: Trainer, pl_module: LightningModule
+        self,
+        stage: str,
+        trainer: Trainer,
+        pl_module: LightningModule,
+        dataloader: DataLoader,
     ) -> None:
         """Logs the predicted text contained in the images."""
         if not self.ready:
@@ -96,22 +101,20 @@ class LogTextPredictions(Callback):
         experiment = logger.experiment
 
         # Get a validation batch from the validation dataloader.
-        samples = next(iter(trainer.datamodule.val_dataloader()))
+        samples = next(iter(dataloader))
         imgs, labels = samples
 
         imgs = imgs.to(device=pl_module.device)
         logits = pl_module(imgs)
 
         mapping = pl_module.mapping
-        columns = ["id", "image", "prediction", "truth"]
+        columns = ["image", "prediction", "truth"]
         data = [
-            [id, wandb.Image(img), mapping.get_text(pred), mapping.get_text(label)]
-            for id, (img, pred, label) in enumerate(
-                zip(
-                    imgs[: self.num_samples],
-                    logits[: self.num_samples],
-                    labels[: self.num_samples],
-                )
+            [wandb.Image(img), mapping.get_text(pred), mapping.get_text(label)]
+            for img, pred, label in zip(
+                imgs[: self.num_samples],
+                logits[: self.num_samples],
+                labels[: self.num_samples],
             )
         ]
 
@@ -133,11 +136,17 @@ class LogTextPredictions(Callback):
         self, trainer: Trainer, pl_module: LightningModule
     ) -> None:
         """Logs predictions on validation epoch end."""
-        self._log_predictions(stage="val", trainer=trainer, pl_module=pl_module)
+        dataloader = trainer.datamodule.val_dataloader()
+        self._log_predictions(
+            stage="val", trainer=trainer, pl_module=pl_module, dataloader=dataloader
+        )
 
     def on_test_epoch_end(self, trainer: Trainer, pl_module: LightningModule) -> None:
         """Logs predictions on train epoch end."""
-        self._log_predictions(stage="test", trainer=trainer, pl_module=pl_module)
+        dataloader = trainer.datamodule.test_dataloader()
+        self._log_predictions(
+            stage="test", trainer=trainer, pl_module=pl_module, dataloader=dataloader
+        )
 
 
 class LogReconstuctedImages(Callback):
@@ -148,7 +157,11 @@ class LogReconstuctedImages(Callback):
         self.ready = False
 
     def _log_reconstruction(
-        self, stage: str, trainer: Trainer, pl_module: LightningModule
+        self,
+        stage: str,
+        trainer: Trainer,
+        pl_module: LightningModule,
+        dataloader: DataLoader,
     ) -> None:
         """Logs the reconstructions."""
         if not self.ready:
@@ -158,20 +171,24 @@ class LogReconstuctedImages(Callback):
         experiment = logger.experiment
 
         # Get a validation batch from the validation dataloader.
-        samples = next(iter(trainer.datamodule.val_dataloader()))
+        samples = next(iter(dataloader))
         imgs, _ = samples
 
+        colums = ["input", "reconstruction"]
         imgs = imgs.to(device=pl_module.device)
-        reconstructions = pl_module(imgs)
+        reconstructions = pl_module(imgs)[0]
+        data = [
+            [wandb.Image(img), wandb.Image(rec)]
+            for img, rec in zip(
+                imgs[: self.num_samples], reconstructions[: self.num_samples]
+            )
+        ]
 
         experiment.log(
             {
-                f"Reconstructions/{experiment.name}/{stage}": [
-                    [wandb.Image(img), wandb.Image(rec),]
-                    for img, rec in zip(
-                        imgs[: self.num_samples], reconstructions[: self.num_samples],
-                    )
-                ]
+                f"Reconstructions/{experiment.name}/{stage}": wandb.Table(
+                    data=data, columns=colums
+                )
             }
         )
 
@@ -189,8 +206,14 @@ class LogReconstuctedImages(Callback):
         self, trainer: Trainer, pl_module: LightningModule
     ) -> None:
         """Logs predictions on validation epoch end."""
-        self._log_reconstruction(stage="val", trainer=trainer, pl_module=pl_module)
+        dataloader = trainer.datamodule.val_dataloader()
+        self._log_reconstruction(
+            stage="val", trainer=trainer, pl_module=pl_module, dataloader=dataloader
+        )
 
     def on_test_epoch_end(self, trainer: Trainer, pl_module: LightningModule) -> None:
         """Logs predictions on train epoch end."""
-        self._log_reconstruction(stage="test", trainer=trainer, pl_module=pl_module)
+        dataloader = trainer.datamodule.test_dataloader()
+        self._log_reconstruction(
+            stage="test", trainer=trainer, pl_module=pl_module, dataloader=dataloader
+        )
diff --git a/training/conf/callbacks/wandb_image_reconstructions.yaml b/training/conf/callbacks/wandb_image_reconstructions.yaml
index e69de29..6cc4ada 100644
--- a/training/conf/callbacks/wandb_image_reconstructions.yaml
+++ b/training/conf/callbacks/wandb_image_reconstructions.yaml
@@ -0,0 +1,3 @@
+log_image_reconstruction:
+  _target_: callbacks.wandb_callbacks.LogReconstuctedImages
+  num_samples: 8
diff --git a/training/conf/callbacks/wandb_vae.yaml b/training/conf/callbacks/wandb_vae.yaml
new file mode 100644
index 0000000..609a8e8
--- /dev/null
+++ b/training/conf/callbacks/wandb_vae.yaml
@@ -0,0 +1,6 @@
+defaults:
+  - default
+  - wandb_watch
+  - wandb_code
+  - wandb_checkpoints
+  - wandb_image_reconstructions
diff --git a/training/conf/config.yaml b/training/conf/config.yaml
index 782bcbb..6b74502 100644
--- a/training/conf/config.yaml
+++ b/training/conf/config.yaml
@@ -1,3 +1,5 @@
+# @package _global_
+
 defaults:
   - callbacks: wandb_ocr
   - criterion: label_smoothing
diff --git a/training/conf/experiment/vqvae.yaml b/training/conf/experiment/vqvae.yaml
new file mode 100644
index 0000000..13e5f34
--- /dev/null
+++ b/training/conf/experiment/vqvae.yaml
@@ -0,0 +1,20 @@
+# @package _global_
+
+defaults:
+  - override /network: vqvae
+  - override /criterion: mse
+  - override /model: lit_vqvae
+  - override /callbacks: wandb_vae
+
+trainer:
+  max_epochs: 64
+
+datamodule:
+  batch_size: 32
+
+lr_scheduler:
+  epochs: 64
+  steps_per_epoch: 624
+
+optimizer:
+  lr: 1.0e-2
diff --git a/training/conf/experiment/vqvae_experiment.yaml b/training/conf/experiment/vqvae_experiment.yaml
deleted file mode 100644
index 0858c3d..0000000
--- a/training/conf/experiment/vqvae_experiment.yaml
+++ /dev/null
@@ -1,13 +0,0 @@
-defaults:
-  - override /network: vqvae
-  - override /criterion: mse
-  - override /optimizer: madgrad
-  - override /lr_scheduler: one_cycle
-  - override /model: lit_vqvae
-  - override /dataset: iam_extended_paragraphs
-  - override /trainer: default
-  - override /callbacks:
-    - wandb
-
-load_checkpoint: null 
-logging: INFO
diff --git a/training/conf/model/lit_vqvae.yaml b/training/conf/model/lit_vqvae.yaml
index b337fe6..8837573 100644
--- a/training/conf/model/lit_vqvae.yaml
+++ b/training/conf/model/lit_vqvae.yaml
@@ -1,2 +1,4 @@
 _target_: text_recognizer.models.vqvae.VQVAELitModel
-mapping: sentence_piece
+interval: step
+monitor: val/loss
+latent_loss_weight: 0.25
diff --git a/training/conf/network/conv_transformer.yaml b/training/conf/network/conv_transformer.yaml
index f76e892..d3a3b0f 100644
--- a/training/conf/network/conv_transformer.yaml
+++ b/training/conf/network/conv_transformer.yaml
@@ -4,7 +4,7 @@ defaults:
 
 _target_: text_recognizer.networks.conv_transformer.ConvTransformer
 input_dims: [1, 576, 640]
-hidden_dim: 96
+hidden_dim: 128
 dropout_rate: 0.2
 num_classes: 1006
 pad_index: 1002
diff --git a/training/conf/network/decoder/transformer_decoder.yaml b/training/conf/network/decoder/transformer_decoder.yaml
index eb80f64..c326c04 100644
--- a/training/conf/network/decoder/transformer_decoder.yaml
+++ b/training/conf/network/decoder/transformer_decoder.yaml
@@ -2,12 +2,12 @@ defaults:
   - rotary_emb: null
 
 _target_: text_recognizer.networks.transformer.Decoder
-dim: 96
+dim: 128
 depth: 2
 num_heads: 8
 attn_fn: text_recognizer.networks.transformer.attention.Attention
 attn_kwargs:
-  dim_head: 16
+  dim_head: 64
   dropout_rate: 0.2
 norm_fn: torch.nn.LayerNorm
 ff_fn: text_recognizer.networks.transformer.mlp.FeedForward
diff --git a/training/conf/network/vqvae.yaml b/training/conf/network/vqvae.yaml
index 22eebf8..5a5c066 100644
--- a/training/conf/network/vqvae.yaml
+++ b/training/conf/network/vqvae.yaml
@@ -1,13 +1,8 @@
-type: VQVAE
-args:
-  in_channels: 1
-  channels: [64, 96]
-  kernel_sizes: [4, 4]
-  strides: [2, 2]
-  num_residual_layers: 2 
-  embedding_dim: 64
-  num_embeddings: 256
-  upsampling: null
-  beta: 0.25
-  activation: leaky_relu
-  dropout_rate: 0.2
+_target_: text_recognizer.networks.vqvae.VQVAE
+in_channels: 1
+res_channels: 32
+num_residual_layers: 2
+embedding_dim: 64
+num_embeddings: 512
+decay: 0.99
+activation: mish