summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--README.md2
-rw-r--r--notebooks/05c-test-model-end-to-end.ipynb59
-rw-r--r--text_recognizer/criterions/n_layer_discriminator.py58
-rw-r--r--text_recognizer/criterions/vqgan_loss.py85
-rw-r--r--text_recognizer/data/emnist_mapping.py6
-rw-r--r--text_recognizer/models/base.py81
-rw-r--r--text_recognizer/models/transformer.py4
-rw-r--r--text_recognizer/models/vqgan.py135
-rw-r--r--text_recognizer/models/vqvae.py12
-rw-r--r--text_recognizer/networks/vqvae/decoder.py13
-rw-r--r--text_recognizer/networks/vqvae/encoder.py11
-rw-r--r--text_recognizer/networks/vqvae/norm.py6
-rw-r--r--text_recognizer/networks/vqvae/pixelcnn.py12
-rw-r--r--text_recognizer/networks/vqvae/residual.py8
-rw-r--r--text_recognizer/networks/vqvae/vqvae.py1
-rw-r--r--training/conf/config.yaml12
-rw-r--r--training/conf/criterion/mae.yaml2
-rw-r--r--training/conf/criterion/vqgan_loss.yaml12
-rw-r--r--training/conf/experiment/vqgan.yaml55
-rw-r--r--training/conf/experiment/vqvae.yaml11
-rw-r--r--training/conf/experiment/vqvae_pixelcnn.yaml24
-rw-r--r--training/conf/lr_scheduler/cosine_annealing.yaml13
-rw-r--r--training/conf/lr_scheduler/one_cycle.yaml37
-rw-r--r--training/conf/network/decoder/pixelcnn_decoder.yaml (renamed from training/conf/network/encoder/pixelcnn_decoder.yaml)2
-rw-r--r--training/conf/network/decoder/vae_decoder.yaml2
-rw-r--r--training/conf/network/encoder/pixelcnn_encoder.yaml (renamed from training/conf/network/decoder/pixelcnn_encoder.yaml)2
-rw-r--r--training/conf/network/encoder/vae_encoder.yaml2
-rw-r--r--training/conf/network/vqvae.yaml2
-rw-r--r--training/conf/network/vqvae_pixelcnn.yaml2
-rw-r--r--training/conf/optimizer/madgrad.yaml13
-rw-r--r--training/run.py4
31 files changed, 575 insertions, 113 deletions
diff --git a/README.md b/README.md
index a304c84..45314a4 100644
--- a/README.md
+++ b/README.md
@@ -30,6 +30,8 @@ python build-transitions --tokens iamdb_1kwp_tokens_1000.txt --lexicon iamdb_1kw
- [x] Efficient-net b0 + transformer decoder
- [x] Load everything with hydra, get it to work
- [x] Train network
+- [ ] Weight init
+- [ ] patchgan loss
- [ ] Get VQVAE2 to work and not get loss NAN
- [ ] Local attention for target sequence
- [ ] Rotary embedding for target sequence
diff --git a/notebooks/05c-test-model-end-to-end.ipynb b/notebooks/05c-test-model-end-to-end.ipynb
index b26a1fe..42621da 100644
--- a/notebooks/05c-test-model-end-to-end.ipynb
+++ b/notebooks/05c-test-model-end-to-end.ipynb
@@ -2,19 +2,10 @@
"cells": [
{
"cell_type": "code",
- "execution_count": 4,
+ "execution_count": 1,
"id": "1e40a88b",
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "The autoreload extension is already loaded. To reload it, use:\n",
- " %reload_ext autoreload\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"%load_ext autoreload\n",
"%autoreload 2\n",
@@ -34,6 +25,52 @@
},
{
"cell_type": "code",
+ "execution_count": 2,
+ "id": "f40fc669-829c-4de8-83ed-475fc6a0b8c1",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "class T:\n",
+ " def __init__(self):\n",
+ " self.network = nn.Linear(1, 1)\n",
+ " \n",
+ " def get(self):\n",
+ " return getattr(self, \"network\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "d2bedf96-5388-4c7a-a048-1b97041cbedc",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "t = T()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "a6fbe3be-2a9f-4050-a397-7ad982d6cd05",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "<generator object Module.parameters at 0x7f29ad6d6120>"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "t.get().parameters()"
+ ]
+ },
+ {
+ "cell_type": "code",
"execution_count": 5,
"id": "38fb3d9d-a163-4b72-981f-f31b51be39f2",
"metadata": {},
diff --git a/text_recognizer/criterions/n_layer_discriminator.py b/text_recognizer/criterions/n_layer_discriminator.py
new file mode 100644
index 0000000..e5f8449
--- /dev/null
+++ b/text_recognizer/criterions/n_layer_discriminator.py
@@ -0,0 +1,58 @@
+"""Pix2pix discriminator loss."""
+from torch import nn, Tensor
+
+from text_recognizer.networks.vqvae.norm import Normalize
+
+
+class NLayerDiscriminator(nn.Module):
+    """PatchGAN-style discriminator network, as used in Pix2Pix.
+
+    The network is an input convolution, followed by ``num_layers - 1``
+    normalized stride-2 convolutions, followed by a single-channel output
+    convolution.
+
+    Args:
+        in_channels: Number of channels of the input tensor.
+        num_channels: Number of output channels of the first convolution.
+        num_layers: Number of stride-2 convolutions before the output
+            convolution (the input convolution counts as the first one).
+    """
+
+    def __init__(
+        self, in_channels: int = 1, num_channels: int = 32, num_layers: int = 3
+    ) -> None:
+        super().__init__()
+        self.in_channels = in_channels
+        self.num_channels = num_channels
+        self.num_layers = num_layers
+        self.discriminator = self._build_discriminator()
+
+    def _build_discriminator(self) -> nn.Sequential:
+        """Builds the discriminator as a sequential stack of layers."""
+        discriminator = [
+            nn.Conv2d(
+                in_channels=self.in_channels,
+                out_channels=self.num_channels,
+                kernel_size=4,
+                stride=2,
+                padding=1,
+            ),
+            nn.Mish(inplace=True),
+        ]
+        in_channels = self.num_channels
+        for n in range(1, self.num_layers):
+            # Layer n widens the previous layer's output by a factor of n.
+            out_channels = in_channels * n
+            discriminator += [
+                nn.Conv2d(
+                    in_channels=in_channels,
+                    out_channels=out_channels,
+                    kernel_size=4,
+                    stride=2,
+                    padding=1,
+                ),
+                Normalize(num_channels=out_channels),
+                nn.Mish(inplace=True),
+            ]
+            in_channels = out_channels
+
+        # The output convolution must consume the width actually produced by
+        # the last layer. Deriving it as num_channels * (num_layers - 1) only
+        # matches the loop above when num_layers is 2 or 3.
+        discriminator += [
+            nn.Conv2d(
+                in_channels=in_channels,
+                out_channels=1,
+                kernel_size=4,
+                padding=1,
+            )
+        ]
+        return nn.Sequential(*discriminator)
+
+    def forward(self, x: Tensor) -> Tensor:
+        """Forward pass through discriminator."""
+        return self.discriminator(x)
diff --git a/text_recognizer/criterions/vqgan_loss.py b/text_recognizer/criterions/vqgan_loss.py
new file mode 100644
index 0000000..8bb568f
--- /dev/null
+++ b/text_recognizer/criterions/vqgan_loss.py
@@ -0,0 +1,85 @@
+"""VQGAN loss for PyTorch Lightning."""
+from typing import Dict, Tuple
+
+import torch
+from torch import nn, Tensor
+import torch.nn.functional as F
+
+from text_recognizer.criterions.n_layer_discriminator import NLayerDiscriminator
+
+
+class VQGANLoss(nn.Module):
+    """VQGAN loss: reconstruction loss + weighted GAN loss + weighted VQ loss.
+
+    Args:
+        reconstruction_loss: Module comparing the input data with its
+            reconstruction.
+        discriminator: Discriminator network scoring real and reconstructed
+            inputs.
+        vq_loss_weight: Weight of the vector quantization loss term.
+        discriminator_weight: Weight of the generator/discriminator loss term.
+    """
+
+    def __init__(
+        self,
+        reconstruction_loss: nn.L1Loss,
+        discriminator: NLayerDiscriminator,
+        vq_loss_weight: float = 1.0,
+        discriminator_weight: float = 1.0,
+    ) -> None:
+        super().__init__()
+        self.reconstruction_loss = reconstruction_loss
+        self.discriminator = discriminator
+        self.vq_loss_weight = vq_loss_weight
+        self.discriminator_weight = discriminator_weight
+
+    @staticmethod
+    def adversarial_loss(logits_real: Tensor, logits_fake: Tensor) -> Tensor:
+        """Calculates the hinge loss of the discriminator.
+
+        Returns the average of mean(relu(1 - logits_real)) and
+        mean(relu(1 + logits_fake)).
+        """
+        loss_real = torch.mean(F.relu(1.0 - logits_real))
+        loss_fake = torch.mean(F.relu(1.0 + logits_fake))
+        d_loss = (loss_real + loss_fake) / 2.0
+        return d_loss
+
+    def forward(
+        self,
+        data: Tensor,
+        reconstructions: Tensor,
+        vq_loss: Tensor,
+        optimizer_idx: int,
+        stage: str,
+    ) -> Tuple[Tensor, Dict[str, Tensor]]:
+        """Calculates the VQGAN loss.
+
+        Args:
+            data: The original input.
+            reconstructions: The reconstruction of ``data``.
+            vq_loss: The vector quantization loss.
+            optimizer_idx: 0 selects the generator objective, 1 selects the
+                discriminator objective.
+            stage: Prefix of the logged metric names, e.g. "train".
+
+        Returns:
+            The total loss and a dictionary with the individual loss terms.
+
+        Raises:
+            ValueError: If ``optimizer_idx`` is neither 0 nor 1.
+        """
+        rec_loss = self.reconstruction_loss(
+            data.contiguous(), reconstructions.contiguous()
+        )
+
+        # GAN part.
+        if optimizer_idx == 0:
+            # Generator objective: maximize the discriminator's score on the
+            # reconstructions.
+            logits_fake = self.discriminator(reconstructions.contiguous())
+            gan_loss = -torch.mean(logits_fake)
+            gan_key = "g_loss"
+        elif optimizer_idx == 1:
+            # Discriminator objective: inputs are detached so that the
+            # adversarial term does not backpropagate into the generator.
+            logits_fake = self.discriminator(reconstructions.contiguous().detach())
+            logits_real = self.discriminator(data.contiguous().detach())
+            gan_loss = self.adversarial_loss(
+                logits_real=logits_real, logits_fake=logits_fake
+            )
+            gan_key = "d_loss"
+        else:
+            raise ValueError(
+                f"Unsupported optimizer_idx {optimizer_idx}, expected 0 or 1."
+            )
+
+        loss = (
+            rec_loss
+            + self.discriminator_weight * gan_loss
+            + self.vq_loss_weight * vq_loss
+        )
+        log = {
+            f"{stage}/loss": loss,
+            f"{stage}/vq_loss": vq_loss,
+            f"{stage}/rec_loss": rec_loss,
+            f"{stage}/{gan_key}": gan_loss,
+        }
+        return loss, log
diff --git a/text_recognizer/data/emnist_mapping.py b/text_recognizer/data/emnist_mapping.py
index 3e91594..4406db7 100644
--- a/text_recognizer/data/emnist_mapping.py
+++ b/text_recognizer/data/emnist_mapping.py
@@ -9,7 +9,9 @@ from text_recognizer.data.emnist import emnist_mapping
class EmnistMapping(AbstractMapping):
- def __init__(self, extra_symbols: Optional[Set[str]] = None, lower: bool = True) -> None:
+ def __init__(
+ self, extra_symbols: Optional[Set[str]] = None, lower: bool = True
+ ) -> None:
self.extra_symbols = set(extra_symbols) if extra_symbols is not None else None
self.mapping, self.inverse_mapping, self.input_size = emnist_mapping(
self.extra_symbols
@@ -20,10 +22,12 @@ class EmnistMapping(AbstractMapping):
def _to_lower(self) -> None:
"""Converts mapping to lowercase letters only."""
+
def _filter(x: int) -> int:
if 40 <= x:
return x - 26
return x
+
self.inverse_mapping = {v: _filter(k) for k, v in enumerate(self.mapping)}
self.mapping = [c for c in self.mapping if not c.isupper()]
diff --git a/text_recognizer/models/base.py b/text_recognizer/models/base.py
index ab3fa35..8b68ed9 100644
--- a/text_recognizer/models/base.py
+++ b/text_recognizer/models/base.py
@@ -24,8 +24,8 @@ class BaseLitModel(LightningModule):
network: Type[nn.Module] = attr.ib()
mapping: Type[AbstractMapping] = attr.ib()
loss_fn: Type[nn.Module] = attr.ib()
- optimizer_config: DictConfig = attr.ib()
- lr_scheduler_config: DictConfig = attr.ib()
+ optimizer_configs: DictConfig = attr.ib()
+ lr_scheduler_configs: DictConfig = attr.ib()
train_acc: torchmetrics.Accuracy = attr.ib(
init=False, default=torchmetrics.Accuracy()
)
@@ -45,40 +45,55 @@ class BaseLitModel(LightningModule):
) -> None:
optimizer.zero_grad(set_to_none=True)
- def _configure_optimizer(self) -> Type[torch.optim.Optimizer]:
+ def _configure_optimizer(self) -> List[Type[torch.optim.Optimizer]]:
"""Configures the optimizer."""
- log.info(f"Instantiating optimizer <{self.optimizer_config._target_}>")
- return hydra.utils.instantiate(
- self.optimizer_config, params=self.network.parameters()
- )
-
- def _configure_lr_scheduler(
- self, optimizer: Type[torch.optim.Optimizer]
- ) -> Dict[str, Any]:
+ optimizers = []
+ for optimizer_config in self.optimizer_configs.values():
+ # `parameters` names the attribute of this module whose parameters the
+ # optimizer updates; it is removed before instantiating the optimizer.
+ network = getattr(self, optimizer_config.parameters)
+ del optimizer_config.parameters
+ log.info(f"Instantiating optimizer <{optimizer_config._target_}>")
+ optimizers.append(
+ hydra.utils.instantiate(
+ # Use the per-optimizer config of this iteration; the singular
+ # `self.optimizer_config` attribute no longer exists.
+ optimizer_config, params=network.parameters()
+ )
+ )
+ return optimizers
+
+ def _configure_lr_schedulers(
+ self, optimizers: List[Type[torch.optim.Optimizer]]
+ ) -> List[Dict[str, Any]]:
"""Configures the lr scheduler."""
- # Extract non-class arguments.
- monitor = self.lr_scheduler_config.monitor
- interval = self.lr_scheduler_config.interval
- del self.lr_scheduler_config.monitor
- del self.lr_scheduler_config.interval
-
- log.info(
- f"Instantiating learning rate scheduler <{self.lr_scheduler_config._target_}>"
- )
- scheduler = {
- "monitor": monitor,
- "interval": interval,
- "scheduler": hydra.utils.instantiate(
- self.lr_scheduler_config, optimizer=optimizer
- ),
- }
- return scheduler
-
- def configure_optimizers(self) -> Tuple[List[type], List[Dict[str, Any]]]:
+ schedulers = []
+ for optimizer, lr_scheduler_config in zip(
+ optimizers, self.lr_scheduler_configs.values()
+ ):
+ # Extract non-class arguments.
+ monitor = lr_scheduler_config.monitor
+ interval = lr_scheduler_config.interval
+ del lr_scheduler_config.monitor
+ del lr_scheduler_config.interval
+
+ log.info(
+ f"Instantiating learning rate scheduler <{lr_scheduler_config._target_}>"
+ )
+ scheduler = {
+ "monitor": monitor,
+ "interval": interval,
+ "scheduler": hydra.utils.instantiate(
+ lr_scheduler_config, optimizer=optimizer
+ ),
+ }
+ schedulers.append(scheduler)
+
+ return schedulers
+
+ def configure_optimizers(
+ self,
+ ) -> Tuple[List[Type[torch.optim.Optimizer]], List[Dict[str, Any]]]:
"""Configures optimizer and lr scheduler."""
- optimizer = self._configure_optimizer()
- scheduler = self._configure_lr_scheduler(optimizer)
- return [optimizer], [scheduler]
+ optimizers = self._configure_optimizer()
+ # One scheduler is configured per optimizer, paired by position.
+ schedulers = self._configure_lr_schedulers(optimizers)
+ return optimizers, schedulers
def forward(self, data: Tensor) -> Tensor:
"""Feedforward pass."""
diff --git a/text_recognizer/models/transformer.py b/text_recognizer/models/transformer.py
index 5fb84a7..75f7523 100644
--- a/text_recognizer/models/transformer.py
+++ b/text_recognizer/models/transformer.py
@@ -60,6 +60,8 @@ class TransformerLitModel(BaseLitModel):
pred = self(data)
self.val_cer(pred, targets)
self.log("val/cer", self.val_cer, on_step=False, on_epoch=True, prog_bar=True)
+ self.test_acc(pred, targets)
+ self.log("val/acc", self.test_acc, on_step=False, on_epoch=True)
def test_step(self, batch: Tuple[Tensor, Tensor], batch_idx: int) -> None:
"""Test step."""
@@ -69,6 +71,8 @@ class TransformerLitModel(BaseLitModel):
pred = self(data)
self.test_cer(pred, targets)
self.log("test/cer", self.test_cer, on_step=False, on_epoch=True, prog_bar=True)
+ self.test_acc(pred, targets)
+ self.log("test/acc", self.test_acc, on_step=False, on_epoch=True)
def predict(self, x: Tensor) -> Tensor:
"""Predicts text in image.
diff --git a/text_recognizer/models/vqgan.py b/text_recognizer/models/vqgan.py
new file mode 100644
index 0000000..8ff65cc
--- /dev/null
+++ b/text_recognizer/models/vqgan.py
@@ -0,0 +1,135 @@
+"""PyTorch Lightning model for base Transformers."""
+from typing import Tuple
+
+import attr
+from torch import Tensor
+
+from text_recognizer.models.base import BaseLitModel
+from text_recognizer.criterions.vqgan_loss import VQGANLoss
+
+
+@attr.s(auto_attribs=True, eq=False)
+class VQVAELitModel(BaseLitModel):
+    """A PyTorch Lightning model for training a network with the VQGAN loss."""
+
+    # NOTE(review): the class name mirrors the VQVAE model; it is kept as is
+    # because the model config that targets this class is not visible here.
+    loss_fn: VQGANLoss = attr.ib()
+    latent_loss_weight: float = attr.ib(default=0.25)
+
+    def forward(self, data: Tensor) -> Tensor:
+        """Forward pass through the wrapped network."""
+        return self.network(data)
+
+    def training_step(
+        self, batch: Tuple[Tensor, Tensor], batch_idx: int, optimizer_idx: int
+    ) -> Tensor:
+        """Training step.
+
+        Computes the loss for the optimizer selected by ``optimizer_idx``
+        (0 or 1) and logs it together with its individual terms.
+
+        Raises:
+            ValueError: If ``optimizer_idx`` is neither 0 nor 1.
+        """
+        data, _ = batch
+
+        reconstructions, vq_loss = self(data)
+
+        # The loss function needs the vq loss, the optimizer index and the
+        # stage, so it is only called once with all of its arguments.
+        if optimizer_idx == 0:
+            progress_key = "train/loss"
+        elif optimizer_idx == 1:
+            progress_key = "train/discriminator_loss"
+        else:
+            raise ValueError(
+                f"Unsupported optimizer_idx {optimizer_idx}, expected 0 or 1."
+            )
+
+        loss, log = self.loss_fn(
+            data=data,
+            reconstructions=reconstructions,
+            vq_loss=vq_loss,
+            optimizer_idx=optimizer_idx,
+            stage="train",
+        )
+        self.log(
+            progress_key,
+            loss,
+            prog_bar=True,
+            logger=True,
+            on_step=True,
+            on_epoch=True,
+        )
+        self.log_dict(log, prog_bar=False, logger=True, on_step=True, on_epoch=True)
+        return loss
+
+    def _evaluation_step(self, batch: Tuple[Tensor, Tensor], stage: str) -> None:
+        """Computes and logs the losses of both optimizers for one batch."""
+        data, _ = batch
+        reconstructions, vq_loss = self(data)
+
+        loss, log = self.loss_fn(
+            data=data,
+            reconstructions=reconstructions,
+            vq_loss=vq_loss,
+            optimizer_idx=0,
+            stage=stage,
+        )
+        self.log(
+            f"{stage}/loss", loss, prog_bar=True, logger=True, on_step=True, on_epoch=True
+        )
+        self.log(
+            f"{stage}/rec_loss",
+            log[f"{stage}/rec_loss"],
+            prog_bar=True,
+            logger=True,
+            on_step=True,
+            on_epoch=True,
+        )
+        self.log_dict(log)
+
+        _, log = self.loss_fn(
+            data=data,
+            reconstructions=reconstructions,
+            vq_loss=vq_loss,
+            optimizer_idx=1,
+            stage=stage,
+        )
+        self.log_dict(log)
+
+    def validation_step(self, batch: Tuple[Tensor, Tensor], batch_idx: int) -> None:
+        """Validation step."""
+        self._evaluation_step(batch, stage="val")
+
+    def test_step(self, batch: Tuple[Tensor, Tensor], batch_idx: int) -> None:
+        """Test step."""
+        self._evaluation_step(batch, stage="test")
diff --git a/text_recognizer/models/vqvae.py b/text_recognizer/models/vqvae.py
index ef9a59a..56229b3 100644
--- a/text_recognizer/models/vqvae.py
+++ b/text_recognizer/models/vqvae.py
@@ -28,8 +28,8 @@ class VQVAELitModel(BaseLitModel):
self.log("train/vq_loss", vq_loss)
self.log("train/loss", loss)
- self.train_acc(reconstructions, data)
- self.log("train/acc", self.train_acc, on_step=False, on_epoch=True)
+ # self.train_acc(reconstructions, data)
+ # self.log("train/acc", self.train_acc, on_step=False, on_epoch=True)
return loss
def validation_step(self, batch: Tuple[Tensor, Tensor], batch_idx: int) -> None:
@@ -42,8 +42,8 @@ class VQVAELitModel(BaseLitModel):
self.log("val/vq_loss", vq_loss)
self.log("val/loss", loss, prog_bar=True)
- self.val_acc(reconstructions, data)
- self.log("val/acc", self.val_acc, on_step=False, on_epoch=True, prog_bar=True)
+ # self.val_acc(reconstructions, data)
+ # self.log("val/acc", self.val_acc, on_step=False, on_epoch=True, prog_bar=True)
def test_step(self, batch: Tuple[Tensor, Tensor], batch_idx: int) -> None:
"""Test step."""
@@ -53,5 +53,5 @@ class VQVAELitModel(BaseLitModel):
loss = loss + self.latent_loss_weight * vq_loss
self.log("test/vq_loss", vq_loss)
self.log("test/loss", loss)
- self.test_acc(reconstructions, data)
- self.log("test/acc", self.test_acc, on_step=False, on_epoch=True)
+ # self.test_acc(reconstructions, data)
+ # self.log("test/acc", self.test_acc, on_step=False, on_epoch=True)
diff --git a/text_recognizer/networks/vqvae/decoder.py b/text_recognizer/networks/vqvae/decoder.py
index f51e0a3..fcbed57 100644
--- a/text_recognizer/networks/vqvae/decoder.py
+++ b/text_recognizer/networks/vqvae/decoder.py
@@ -12,7 +12,14 @@ from text_recognizer.networks.vqvae.residual import Residual
class Decoder(nn.Module):
"""A CNN encoder network."""
- def __init__(self, out_channels: int, hidden_dim: int, channels_multipliers: Sequence[int], dropout_rate: float, activation: str = "mish") -> None:
+ def __init__(
+ self,
+ out_channels: int,
+ hidden_dim: int,
+ channels_multipliers: Sequence[int],
+ dropout_rate: float,
+ activation: str = "mish",
+ ) -> None:
super().__init__()
self.out_channels = out_channels
self.hidden_dim = hidden_dim
@@ -33,9 +40,9 @@ class Decoder(nn.Module):
use_norm=True,
),
]
-
+
activation_fn = activation_function(self.activation)
- out_channels_multipliers = self.channels_multipliers + (1, )
+ out_channels_multipliers = self.channels_multipliers + (1,)
num_blocks = len(self.channels_multipliers)
for i in range(num_blocks):
diff --git a/text_recognizer/networks/vqvae/encoder.py b/text_recognizer/networks/vqvae/encoder.py
index ad8f950..4a5c976 100644
--- a/text_recognizer/networks/vqvae/encoder.py
+++ b/text_recognizer/networks/vqvae/encoder.py
@@ -11,7 +11,14 @@ from text_recognizer.networks.vqvae.residual import Residual
class Encoder(nn.Module):
"""A CNN encoder network."""
- def __init__(self, in_channels: int, hidden_dim: int, channels_multipliers: List[int], dropout_rate: float, activation: str = "mish") -> None:
+ def __init__(
+ self,
+ in_channels: int,
+ hidden_dim: int,
+ channels_multipliers: List[int],
+ dropout_rate: float,
+ activation: str = "mish",
+ ) -> None:
super().__init__()
self.in_channels = in_channels
self.hidden_dim = hidden_dim
@@ -33,7 +40,7 @@ class Encoder(nn.Module):
]
num_blocks = len(self.channels_multipliers)
- channels_multipliers = (1, ) + self.channels_multipliers
+ channels_multipliers = (1,) + self.channels_multipliers
activation_fn = activation_function(self.activation)
for i in range(num_blocks):
diff --git a/text_recognizer/networks/vqvae/norm.py b/text_recognizer/networks/vqvae/norm.py
index 3e6963a..d73f9f8 100644
--- a/text_recognizer/networks/vqvae/norm.py
+++ b/text_recognizer/networks/vqvae/norm.py
@@ -6,13 +6,17 @@ from torch import nn, Tensor
@attr.s(eq=False)
class Normalize(nn.Module):
num_channels: int = attr.ib()
+ num_groups: int = attr.ib(default=32)
norm: nn.GroupNorm = attr.ib(init=False)
def __attrs_post_init__(self) -> None:
"""Post init configuration."""
super().__init__()
self.norm = nn.GroupNorm(
- num_groups=self.num_channels, num_channels=self.num_channels, eps=1.0e-6, affine=True
+ num_groups=self.num_groups,
+ num_channels=self.num_channels,
+ eps=1.0e-6,
+ affine=True,
)
def forward(self, x: Tensor) -> Tensor:
diff --git a/text_recognizer/networks/vqvae/pixelcnn.py b/text_recognizer/networks/vqvae/pixelcnn.py
index 5c580df..b9e6080 100644
--- a/text_recognizer/networks/vqvae/pixelcnn.py
+++ b/text_recognizer/networks/vqvae/pixelcnn.py
@@ -44,7 +44,7 @@ class Encoder(nn.Module):
),
]
num_blocks = len(self.channels_multipliers)
- in_channels_multipliers = (1,) + self.channels_multipliers
+ in_channels_multipliers = (1,) + self.channels_multipliers
for i in range(num_blocks):
in_channels = self.hidden_dim * in_channels_multipliers[i]
out_channels = self.hidden_dim * self.channels_multipliers[i]
@@ -68,7 +68,7 @@ class Encoder(nn.Module):
dropout_rate=self.dropout_rate,
use_norm=True,
),
- Attention(in_channels=self.hidden_dim * self.channels_multipliers[-1])
+ Attention(in_channels=self.hidden_dim * self.channels_multipliers[-1]),
]
encoder += [
@@ -125,7 +125,7 @@ class Decoder(nn.Module):
),
]
- out_channels_multipliers = self.channels_multipliers + (1, )
+ out_channels_multipliers = self.channels_multipliers + (1,)
num_blocks = len(self.channels_multipliers)
for i in range(num_blocks):
@@ -140,11 +140,7 @@ class Decoder(nn.Module):
)
)
if i == 0:
- decoder.append(
- Attention(
- in_channels=out_channels
- )
- )
+ decoder.append(Attention(in_channels=out_channels))
decoder.append(Upsample())
decoder += [
diff --git a/text_recognizer/networks/vqvae/residual.py b/text_recognizer/networks/vqvae/residual.py
index 4ed3781..46b091d 100644
--- a/text_recognizer/networks/vqvae/residual.py
+++ b/text_recognizer/networks/vqvae/residual.py
@@ -18,7 +18,13 @@ class Residual(nn.Module):
super().__init__()
self.block = self._build_res_block()
if self.in_channels != self.out_channels:
- self.conv_shortcut = nn.Conv2d(in_channels=self.in_channels, out_channels=self.out_channels, kernel_size=3, stride=1, padding=1)
+ self.conv_shortcut = nn.Conv2d(
+ in_channels=self.in_channels,
+ out_channels=self.out_channels,
+ kernel_size=3,
+ stride=1,
+ padding=1,
+ )
else:
self.conv_shortcut = None
diff --git a/text_recognizer/networks/vqvae/vqvae.py b/text_recognizer/networks/vqvae/vqvae.py
index 0646119..e8660c4 100644
--- a/text_recognizer/networks/vqvae/vqvae.py
+++ b/text_recognizer/networks/vqvae/vqvae.py
@@ -32,7 +32,6 @@ class VQVAE(nn.Module):
num_embeddings=num_embeddings, embedding_dim=embedding_dim, decay=decay,
)
-
def encode(self, x: Tensor) -> Tensor:
"""Encodes input to a latent code."""
z_e = self.encoder(x)
diff --git a/training/conf/config.yaml b/training/conf/config.yaml
index c606366..5897d87 100644
--- a/training/conf/config.yaml
+++ b/training/conf/config.yaml
@@ -6,11 +6,13 @@ defaults:
- datamodule: iam_extended_paragraphs
- hydra: default
- logger: wandb
- - lr_scheduler: one_cycle
+ - lr_schedulers:
+ - one_cycle
- mapping: word_piece
- model: lit_transformer
- network: conv_transformer
- - optimizer: madgrad
+ - optimizers:
+ - madgrad
- trainer: default
seed: 4711
@@ -32,7 +34,9 @@ work_dir: ${hydra:runtime.cwd}
debug: False
# pretty print config at the start of the run using Rich library
-print_config: True
+print_config: false
# disable python warnings if they annoy you
-ignore_warnings: True
+ignore_warnings: true
+
+summary: null # [1, 576, 640]
diff --git a/training/conf/criterion/mae.yaml b/training/conf/criterion/mae.yaml
new file mode 100644
index 0000000..cb07467
--- /dev/null
+++ b/training/conf/criterion/mae.yaml
@@ -0,0 +1,2 @@
+_target_: torch.nn.L1Loss
+reduction: mean
diff --git a/training/conf/criterion/vqgan_loss.yaml b/training/conf/criterion/vqgan_loss.yaml
new file mode 100644
index 0000000..a1c886e
--- /dev/null
+++ b/training/conf/criterion/vqgan_loss.yaml
@@ -0,0 +1,12 @@
+_target_: text_recognizer.criterions.vqgan_loss.VQGANLoss
+reconstruction_loss:
+ _target_: torch.nn.L1Loss
+ reduction: mean
+discriminator:
+ _target_: text_recognizer.criterions.n_layer_discriminator.NLayerDiscriminator
+ in_channels: 1
+ num_channels: 32
+ num_layers: 3
+vq_loss_weight: 1.0
+discriminator_weight: 1.0
+
diff --git a/training/conf/experiment/vqgan.yaml b/training/conf/experiment/vqgan.yaml
new file mode 100644
index 0000000..3d97892
--- /dev/null
+++ b/training/conf/experiment/vqgan.yaml
@@ -0,0 +1,55 @@
+# @package _global_
+
+defaults:
+ - override /network: vqvae
+ - override /criterion: vqgan_loss
+ - override /model: lit_vqgan
+ - override /callbacks: wandb_vae
+ - override /lr_schedulers: null
+
+datamodule:
+ batch_size: 8
+
+lr_schedulers:
+ - generator:
+ T_max: 256
+ eta_min: 0.0
+ last_epoch: -1
+
+ interval: epoch
+ monitor: val/loss
+
+ - discriminator:
+ T_max: 256
+ eta_min: 0.0
+ last_epoch: -1
+
+ interval: epoch
+ monitor: val/loss
+
+optimizer:
+ - generator:
+ _target_: torch.optim.lr_scheduler.CosineAnnealingLR
+ T_max: 256
+ eta_min: 0.0
+ last_epoch: -1
+
+ interval: epoch
+ monitor: val/loss
+ parameters: network
+
+ - discriminator:
+ _target_: torch.optim.lr_scheduler.CosineAnnealingLR
+ T_max: 256
+ eta_min: 0.0
+ last_epoch: -1
+
+ interval: epoch
+ monitor: val/loss
+ parameters: loss_fn
+
+trainer:
+ max_epochs: 256
+ # gradient_clip_val: 0.25
+
+summary: null
diff --git a/training/conf/experiment/vqvae.yaml b/training/conf/experiment/vqvae.yaml
index 7a9e643..397a039 100644
--- a/training/conf/experiment/vqvae.yaml
+++ b/training/conf/experiment/vqvae.yaml
@@ -2,17 +2,18 @@
defaults:
- override /network: vqvae
- - override /criterion: mse
+ - override /criterion: mae
- override /model: lit_vqvae
- override /callbacks: wandb_vae
- - override /lr_scheduler: cosine_annealing
+ - override /lr_schedulers:
+ - cosine_annealing
trainer:
- max_epochs: 64
+ max_epochs: 256
# gradient_clip_val: 0.25
datamodule:
- batch_size: 16
+ batch_size: 8
# lr_scheduler:
# epochs: 64
@@ -21,4 +22,4 @@ datamodule:
# optimizer:
# lr: 1.0e-3
-summary: [1, 576, 640]
+summary: null
diff --git a/training/conf/experiment/vqvae_pixelcnn.yaml b/training/conf/experiment/vqvae_pixelcnn.yaml
new file mode 100644
index 0000000..4fae782
--- /dev/null
+++ b/training/conf/experiment/vqvae_pixelcnn.yaml
@@ -0,0 +1,24 @@
+# @package _global_
+
+defaults:
+ - override /network: vqvae_pixelcnn
+ - override /criterion: mae
+ - override /model: lit_vqvae
+ - override /callbacks: wandb_vae
+ - override /lr_schedulers:
+ - cosine_annealing
+
+trainer:
+ max_epochs: 256
+ # gradient_clip_val: 0.25
+
+datamodule:
+ batch_size: 8
+
+# lr_scheduler:
+ # epochs: 64
+ # steps_per_epoch: 1245
+
+# optimizer:
+ # lr: 1.0e-3
+
diff --git a/training/conf/lr_scheduler/cosine_annealing.yaml b/training/conf/lr_scheduler/cosine_annealing.yaml
index 62667bb..c53ee3a 100644
--- a/training/conf/lr_scheduler/cosine_annealing.yaml
+++ b/training/conf/lr_scheduler/cosine_annealing.yaml
@@ -1,7 +1,8 @@
-_target_: torch.optim.lr_scheduler.CosineAnnealingLR
-T_max: 64
-eta_min: 0.0
-last_epoch: -1
+cosine_annealing:
+ _target_: torch.optim.lr_scheduler.CosineAnnealingLR
+ T_max: 256
+ eta_min: 0.0
+ last_epoch: -1
-interval: epoch
-monitor: val/loss
+ interval: epoch
+ monitor: val/loss
diff --git a/training/conf/lr_scheduler/one_cycle.yaml b/training/conf/lr_scheduler/one_cycle.yaml
index fb5987a..c60577a 100644
--- a/training/conf/lr_scheduler/one_cycle.yaml
+++ b/training/conf/lr_scheduler/one_cycle.yaml
@@ -1,19 +1,20 @@
-_target_: torch.optim.lr_scheduler.OneCycleLR
-max_lr: 1.0e-3
-total_steps: null
-epochs: 512
-steps_per_epoch: 4992
-pct_start: 0.3
-anneal_strategy: cos
-cycle_momentum: true
-base_momentum: 0.85
-max_momentum: 0.95
-div_factor: 25.0
-final_div_factor: 10000.0
-three_phase: true
-last_epoch: -1
-verbose: false
+one_cycle:
+ _target_: torch.optim.lr_scheduler.OneCycleLR
+ max_lr: 1.0e-3
+ total_steps: null
+ epochs: 512
+ steps_per_epoch: 4992
+ pct_start: 0.3
+ anneal_strategy: cos
+ cycle_momentum: true
+ base_momentum: 0.85
+ max_momentum: 0.95
+ div_factor: 25.0
+ final_div_factor: 10000.0
+ three_phase: true
+ last_epoch: -1
+ verbose: false
-# Non-class arguments
-interval: step
-monitor: val/loss
+ # Non-class arguments
+ interval: step
+ monitor: val/loss
diff --git a/training/conf/network/encoder/pixelcnn_decoder.yaml b/training/conf/network/decoder/pixelcnn_decoder.yaml
index 3895164..cdddb7a 100644
--- a/training/conf/network/encoder/pixelcnn_decoder.yaml
+++ b/training/conf/network/decoder/pixelcnn_decoder.yaml
@@ -1,5 +1,5 @@
_target_: text_recognizer.networks.vqvae.pixelcnn.Decoder
out_channels: 1
hidden_dim: 8
-channels_multipliers: [8, 8, 2, 1]
+channels_multipliers: [8, 2, 1]
dropout_rate: 0.25
diff --git a/training/conf/network/decoder/vae_decoder.yaml b/training/conf/network/decoder/vae_decoder.yaml
index 0a36a54..a5e7286 100644
--- a/training/conf/network/decoder/vae_decoder.yaml
+++ b/training/conf/network/decoder/vae_decoder.yaml
@@ -1,5 +1,5 @@
_target_: text_recognizer.networks.vqvae.decoder.Decoder
out_channels: 1
hidden_dim: 32
-channels_multipliers: [4, 4, 2, 1]
+channels_multipliers: [8, 8, 4, 1]
dropout_rate: 0.25
diff --git a/training/conf/network/decoder/pixelcnn_encoder.yaml b/training/conf/network/encoder/pixelcnn_encoder.yaml
index 47a130d..f12957b 100644
--- a/training/conf/network/decoder/pixelcnn_encoder.yaml
+++ b/training/conf/network/encoder/pixelcnn_encoder.yaml
@@ -1,5 +1,5 @@
_target_: text_recognizer.networks.vqvae.pixelcnn.Encoder
in_channels: 1
hidden_dim: 8
-channels_multipliers: [1, 2, 8, 8]
+channels_multipliers: [1, 2, 8]
dropout_rate: 0.25
diff --git a/training/conf/network/encoder/vae_encoder.yaml b/training/conf/network/encoder/vae_encoder.yaml
index dacd389..58e905d 100644
--- a/training/conf/network/encoder/vae_encoder.yaml
+++ b/training/conf/network/encoder/vae_encoder.yaml
@@ -1,5 +1,5 @@
_target_: text_recognizer.networks.vqvae.encoder.Encoder
in_channels: 1
hidden_dim: 32
-channels_multipliers: [1, 2, 4, 4]
+channels_multipliers: [1, 2, 4, 8, 8]
dropout_rate: 0.25
diff --git a/training/conf/network/vqvae.yaml b/training/conf/network/vqvae.yaml
index d97e9b6..835d0b7 100644
--- a/training/conf/network/vqvae.yaml
+++ b/training/conf/network/vqvae.yaml
@@ -3,7 +3,7 @@ defaults:
- decoder: vae_decoder
_target_: text_recognizer.networks.vqvae.vqvae.VQVAE
-hidden_dim: 128
+hidden_dim: 256
embedding_dim: 32
num_embeddings: 1024
decay: 0.99
diff --git a/training/conf/network/vqvae_pixelcnn.yaml b/training/conf/network/vqvae_pixelcnn.yaml
index 10200bc..cd850af 100644
--- a/training/conf/network/vqvae_pixelcnn.yaml
+++ b/training/conf/network/vqvae_pixelcnn.yaml
@@ -5,5 +5,5 @@ defaults:
_target_: text_recognizer.networks.vqvae.vqvae.VQVAE
hidden_dim: 64
embedding_dim: 32
-num_embeddings: 512
+num_embeddings: 1024
decay: 0.99
diff --git a/training/conf/optimizer/madgrad.yaml b/training/conf/optimizer/madgrad.yaml
index 458b116..a6c059d 100644
--- a/training/conf/optimizer/madgrad.yaml
+++ b/training/conf/optimizer/madgrad.yaml
@@ -1,5 +1,8 @@
-_target_: madgrad.MADGRAD
-lr: 3.0e-4
-momentum: 0.9
-weight_decay: 0
-eps: 1.0e-6
+madgrad:
+ _target_: madgrad.MADGRAD
+ lr: 1.0e-3
+ momentum: 0.9
+ weight_decay: 0
+ eps: 1.0e-6
+
+ parameters: network
diff --git a/training/run.py b/training/run.py
index a2529b0..0cf52e3 100644
--- a/training/run.py
+++ b/training/run.py
@@ -50,8 +50,8 @@ def run(config: DictConfig) -> Optional[float]:
mapping=mapping,
network=network,
loss_fn=loss_fn,
- optimizer_config=config.optimizer,
- lr_scheduler_config=config.lr_scheduler,
+ optimizer_configs=config.optimizers,
+ lr_scheduler_configs=config.lr_schedulers,
_recursive_=False,
)