summary | refs | log | tree | commit | diff
path: root/text_recognizer/networks
diff options
context:
space:
mode:
author Gustaf Rydholm <gustaf.rydholm@gmail.com> 2021-08-08 19:59:55 +0200
committer Gustaf Rydholm <gustaf.rydholm@gmail.com> 2021-08-08 19:59:55 +0200
commit 240f5e9f20032e82515fa66ce784619527d1041e (patch)
tree b002d28bbfc9abe9b6af090f7db60bea0aeed6e8 /text_recognizer/networks
parent d12f70402371dda586d457af2a3df7fb5b3130ad (diff)
Add VQGAN and loss function
Diffstat (limited to 'text_recognizer/networks')
-rw-r--r-- text_recognizer/networks/vqvae/decoder.py 13
-rw-r--r-- text_recognizer/networks/vqvae/encoder.py 11
-rw-r--r-- text_recognizer/networks/vqvae/norm.py 6
-rw-r--r-- text_recognizer/networks/vqvae/pixelcnn.py 12
-rw-r--r-- text_recognizer/networks/vqvae/residual.py 8
-rw-r--r-- text_recognizer/networks/vqvae/vqvae.py 1
6 files changed, 35 insertions, 16 deletions
diff --git a/text_recognizer/networks/vqvae/decoder.py b/text_recognizer/networks/vqvae/decoder.py
index f51e0a3..fcbed57 100644
--- a/text_recognizer/networks/vqvae/decoder.py
+++ b/text_recognizer/networks/vqvae/decoder.py
@@ -12,7 +12,14 @@ from text_recognizer.networks.vqvae.residual import Residual
class Decoder(nn.Module):
"""A CNN encoder network."""
- def __init__(self, out_channels: int, hidden_dim: int, channels_multipliers: Sequence[int], dropout_rate: float, activation: str = "mish") -> None:
+ def __init__(
+ self,
+ out_channels: int,
+ hidden_dim: int,
+ channels_multipliers: Sequence[int],
+ dropout_rate: float,
+ activation: str = "mish",
+ ) -> None:
super().__init__()
self.out_channels = out_channels
self.hidden_dim = hidden_dim
@@ -33,9 +40,9 @@ class Decoder(nn.Module):
use_norm=True,
),
]
-
+
activation_fn = activation_function(self.activation)
- out_channels_multipliers = self.channels_multipliers + (1, )
+ out_channels_multipliers = self.channels_multipliers + (1,)
num_blocks = len(self.channels_multipliers)
for i in range(num_blocks):
diff --git a/text_recognizer/networks/vqvae/encoder.py b/text_recognizer/networks/vqvae/encoder.py
index ad8f950..4a5c976 100644
--- a/text_recognizer/networks/vqvae/encoder.py
+++ b/text_recognizer/networks/vqvae/encoder.py
@@ -11,7 +11,14 @@ from text_recognizer.networks.vqvae.residual import Residual
class Encoder(nn.Module):
"""A CNN encoder network."""
- def __init__(self, in_channels: int, hidden_dim: int, channels_multipliers: List[int], dropout_rate: float, activation: str = "mish") -> None:
+ def __init__(
+ self,
+ in_channels: int,
+ hidden_dim: int,
+ channels_multipliers: List[int],
+ dropout_rate: float,
+ activation: str = "mish",
+ ) -> None:
super().__init__()
self.in_channels = in_channels
self.hidden_dim = hidden_dim
@@ -33,7 +40,7 @@ class Encoder(nn.Module):
]
num_blocks = len(self.channels_multipliers)
- channels_multipliers = (1, ) + self.channels_multipliers
+ channels_multipliers = (1,) + self.channels_multipliers
activation_fn = activation_function(self.activation)
for i in range(num_blocks):
diff --git a/text_recognizer/networks/vqvae/norm.py b/text_recognizer/networks/vqvae/norm.py
index 3e6963a..d73f9f8 100644
--- a/text_recognizer/networks/vqvae/norm.py
+++ b/text_recognizer/networks/vqvae/norm.py
@@ -6,13 +6,17 @@ from torch import nn, Tensor
@attr.s(eq=False)
class Normalize(nn.Module):
num_channels: int = attr.ib()
+ num_groups: int = attr.ib(default=32)
norm: nn.GroupNorm = attr.ib(init=False)
def __attrs_post_init__(self) -> None:
"""Post init configuration."""
super().__init__()
self.norm = nn.GroupNorm(
- num_groups=self.num_channels, num_channels=self.num_channels, eps=1.0e-6, affine=True
+ num_groups=self.num_groups,
+ num_channels=self.num_channels,
+ eps=1.0e-6,
+ affine=True,
)
def forward(self, x: Tensor) -> Tensor:
diff --git a/text_recognizer/networks/vqvae/pixelcnn.py b/text_recognizer/networks/vqvae/pixelcnn.py
index 5c580df..b9e6080 100644
--- a/text_recognizer/networks/vqvae/pixelcnn.py
+++ b/text_recognizer/networks/vqvae/pixelcnn.py
@@ -44,7 +44,7 @@ class Encoder(nn.Module):
),
]
num_blocks = len(self.channels_multipliers)
- in_channels_multipliers = (1,) + self.channels_multipliers
+ in_channels_multipliers = (1,) + self.channels_multipliers
for i in range(num_blocks):
in_channels = self.hidden_dim * in_channels_multipliers[i]
out_channels = self.hidden_dim * self.channels_multipliers[i]
@@ -68,7 +68,7 @@ class Encoder(nn.Module):
dropout_rate=self.dropout_rate,
use_norm=True,
),
- Attention(in_channels=self.hidden_dim * self.channels_multipliers[-1])
+ Attention(in_channels=self.hidden_dim * self.channels_multipliers[-1]),
]
encoder += [
@@ -125,7 +125,7 @@ class Decoder(nn.Module):
),
]
- out_channels_multipliers = self.channels_multipliers + (1, )
+ out_channels_multipliers = self.channels_multipliers + (1,)
num_blocks = len(self.channels_multipliers)
for i in range(num_blocks):
@@ -140,11 +140,7 @@ class Decoder(nn.Module):
)
)
if i == 0:
- decoder.append(
- Attention(
- in_channels=out_channels
- )
- )
+ decoder.append(Attention(in_channels=out_channels))
decoder.append(Upsample())
decoder += [
diff --git a/text_recognizer/networks/vqvae/residual.py b/text_recognizer/networks/vqvae/residual.py
index 4ed3781..46b091d 100644
--- a/text_recognizer/networks/vqvae/residual.py
+++ b/text_recognizer/networks/vqvae/residual.py
@@ -18,7 +18,13 @@ class Residual(nn.Module):
super().__init__()
self.block = self._build_res_block()
if self.in_channels != self.out_channels:
- self.conv_shortcut = nn.Conv2d(in_channels=self.in_channels, out_channels=self.out_channels, kernel_size=3, stride=1, padding=1)
+ self.conv_shortcut = nn.Conv2d(
+ in_channels=self.in_channels,
+ out_channels=self.out_channels,
+ kernel_size=3,
+ stride=1,
+ padding=1,
+ )
else:
self.conv_shortcut = None
diff --git a/text_recognizer/networks/vqvae/vqvae.py b/text_recognizer/networks/vqvae/vqvae.py
index 0646119..e8660c4 100644
--- a/text_recognizer/networks/vqvae/vqvae.py
+++ b/text_recognizer/networks/vqvae/vqvae.py
@@ -32,7 +32,6 @@ class VQVAE(nn.Module):
num_embeddings=num_embeddings, embedding_dim=embedding_dim, decay=decay,
)
-
def encode(self, x: Tensor) -> Tensor:
"""Encodes input to a latent code."""
z_e = self.encoder(x)