From 240f5e9f20032e82515fa66ce784619527d1041e Mon Sep 17 00:00:00 2001 From: Gustaf Rydholm Date: Sun, 8 Aug 2021 19:59:55 +0200 Subject: Add VQGAN and loss function --- text_recognizer/networks/vqvae/decoder.py | 13 ++++++++++--- text_recognizer/networks/vqvae/encoder.py | 11 +++++++++-- text_recognizer/networks/vqvae/norm.py | 6 +++++- text_recognizer/networks/vqvae/pixelcnn.py | 12 ++++-------- text_recognizer/networks/vqvae/residual.py | 8 +++++++- text_recognizer/networks/vqvae/vqvae.py | 1 - 6 files changed, 35 insertions(+), 16 deletions(-) (limited to 'text_recognizer/networks') diff --git a/text_recognizer/networks/vqvae/decoder.py b/text_recognizer/networks/vqvae/decoder.py index f51e0a3..fcbed57 100644 --- a/text_recognizer/networks/vqvae/decoder.py +++ b/text_recognizer/networks/vqvae/decoder.py @@ -12,7 +12,14 @@ from text_recognizer.networks.vqvae.residual import Residual class Decoder(nn.Module): """A CNN encoder network.""" - def __init__(self, out_channels: int, hidden_dim: int, channels_multipliers: Sequence[int], dropout_rate: float, activation: str = "mish") -> None: + def __init__( + self, + out_channels: int, + hidden_dim: int, + channels_multipliers: Sequence[int], + dropout_rate: float, + activation: str = "mish", + ) -> None: super().__init__() self.out_channels = out_channels self.hidden_dim = hidden_dim @@ -33,9 +40,9 @@ class Decoder(nn.Module): use_norm=True, ), ] - + activation_fn = activation_function(self.activation) - out_channels_multipliers = self.channels_multipliers + (1, ) + out_channels_multipliers = self.channels_multipliers + (1,) num_blocks = len(self.channels_multipliers) for i in range(num_blocks): diff --git a/text_recognizer/networks/vqvae/encoder.py b/text_recognizer/networks/vqvae/encoder.py index ad8f950..4a5c976 100644 --- a/text_recognizer/networks/vqvae/encoder.py +++ b/text_recognizer/networks/vqvae/encoder.py @@ -11,7 +11,14 @@ from text_recognizer.networks.vqvae.residual import Residual class Encoder(nn.Module): """A CNN encoder network.""" - def __init__(self, in_channels: int, hidden_dim: int, channels_multipliers: List[int], dropout_rate: float, activation: str = "mish") -> None: + def __init__( + self, + in_channels: int, + hidden_dim: int, + channels_multipliers: List[int], + dropout_rate: float, + activation: str = "mish", + ) -> None: super().__init__() self.in_channels = in_channels self.hidden_dim = hidden_dim @@ -33,7 +40,7 @@ class Encoder(nn.Module): ] num_blocks = len(self.channels_multipliers) - channels_multipliers = (1, ) + self.channels_multipliers + channels_multipliers = (1,) + self.channels_multipliers activation_fn = activation_function(self.activation) for i in range(num_blocks): diff --git a/text_recognizer/networks/vqvae/norm.py b/text_recognizer/networks/vqvae/norm.py index 3e6963a..d73f9f8 100644 --- a/text_recognizer/networks/vqvae/norm.py +++ b/text_recognizer/networks/vqvae/norm.py @@ -6,13 +6,17 @@ from torch import nn, Tensor @attr.s(eq=False) class Normalize(nn.Module): num_channels: int = attr.ib() + num_groups: int = attr.ib(default=32) norm: nn.GroupNorm = attr.ib(init=False) def __attrs_post_init__(self) -> None: """Post init configuration.""" super().__init__() self.norm = nn.GroupNorm( - num_groups=self.num_channels, num_channels=self.num_channels, eps=1.0e-6, affine=True + num_groups=self.num_groups, + num_channels=self.num_channels, + eps=1.0e-6, + affine=True, ) def forward(self, x: Tensor) -> Tensor: diff --git a/text_recognizer/networks/vqvae/pixelcnn.py b/text_recognizer/networks/vqvae/pixelcnn.py index 5c580df..b9e6080 100644 --- a/text_recognizer/networks/vqvae/pixelcnn.py +++ b/text_recognizer/networks/vqvae/pixelcnn.py @@ -44,7 +44,7 @@ class Encoder(nn.Module): ), ] num_blocks = len(self.channels_multipliers) - in_channels_multipliers = (1,) + self.channels_multipliers + in_channels_multipliers = (1,) + self.channels_multipliers for i in range(num_blocks): in_channels = self.hidden_dim * in_channels_multipliers[i] out_channels = self.hidden_dim * self.channels_multipliers[i] @@ -68,7 +68,7 @@ class Encoder(nn.Module): dropout_rate=self.dropout_rate, use_norm=True, ), - Attention(in_channels=self.hidden_dim * self.channels_multipliers[-1]) + Attention(in_channels=self.hidden_dim * self.channels_multipliers[-1]), ] encoder += [ @@ -125,7 +125,7 @@ class Decoder(nn.Module): ), ] - out_channels_multipliers = self.channels_multipliers + (1, ) + out_channels_multipliers = self.channels_multipliers + (1,) num_blocks = len(self.channels_multipliers) for i in range(num_blocks): @@ -140,11 +140,7 @@ class Decoder(nn.Module): ) ) if i == 0: - decoder.append( - Attention( - in_channels=out_channels - ) - ) + decoder.append(Attention(in_channels=out_channels)) decoder.append(Upsample()) decoder += [ diff --git a/text_recognizer/networks/vqvae/residual.py b/text_recognizer/networks/vqvae/residual.py index 4ed3781..46b091d 100644 --- a/text_recognizer/networks/vqvae/residual.py +++ b/text_recognizer/networks/vqvae/residual.py @@ -18,7 +18,13 @@ class Residual(nn.Module): super().__init__() self.block = self._build_res_block() if self.in_channels != self.out_channels: - self.conv_shortcut = nn.Conv2d(in_channels=self.in_channels, out_channels=self.out_channels, kernel_size=3, stride=1, padding=1) + self.conv_shortcut = nn.Conv2d( + in_channels=self.in_channels, + out_channels=self.out_channels, + kernel_size=3, + stride=1, + padding=1, + ) else: self.conv_shortcut = None diff --git a/text_recognizer/networks/vqvae/vqvae.py b/text_recognizer/networks/vqvae/vqvae.py index 0646119..e8660c4 100644 --- a/text_recognizer/networks/vqvae/vqvae.py +++ b/text_recognizer/networks/vqvae/vqvae.py @@ -32,7 +32,6 @@ class VQVAE(nn.Module): num_embeddings=num_embeddings, embedding_dim=embedding_dim, decay=decay, ) - def encode(self, x: Tensor) -> Tensor: """Encodes input to a latent code.""" z_e = self.encoder(x) -- cgit v1.2.3-70-g09d2