text_recognizer/networks/vqvae/decoder.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145

"""CNN decoder for the VQ-VAE."""

from typing import List, Optional, Tuple, Type

import torch
from torch import nn
from torch import Tensor

from text_recognizer.networks.util import activation_function
from text_recognizer.networks.vqvae.encoder import _ResidualBlock


class Decoder(nn.Module):
    """A CNN decoder network for the VQ-VAE.

    The network is a stack of two stages:

    1. ``res_block``: a 1x1 convolution mapping ``embedding_dim`` channels to
       ``channels[0]``, followed by ``num_residual_layers`` residual blocks.
    2. ``upsampling_block``: one transposed convolution (plus activation,
       optional ``nn.Upsample`` and optional dropout) per entry in
       ``channels[1:]``, closed by a transposed convolution to a single
       output channel and a ``Tanh``.

    Args:
        channels: Channel widths. ``channels[0]`` is the width of the residual
            bottleneck; ``channels[1:]`` are the output widths of the
            successive transposed convolutions. Must contain at least two
            entries.
        kernel_sizes: Kernel size of each transposed convolution, paired
            positionally with ``channels[1:]``. Surplus entries are ignored.
        strides: Stride of each transposed convolution, paired positionally
            with ``channels[1:]``. Surplus entries are ignored.
        num_residual_layers: Number of residual blocks in the bottleneck.
        embedding_dim: Number of channels of the quantized input.
        upsampling: Optional list of target sizes; ``upsampling[i]`` is passed
            as ``size`` to an ``nn.Upsample`` placed after the i-th transposed
            convolution.
        activation: Name of the activation, resolved with
            ``activation_function``. ``"selu"`` selects ``nn.AlphaDropout``
            instead of ``nn.Dropout`` when dropout is enabled.
        dropout_rate: Dropout probability; a falsy value disables dropout.
    """

    def __init__(
        self,
        channels: List[int],
        kernel_sizes: List[int],
        strides: List[int],
        num_residual_layers: int,
        embedding_dim: int,
        upsampling: Optional[List[List[int]]] = None,
        activation: str = "leaky_relu",
        dropout_rate: float = 0.0,
    ) -> None:
        super().__init__()

        # The dropout flavour must be picked while `activation` is still the
        # activation's name, i.e. before it is resolved to a module below.
        dropout: Optional[nn.Module]
        if dropout_rate:
            if activation == "selu":
                dropout = nn.AlphaDropout(p=dropout_rate)
            else:
                dropout = nn.Dropout(p=dropout_rate)
        else:
            dropout = None

        self.upsampling = upsampling
        self.embedding_dim = embedding_dim

        # Placeholders, replaced by `nn.Sequential` stages in `_build_decoder`.
        # Registering them here keeps the sub-module order stable.
        self.res_block = nn.ModuleList([])
        self.upsampling_block = nn.ModuleList([])

        activation_fn = activation_function(activation)

        # Configure decoder.
        self.decoder = self._build_decoder(
            channels,
            kernel_sizes,
            strides,
            num_residual_layers,
            activation_fn,
            dropout,
        )

    def _build_decompression_block(
        self,
        in_channels: int,
        channels: List[int],
        kernel_sizes: List[int],
        strides: List[int],
        activation: nn.Module,
        dropout: Optional[nn.Module],
    ) -> nn.ModuleList:
        """Build the upsampling layers that map features back to an image.

        Args:
            in_channels: Number of channels entering the first layer.
            channels: Output channels of each transposed convolution.
            kernel_sizes: Kernel size of each transposed convolution.
            strides: Stride of each transposed convolution.
            activation: Activation module placed after every transposed
                convolution except the last. The same instance is reused in
                every layer.
            dropout: Optional dropout module appended after every layer
                except the last. The same instance is reused in every layer.

        Returns:
            A flat ``nn.ModuleList`` ending with a single-channel transposed
            convolution followed by ``nn.Tanh``.

        Raises:
            ValueError: If ``channels``, ``kernel_sizes`` or ``strides`` is
                empty, since the output layer takes its kernel size and
                stride from the last configured layer.
        """
        # `zip` truncates to the shortest argument, so surplus entries in any
        # of the three lists are silently ignored.
        configuration = list(zip(channels, kernel_sizes, strides))
        if not configuration:
            raise ValueError(
                "The decompression block needs at least one "
                "(channels, kernel_size, stride) entry; got "
                f"channels={list(channels)}, kernel_sizes={list(kernel_sizes)}, "
                f"strides={list(strides)}."
            )

        modules = nn.ModuleList([])
        for i, (out_channels, kernel_size, stride) in enumerate(configuration):
            modules.append(
                nn.Sequential(
                    nn.ConvTranspose2d(
                        in_channels,
                        out_channels,
                        kernel_size,
                        stride=stride,
                        padding=1,
                    ),
                    activation,
                )
            )

            if self.upsampling and i < len(self.upsampling):
                modules.append(
                    nn.Upsample(size=self.upsampling[i]),
                )

            if dropout is not None:
                modules.append(dropout)

            in_channels = out_channels

        # The output layer reuses the kernel size and stride of the last
        # configured layer.
        _, final_kernel_size, final_stride = configuration[-1]

        # Appended as two separate entries (not wrapped in a Sequential) so
        # that the module indices, and hence the state-dict keys, stay flat.
        modules.extend(
            [
                nn.ConvTranspose2d(
                    in_channels,
                    1,
                    kernel_size=final_kernel_size,
                    stride=final_stride,
                    padding=1,
                ),
                nn.Tanh(),
            ]
        )

        return modules

    def _build_decoder(
        self,
        channels: List[int],
        kernel_sizes: List[int],
        strides: List[int],
        num_residual_layers: int,
        activation: nn.Module,
        dropout: Optional[nn.Module],
    ) -> nn.Sequential:
        """Assemble the residual bottleneck and the upsampling stage.

        Sets ``self.res_block`` and ``self.upsampling_block`` to
        ``nn.Sequential`` modules and returns them chained together.

        Args:
            channels: Channel widths, see the class docstring.
            kernel_sizes: Kernel sizes of the transposed convolutions.
            strides: Strides of the transposed convolutions.
            num_residual_layers: Number of residual blocks.
            activation: Activation module used in the upsampling stage.
            dropout: Optional dropout module, forwarded to the residual
                blocks and to the upsampling stage.

        Returns:
            ``nn.Sequential(self.res_block, self.upsampling_block)``.
        """
        # 1x1 convolution lifting the embeddings to the bottleneck width.
        bottleneck = [
            nn.Conv2d(
                self.embedding_dim,
                channels[0],
                kernel_size=1,
                stride=1,
            )
        ]

        # Bottleneck module.
        bottleneck.extend(
            _ResidualBlock(channels[0], channels[0], dropout)
            for _ in range(num_residual_layers)
        )

        # Decompression module.
        decompression = self._build_decompression_block(
            channels[0], channels[1:], kernel_sizes, strides, activation, dropout
        )

        self.res_block = nn.Sequential(*bottleneck)
        self.upsampling_block = nn.Sequential(*decompression)

        return nn.Sequential(self.res_block, self.upsampling_block)

    def forward(self, z_q: Tensor) -> Tensor:
        """Reconstruct input from given codes.

        Args:
            z_q: Quantized latent tensor whose channel dimension equals
                ``embedding_dim``.

        Returns:
            The single-channel reconstruction produced by the final ``Tanh``.
        """
        x_reconstruction = self.decoder(z_q)
        return x_reconstruction