# text_recognizer/networks/vqvae/encoder.py

"""CNN encoder for the VQ-VAE."""
from typing import Sequence, Optional, Tuple, Type

import torch
from torch import nn
from torch import Tensor

from text_recognizer.networks.util import activation_function
from text_recognizer.networks.vqvae.vector_quantizer import VectorQuantizer


class _ResidualBlock(nn.Module):
    """Residual block computing ``x + block(x)``.

    ``block`` is a 3x3 convolution (padding 1, no bias), an in-place ReLU and
    a 1x1 convolution (no bias), optionally followed by a dropout module.

    Args:
        in_channels: Number of input channels of the first convolution.
        out_channels: Number of output channels of both convolutions.
        dropout: Optional module *instance* appended after the last
            convolution; ``None`` leaves the block without dropout.

    NOTE(review): the skip connection adds the input to the block output, so
    ``in_channels`` and ``out_channels`` presumably have to match (or be
    broadcastable) - confirm against callers.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        dropout: Optional[nn.Module],
    ) -> None:
        super().__init__()
        # Collect the layers locally so that ``self.block`` is only ever the
        # registered ``nn.Sequential`` and never a plain Python list.
        layers = [
            nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1, bias=False),
            nn.ReLU(inplace=True),
            nn.Conv2d(out_channels, out_channels, kernel_size=1, bias=False),
        ]
        if dropout is not None:
            layers.append(dropout)

        self.block = nn.Sequential(*layers)

    def forward(self, x: Tensor) -> Tensor:
        """Apply the residual forward pass."""
        return x + self.block(x)


class Encoder(nn.Module):
    """A CNN encoder network.

    The encoder is a stack of strided convolutions (the compression module),
    followed by residual blocks and a 1x1 convolution that projects to
    ``embedding_dim`` channels. Its output is passed through a
    ``VectorQuantizer``.

    Args:
        in_channels: Number of channels of the input tensor.
        channels: Output channels of each compression convolution.
        kernel_sizes: Kernel size of each compression convolution.
        strides: Stride of each compression convolution.
        num_residual_layers: Number of residual blocks after compression.
        embedding_dim: Number of channels of the encoder output; also
            forwarded to the ``VectorQuantizer``.
        num_embeddings: Forwarded to the ``VectorQuantizer``.
        beta: Forwarded to the ``VectorQuantizer``.
        activation: Name handed to ``activation_function`` to obtain the
            activation module used in the compression layers.
        dropout_rate: Dropout probability; a falsy value disables dropout.

    Raises:
        ValueError: If ``channels`` is empty or if ``channels``,
            ``kernel_sizes`` and ``strides`` differ in length.
    """

    def __init__(
        self,
        in_channels: int,
        channels: Sequence[int],
        kernel_sizes: Sequence[int],
        strides: Sequence[int],
        num_residual_layers: int,
        embedding_dim: int,
        num_embeddings: int,
        beta: float = 0.25,
        activation: str = "leaky_relu",
        dropout_rate: float = 0.0,
    ) -> None:
        super().__init__()

        # "selu" gets AlphaDropout, every other activation regular Dropout.
        dropout: Optional[nn.Module] = None
        if dropout_rate:
            dropout_cls = nn.AlphaDropout if activation == "selu" else nn.Dropout
            dropout = dropout_cls(p=dropout_rate)

        self.embedding_dim = embedding_dim
        self.num_embeddings = num_embeddings
        self.beta = beta
        activation_module = activation_function(activation)

        # Configure encoder.
        self.encoder = self._build_encoder(
            in_channels,
            channels,
            kernel_sizes,
            strides,
            num_residual_layers,
            activation_module,
            dropout,
        )

        # Configure Vector Quantizer.
        self.vector_quantizer = VectorQuantizer(
            self.num_embeddings, self.embedding_dim, self.beta
        )

    @staticmethod
    def _build_compression_block(
        in_channels: int,
        channels: Sequence[int],
        kernel_sizes: Sequence[int],
        strides: Sequence[int],
        activation: nn.Module,
        dropout: Optional[nn.Module],
    ) -> nn.ModuleList:
        """Build the stack of strided convolutions.

        Each entry of ``channels``/``kernel_sizes``/``strides`` yields one
        ``Conv2d`` (padding 1) followed by ``activation``; when ``dropout`` is
        given it is appended after every such layer.

        Args:
            in_channels: Number of channels of the input tensor.
            channels: Output channels per convolution.
            kernel_sizes: Kernel size per convolution.
            strides: Stride per convolution.
            activation: Activation module instance used after each convolution.
            dropout: Optional dropout module instance.

        Returns:
            The layers as a ``nn.ModuleList``.

        Raises:
            ValueError: If ``channels`` is empty or the three sequences differ
                in length.
        """
        if len(channels) == 0:
            raise ValueError("channels must contain at least one entry.")
        # zip() would silently drop the surplus entries of a mismatched
        # configuration, so reject it explicitly instead.
        if not len(channels) == len(kernel_sizes) == len(strides):
            raise ValueError(
                "channels, kernel_sizes and strides must have the same length, "
                f"got {len(channels)}, {len(kernel_sizes)} and {len(strides)}."
            )

        modules = nn.ModuleList([])
        for out_channels, kernel_size, stride in zip(channels, kernel_sizes, strides):
            modules.append(
                nn.Sequential(
                    nn.Conv2d(
                        in_channels, out_channels, kernel_size, stride=stride, padding=1
                    ),
                    activation,
                )
            )

            if dropout is not None:
                modules.append(dropout)

            # The next convolution consumes what this one produced.
            in_channels = out_channels

        return modules

    def _build_encoder(
        self,
        in_channels: int,
        channels: Sequence[int],
        kernel_sizes: Sequence[int],
        strides: Sequence[int],
        num_residual_layers: int,
        activation: nn.Module,
        dropout: Optional[nn.Module],
    ) -> nn.Sequential:
        """Assemble compression layers, residual blocks and output projection.

        Args:
            in_channels: Number of channels of the input tensor.
            channels: Output channels per compression convolution.
            kernel_sizes: Kernel size per compression convolution.
            strides: Stride per compression convolution.
            num_residual_layers: Number of residual blocks to append.
            activation: Activation module instance for the compression layers.
            dropout: Optional dropout module instance.

        Returns:
            The complete encoder as a ``nn.Sequential``.

        Raises:
            ValueError: If the compression configuration is empty or its
                sequences differ in length.
        """
        encoder = nn.ModuleList([])

        # Compression module (also validates the configuration).
        encoder.extend(
            self._build_compression_block(
                in_channels, channels, kernel_sizes, strides, activation, dropout
            )
        )

        # Bottleneck module: residual blocks keep the last compression width.
        bottleneck_channels = channels[-1]
        encoder.extend(
            [
                _ResidualBlock(bottleneck_channels, bottleneck_channels, dropout)
                for _ in range(num_residual_layers)
            ]
        )

        # 1x1 convolution projecting to the embedding dimension.
        encoder.append(
            nn.Conv2d(
                bottleneck_channels,
                self.embedding_dim,
                kernel_size=1,
                stride=1,
            )
        )

        return nn.Sequential(*encoder)

    def forward(self, x: Tensor) -> Tuple[Tensor, Tensor]:
        """Encodes input into a discrete representation.

        Args:
            x: Input tensor fed to the convolutional encoder.

        Returns:
            The pair returned by the vector quantizer for the encoded input
            (named ``z_q`` and ``vq_loss`` here).
        """
        z_e = self.encoder(x)
        z_q, vq_loss = self.vector_quantizer(z_e)
        return z_q, vq_loss