1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
|
"""CNN encoder for the VQ-VAE."""
from typing import Sequence, Optional, Tuple, Type
import torch
from torch import nn
from torch import Tensor
from text_recognizer.networks.util import activation_function
from text_recognizer.networks.vqvae.vector_quantizer import VectorQuantizer
class _ResidualBlock(nn.Module):
    """Residual unit: 3x3 conv -> ReLU -> 1x1 conv, added back onto the input.

    Args:
        in_channels: Number of channels of the incoming tensor.
        out_channels: Number of channels produced by both convolutions. The
            skip connection in ``forward`` adds the block output to the
            input, so the two shapes have to be compatible (normally
            ``out_channels == in_channels``).
        dropout: Optional, already constructed dropout module that is
            appended after the last convolution. ``None`` disables dropout.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        dropout: Optional[nn.Module],
    ) -> None:
        super().__init__()
        # Collect the layers in a local list so that ``self.block`` is only
        # ever bound to the final ``nn.Sequential``.
        layers = [
            nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1, bias=False),
            nn.ReLU(inplace=True),
            nn.Conv2d(out_channels, out_channels, kernel_size=1, bias=False),
        ]
        if dropout is not None:
            layers.append(dropout)
        self.block = nn.Sequential(*layers)

    def forward(self, x: Tensor) -> Tensor:
        """Apply the residual forward pass."""
        return x + self.block(x)
class Encoder(nn.Module):
    """A CNN encoder network.

    The network is a stack of strided convolutions, followed by residual
    blocks and a 1x1 convolution that projects to ``embedding_dim`` channels.
    Its output is handed to a ``VectorQuantizer``.

    Args:
        in_channels: Number of channels of the input tensor.
        channels: Output channels of each compression convolution.
        kernel_sizes: Kernel size of each compression convolution.
        strides: Stride of each compression convolution.
        num_residual_layers: Number of residual blocks after the compression
            stage.
        embedding_dim: Number of channels produced by the final 1x1
            convolution; also forwarded to the vector quantizer.
        num_embeddings: Forwarded to the vector quantizer.
        beta: Forwarded to the vector quantizer (presumably the weight of
            the commitment loss -- confirm in ``VectorQuantizer``).
        activation: Name handed to ``activation_function`` to obtain the
            activation module used after each compression convolution.
        dropout_rate: Dropout probability. A falsy value (``0.0``) disables
            dropout entirely.
    """

    def __init__(
        self,
        in_channels: int,
        channels: Sequence[int],
        kernel_sizes: Sequence[int],
        strides: Sequence[int],
        num_residual_layers: int,
        embedding_dim: int,
        num_embeddings: int,
        beta: float = 0.25,
        activation: str = "leaky_relu",
        dropout_rate: float = 0.0,
    ) -> None:
        super().__init__()

        # AlphaDropout is paired with the "selu" activation; every other
        # activation gets the regular Dropout.
        dropout: Optional[nn.Module] = None
        if dropout_rate:
            dropout_cls = nn.AlphaDropout if activation == "selu" else nn.Dropout
            dropout = dropout_cls(p=dropout_rate)

        self.embedding_dim = embedding_dim
        self.num_embeddings = num_embeddings
        self.beta = beta

        # Keep the module under its own name rather than rebinding the
        # ``activation`` string parameter.
        activation_module = activation_function(activation)

        # Configure encoder.
        self.encoder = self._build_encoder(
            in_channels,
            channels,
            kernel_sizes,
            strides,
            num_residual_layers,
            activation_module,
            dropout,
        )

        # Configure Vector Quantizer.
        self.vector_quantizer = VectorQuantizer(
            self.num_embeddings, self.embedding_dim, self.beta
        )

    @staticmethod
    def _build_compression_block(
        in_channels: int,
        channels: Sequence[int],
        kernel_sizes: Sequence[int],
        strides: Sequence[int],
        activation: nn.Module,
        dropout: Optional[nn.Module],
    ) -> nn.ModuleList:
        """Build the strided convolution stage.

        One ``Conv2d`` + activation pair is created per
        ``(channels, kernel_sizes, strides)`` triple, optionally followed by
        ``dropout``. The three sequences are zipped, so iteration stops at
        the shortest one. The same ``activation`` and ``dropout`` instances
        are reused for every layer.

        Returns:
            The layers in forward order.
        """
        modules = nn.ModuleList([])
        for out_channels, kernel_size, stride in zip(channels, kernel_sizes, strides):
            modules.append(
                nn.Sequential(
                    nn.Conv2d(
                        in_channels, out_channels, kernel_size, stride=stride, padding=1
                    ),
                    activation,
                )
            )
            if dropout is not None:
                modules.append(dropout)
            # The next convolution consumes what this one produced.
            in_channels = out_channels
        return modules

    def _build_encoder(
        self,
        in_channels: int,
        channels: Sequence[int],
        kernel_sizes: Sequence[int],
        strides: Sequence[int],
        num_residual_layers: int,
        activation: nn.Module,
        dropout: Optional[nn.Module],
    ) -> nn.Sequential:
        """Assemble compression stage, residual bottleneck and projection.

        Reads ``self.embedding_dim`` for the output width of the final 1x1
        convolution.

        Returns:
            The complete encoder as a single ``nn.Sequential``.
        """
        # Compression module.
        layers = list(
            self._build_compression_block(
                in_channels, channels, kernel_sizes, strides, activation, dropout
            )
        )

        # Bottleneck module: residual blocks at the width of the last entry
        # in ``channels``.
        width = channels[-1]
        layers.extend(
            _ResidualBlock(width, width, dropout) for _ in range(num_residual_layers)
        )

        # 1x1 projection to the embedding dimension.
        layers.append(nn.Conv2d(width, self.embedding_dim, kernel_size=1, stride=1))

        return nn.Sequential(*layers)

    def forward(self, x: Tensor) -> Tuple[Tensor, Tensor]:
        """Encodes input into a discrete representation.

        Returns:
            The two values produced by the vector quantizer for the encoder
            output: the quantized tensor and the accompanying loss.
        """
        z_e = self.encoder(x)
        z_q, vq_loss = self.vector_quantizer(z_e)
        return z_q, vq_loss
|