diff options
Diffstat (limited to 'text_recognizer/networks')
-rw-r--r-- | text_recognizer/networks/conv_transformer.py | 122 | ||||
-rw-r--r-- | text_recognizer/networks/image_encoder.py | 45 | ||||
-rw-r--r-- | text_recognizer/networks/text_decoder.py | 62 |
3 files changed, 124 insertions, 105 deletions
diff --git a/text_recognizer/networks/conv_transformer.py b/text_recognizer/networks/conv_transformer.py index 6d54918..e36a786 100644 --- a/text_recognizer/networks/conv_transformer.py +++ b/text_recognizer/networks/conv_transformer.py @@ -1,13 +1,10 @@ """Base network module.""" -from typing import Optional, Tuple, Type +from typing import Type import torch from torch import Tensor, nn from text_recognizer.networks.transformer.decoder import Decoder -from text_recognizer.networks.transformer.embeddings.axial import ( - AxialPositionalEmbeddingImage, -) class ConvTransformer(nn.Module): @@ -15,124 +12,39 @@ class ConvTransformer(nn.Module): def __init__( self, - input_dims: Tuple[int, int, int], - hidden_dim: int, - num_classes: int, - pad_index: Tensor, encoder: Type[nn.Module], decoder: Decoder, - pixel_embedding: AxialPositionalEmbeddingImage, - token_pos_embedding: Type[nn.Module], ) -> None: super().__init__() - self.input_dims = input_dims - self.hidden_dim = hidden_dim - self.num_classes = num_classes - self.pad_index = pad_index self.encoder = encoder self.decoder = decoder - # Token embedding. - self.token_embedding = nn.Embedding( - num_embeddings=self.num_classes, embedding_dim=self.hidden_dim - ) + def encode(self, img: Tensor) -> Tensor: + """Encodes images to latent representation.""" + return self.encoder(img) - # Positional encoding for decoder tokens. - self.token_pos_embedding = token_pos_embedding - self.pixel_embedding = pixel_embedding + def decode(self, tokens: Tensor, img_features: Tensor) -> Tensor: + """Decodes latent images embedding into characters.""" + return self.decoder(tokens, img_features) - # Latent projector for down sampling number of filters and 2d - # positional encoding. - self.conv = nn.Conv2d( - in_channels=self.encoder.out_channels, - out_channels=self.hidden_dim, - kernel_size=1, - ) - - # Output layer - self.to_logits = nn.Linear( - in_features=self.hidden_dim, out_features=self.num_classes - ) - - # Initalize weights for encoder. - self.init_weights() - - def init_weights(self) -> None: - """Initalize weights for decoder network and to_logits.""" - nn.init.kaiming_normal_(self.token_embedding.weight) - - def encode(self, x: Tensor) -> Tensor: - """Encodes an image into a latent feature vector. - - Args: - x (Tensor): Image tensor. - - Shape: - - x: :math: `(B, C, H, W)` - - z: :math: `(B, Sx, E)` - - where Sx is the length of the flattened feature maps projected from - the encoder. E latent dimension for each pixel in the projected - feature maps. - - Returns: - Tensor: A Latent embedding of the image. - """ - z = self.encoder(x) - z = self.conv(z) - z = z + self.pixel_embedding(z) - z = z.flatten(start_dim=2) - - # Permute tensor from [B, E, Ho * Wo] to [B, Sx, E] - z = z.permute(0, 2, 1) - return z - - def decode(self, src: Tensor, trg: Tensor) -> Tensor: - """Decodes latent images embedding into word pieces. - - Args: - src (Tensor): Latent images embedding. - trg (Tensor): Word embeddings. - - Shapes: - - z: :math: `(B, Sx, D)` - - context: :math: `(B, Sy)` - - out: :math: `(B, Sy, C)` - - where Sy is the length of the output and C is the number of classes. - - Returns: - Tensor: Sequence of word piece embeddings. - """ - trg = trg.long() - trg_mask = trg != self.pad_index - trg = self.token_embedding(trg) - trg = trg + self.token_pos_embedding(trg) - out = self.decoder(x=trg, context=src, input_mask=trg_mask) - logits = ( - out @ torch.transpose(self.token_embedding.weight.to(trg.dtype), 0, 1) - ).float() - logits = self.to_logits(out) # [B, Sy, C] - logits = logits.permute(0, 2, 1) # [B, C, Sy] - return logits - - def forward(self, x: Tensor, context: Tensor) -> Tensor: + def forward(self, img: Tensor, tokens: Tensor) -> Tensor: """Encodes images into word piece logtis. Args: - x (Tensor): Input image(s). - context (Tensor): Target word embeddings. + img (Tensor): Input image(s). + tokens (Tensor): Target word embeddings. Shapes: - - x: :math: `(B, D, H, W)` - - context: :math: `(B, Sy, C)` + - img: :math: `(B, 1, H, W)` + - tokens: :math: `(B, Sy)` + - logits: :math: `(B, Sy, C)` - where B is the batch size, D is the number of input channels, H is - the image height, W is the image width, and C is the number of classes. + where B is the batch size, H is the image height, W is the image + width, Sy the output length, and C is the number of classes. Returns: Tensor: Sequence of logits. """ - z = self.encode(x) - logits = self.decode(z, context) + img_features = self.encode(img) + logits = self.decode(tokens, img_features) return logits diff --git a/text_recognizer/networks/image_encoder.py b/text_recognizer/networks/image_encoder.py new file mode 100644 index 0000000..b5fd0c5 --- /dev/null +++ b/text_recognizer/networks/image_encoder.py @@ -0,0 +1,45 @@ +"""Encodes images to latent embeddings.""" +from typing import Tuple, Type + +from torch import Tensor, nn + +from text_recognizer.networks.transformer.embeddings.axial import ( + AxialPositionalEmbeddingImage, +) + + +class ImageEncoder(nn.Module): + """Base transformer network.""" + + def __init__( + self, + encoder: Type[nn.Module], + pixel_embedding: AxialPositionalEmbeddingImage, + ) -> None: + super().__init__() + self.encoder = encoder + self.pixel_embedding = pixel_embedding + + def forward(self, img: Tensor) -> Tensor: + """Encodes an image into a latent feature vector. + + Args: + img (Tensor): Image tensor. + + Shape: + - x: :math: `(B, C, H, W)` + - z: :math: `(B, Sx, D)` + + where Sx is the length of the flattened feature maps projected from + the encoder. D latent dimension for each pixel in the projected + feature maps. + + Returns: + Tensor: A Latent embedding of the image. + """ + z = self.encoder(img) + z = z + self.pixel_embedding(z) + z = z.flatten(start_dim=2) + # Permute tensor from [B, E, Ho * Wo] to [B, Sx, E] + z = z.permute(0, 2, 1) + return z diff --git a/text_recognizer/networks/text_decoder.py b/text_recognizer/networks/text_decoder.py new file mode 100644 index 0000000..c054b41 --- /dev/null +++ b/text_recognizer/networks/text_decoder.py @@ -0,0 +1,62 @@ +"""Text decoder.""" +from typing import Type + +import torch +from torch import Tensor, nn + +from text_recognizer.networks.transformer.decoder import Decoder + + +class TextDecoder(nn.Module): + """Decoder transformer network.""" + + def __init__( + self, + hidden_dim: int, + num_classes: int, + pad_index: Tensor, + decoder: Decoder, + token_pos_embedding: Type[nn.Module], + ) -> None: + super().__init__() + self.hidden_dim = hidden_dim + self.num_classes = num_classes + self.pad_index = pad_index + self.decoder = decoder + self.token_embedding = nn.Embedding( + num_embeddings=self.num_classes, embedding_dim=self.hidden_dim + ) + self.token_pos_embedding = token_pos_embedding + self.to_logits = nn.Linear( + in_features=self.hidden_dim, out_features=self.num_classes + ) + + def forward(self, tokens: Tensor, img_features: Tensor) -> Tensor: + """Decodes latent images embedding into word pieces. + + Args: + tokens (Tensor): Token indecies. + img_features (Tensor): Latent images embedding. + + Shapes: + - tokens: :math: `(B, Sy)` + - img_features: :math: `(B, Sx, D)` + - logits: :math: `(B, Sy, C)` + + where Sy is the length of the output, C is the number of classes + and D is the hidden dimension. + + Returns: + Tensor: Sequence of logits. + """ + tokens = tokens.long() + mask = tokens != self.pad_index + tokens = self.token_embedding(tokens) + tokens = tokens + self.token_pos_embedding(tokens) + tokens = self.decoder(x=tokens, context=img_features, mask=mask) + logits = ( + tokens @ torch.transpose(self.token_embedding.weight.to(tokens.dtype), 0, 1) + ).float() + logits = self.to_logits(tokens) # [B, Sy, C] + logits = logits.permute(0, 2, 1) # [B, C, Sy] + return logits |