1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
|
"""Encodes images to latent embeddings."""
from typing import Tuple, Type
from torch import Tensor, nn
from text_recognizer.networks.transformer.embeddings.axial import (
AxialPositionalEmbeddingImage,
)
class ImageEncoder(nn.Module):
    """Encodes images to a sequence of latent embeddings.

    The image is passed through ``encoder``, the output of
    ``pixel_embedding`` is added to the resulting feature maps, and the
    feature maps are flattened into a sequence with the feature dimension
    last.

    Args:
        encoder (nn.Module): Module instance called on the image tensor to
            produce the feature maps. It is invoked directly in ``forward``,
            so it must be an instantiated module, not a module class.
        pixel_embedding (AxialPositionalEmbeddingImage): Module called on the
            feature maps; its output is added to them element-wise (it must
            therefore be broadcastable to the feature-map shape).
    """

    def __init__(
        self,
        encoder: nn.Module,
        pixel_embedding: AxialPositionalEmbeddingImage,
    ) -> None:
        super().__init__()
        self.encoder = encoder
        self.pixel_embedding = pixel_embedding

    def forward(self, img: Tensor) -> Tensor:
        """Encodes an image into a sequence of latent feature vectors.

        Args:
            img (Tensor): Image tensor.

        Shape:
            - img: :math:`(B, C, H, W)`
            - z: :math:`(B, Sx, D)`

            where Sx is the length of the flattened feature maps projected
            from the encoder, and D is the latent dimension for each pixel in
            the projected feature maps.

        Returns:
            Tensor: A latent embedding of the image.
        """
        features = self.encoder(img)
        # Add the positional information before the spatial layout is lost.
        features = features + self.pixel_embedding(features)
        # Collapse every dimension after the feature dimension into one
        # sequence axis: [B, E, Ho, Wo] -> [B, E, Ho * Wo].
        features = features.flatten(start_dim=2)
        # Move the feature dimension last: [B, E, Sx] -> [B, Sx, E].
        return features.permute(0, 2, 1)
|