path: root/text_recognizer/networks/conv_perceiver.py
"""Perceiver network module."""
from typing import Optional, Tuple, Type

from einops import repeat
import torch
from torch import nn, Tensor

from text_recognizer.networks.perceiver.perceiver import PerceiverIO
from text_recognizer.networks.transformer.embeddings.absolute import (
    AbsolutePositionalEmbedding,
)
from text_recognizer.networks.transformer.embeddings.axial import (
    AxialPositionalEmbedding,
)


class ConvPerceiver(nn.Module):
    """Base transformer network."""

    def __init__(
        self,
        input_dims: Tuple[int, int, int],
        hidden_dim: int,
        queries_dim: int,
        num_queries: int,
        num_classes: int,
        pad_index: int,
        encoder: nn.Module,
        decoder: PerceiverIO,
        max_length: int,
        pixel_embedding: AxialPositionalEmbedding,
        query_pos_emb: AbsolutePositionalEmbedding,
    ) -> None:
        super().__init__()
        self.input_dims = input_dims
        self.hidden_dim = hidden_dim
        self.num_classes = num_classes
        self.pad_index = pad_index
        self.max_length = max_length
        self.encoder = encoder
        self.decoder = decoder
        self.pixel_embedding = pixel_embedding
        self.query_pos_emb = query_pos_emb
        self.queries = nn.Parameter(torch.randn(num_queries, queries_dim))

    def encode(self, x: Tensor) -> Tensor:
        """Encode an image batch into a sequence of feature vectors."""
        z = self.encoder(x)
        # Concatenate the axial positional embedding along the channel dim,
        # mirroring the Perceiver's concatenated (rather than added)
        # positional features.
        z = torch.cat([z, self.pixel_embedding(z)], dim=1)
        # Flatten the spatial grid and permute from [B, E, Ho * Wo] to
        # [B, Sx, E], where Sx = Ho * Wo.
        z = z.flatten(start_dim=2)
        z = z.permute(0, 2, 1)
        return z

    def decode(self, z: Tensor) -> Tensor:
        """Cross-attend learned latent queries to the encoded image."""
        b = z.shape[0]
        queries = repeat(self.queries, "n d -> b n d", b=b)
        # query_pos_emb is expected to return per-position embeddings of
        # shape [n, d], which are broadcast across the batch here.
        pos_emb = repeat(self.query_pos_emb(queries), "n d -> b n d", b=b)
        # Concatenate the positional embedding onto the queries, doubling
        # the query dimension; the decoder must be sized accordingly.
        queries = torch.cat([queries, pos_emb], dim=-1)
        logits = self.decoder(data=z, queries=queries)
        # Permute from [B, Sy, C] to [B, C, Sy], the layout expected by
        # nn.CrossEntropyLoss.
        logits = logits.permute(0, 2, 1)
        return logits

    def forward(self, x: Tensor) -> Tensor:
        """Predict a sequence of class logits for an image batch."""
        z = self.encode(x)
        logits = self.decode(z)
        return logits
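

# A minimal shape-check sketch of how ConvPerceiver might be wired up. The
# stub modules below are hypothetical stand-ins (not this repo's real CNN
# encoder, PerceiverIO, or embedding modules); only the tensor shapes match
# the contract assumed by encode/decode above.
if __name__ == "__main__":

    class StubEncoder(nn.Module):
        """Downsample a [B, 1, H, W] image to a [B, E, H/8, W/8] feature map."""

        def __init__(self, dim: int) -> None:
            super().__init__()
            self.conv = nn.Conv2d(1, dim, kernel_size=8, stride=8)

        def forward(self, x: Tensor) -> Tensor:
            return self.conv(x)

    class StubAxialEmbedding(nn.Module):
        """Return a learned [B, E, Ho, Wo] positional grid matching the input."""

        def __init__(self, dim: int, shape: Tuple[int, int]) -> None:
            super().__init__()
            self.emb = nn.Parameter(torch.randn(1, dim, *shape))

        def forward(self, z: Tensor) -> Tensor:
            return self.emb.expand(z.shape[0], -1, -1, -1)

    class StubAbsoluteEmbedding(nn.Module):
        """Return [n, d] position embeddings; the caller broadcasts the batch."""

        def __init__(self, num_queries: int, dim: int) -> None:
            super().__init__()
            self.emb = nn.Parameter(torch.randn(num_queries, dim))

        def forward(self, queries: Tensor) -> Tensor:
            return self.emb

    class StubPerceiverIO(nn.Module):
        """Map (data, queries) to per-query logits with one cross-attention."""

        def __init__(self, data_dim: int, queries_dim: int, num_classes: int) -> None:
            super().__init__()
            self.attn = nn.MultiheadAttention(
                queries_dim, num_heads=1, kdim=data_dim, vdim=data_dim, batch_first=True
            )
            self.to_logits = nn.Linear(queries_dim, num_classes)

        def forward(self, data: Tensor, queries: Tensor) -> Tensor:
            out, _ = self.attn(queries, data, data)
            return self.to_logits(out)

    hidden_dim, queries_dim, num_queries, num_classes = 32, 16, 8, 10
    net = ConvPerceiver(
        input_dims=(1, 64, 64),
        hidden_dim=hidden_dim,
        queries_dim=queries_dim,
        num_queries=num_queries,
        num_classes=num_classes,
        pad_index=0,
        encoder=StubEncoder(hidden_dim),
        # Encoder channels double after the positional concat in encode, and
        # the query dim doubles after the concat in decode, so the decoder
        # stub is sized at 2 * hidden_dim and 2 * queries_dim.
        decoder=StubPerceiverIO(2 * hidden_dim, 2 * queries_dim, num_classes),
        max_length=num_queries,
        pixel_embedding=StubAxialEmbedding(hidden_dim, (8, 8)),
        query_pos_emb=StubAbsoluteEmbedding(num_queries, queries_dim),
    )
    logits = net(torch.randn(2, 1, 64, 64))
    assert logits.shape == (2, num_classes, num_queries)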