1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
|
"""Axial transformer encoder."""
from typing import List, Optional, Type
from text_recognizer.networks.transformer.embeddings.axial import (
AxialPositionalEmbeddingImage,
)
from torch import nn, Tensor
from text_recognizer.networks.transformer.axial_attention.self_attention import (
SelfAttention,
)
from text_recognizer.networks.transformer.axial_attention.utils import (
calculate_permutations,
PermuteToForm,
Sequential,
)
from text_recognizer.networks.transformer.norm import PreNorm
class AxialEncoder(nn.Module):
    """Axial transformer encoder.

    Builds ``depth`` groups, each consisting of a list of axial
    self-attention modules (one per permutation returned by
    ``calculate_permutations``) followed by a list of two convolutional
    feed-forward modules. In the forward pass the output of
    ``axial_embedding`` is added to the input before the stack is applied.

    Args:
        shape: Trailing dimensions used, together with ``dim``, as the
            normalized shape ``[dim, *shape]`` of the feed-forward
            ``nn.LayerNorm``. Presumably the (height, width) of the feature
            map -- TODO confirm against callers.
        dim: Number of channels entering and leaving every sub-module.
        depth: Number of attention/feed-forward groups.
        heads: Forwarded to ``SelfAttention`` as ``heads``.
        dim_head: Forwarded to ``SelfAttention`` as ``dim_head``.
        dim_index: Forwarded to ``calculate_permutations``. Presumably the
            index of the channel axis in the input -- TODO confirm.
        axial_embedding: Module whose output is added to the input in
            ``forward``.
    """

    def __init__(
        self,
        shape: List[int],
        dim: int,
        depth: int,
        heads: int,
        dim_head: int,
        dim_index: int,
        axial_embedding: "AxialPositionalEmbeddingImage",
    ) -> None:
        super().__init__()
        self.shape = shape
        self.dim = dim
        self.depth = depth
        self.heads = heads
        self.dim_head = dim_head
        self.dim_index = dim_index
        self.axial_embedding = axial_embedding
        # Built last: _build reads the hyperparameters stored above.
        self.fn = self._build()

    def _feed_forward(self) -> nn.Sequential:
        """Returns a fresh LayerNorm -> Conv2d -> Mish -> Conv2d module.

        The first convolution expands ``dim`` channels to ``4 * dim`` and the
        second projects back to ``dim``. Both use a 3x3 kernel with padding 1.
        """
        hidden_dim = 4 * self.dim
        return nn.Sequential(
            nn.LayerNorm([self.dim, *self.shape]),
            nn.Conv2d(
                in_channels=self.dim,
                out_channels=hidden_dim,
                kernel_size=3,
                padding=1,
            ),
            # In place is safe here: it only overwrites the conv output.
            nn.Mish(inplace=True),
            nn.Conv2d(
                in_channels=hidden_dim,
                out_channels=self.dim,
                kernel_size=3,
                padding=1,
            ),
        )

    def _build(self) -> "Sequential":
        """Assembles the attention and feed-forward layers of the encoder.

        Returns:
            The project ``Sequential`` wrapping a ``ModuleList`` that holds,
            for every depth step, one ``ModuleList`` of attention modules
            followed by one ``ModuleList`` of two feed-forward modules.
        """
        # 2 is presumably the number of axial (spatial) dimensions -- TODO
        # confirm against calculate_permutations.
        permutations = calculate_permutations(2, self.dim_index)

        layers = nn.ModuleList()
        for _ in range(self.depth):
            # One pre-normalized self-attention module per permutation.
            attns = nn.ModuleList(
                [
                    PermuteToForm(
                        permutation=permutation,
                        fn=PreNorm(
                            self.dim,
                            SelfAttention(
                                dim=self.dim,
                                heads=self.heads,
                                dim_head=self.dim_head,
                            ),
                        ),
                    )
                    for permutation in permutations
                ]
            )
            convs = nn.ModuleList([self._feed_forward(), self._feed_forward()])
            # Attention and feed-forward lists are appended as separate,
            # alternating entries; Sequential receives this flat list.
            layers.append(attns)
            layers.append(convs)
        return Sequential(layers)

    def forward(self, x: Tensor) -> Tensor:
        """Adds the axial embedding to ``x`` and applies the encoder stack.

        The sum is computed out of place, so the caller's tensor is not
        modified and leaf tensors that require grad are accepted.

        Args:
            x: Input tensor passed to ``axial_embedding`` and the stack.

        Returns:
            Output of the encoder stack applied to ``x`` plus its embedding.
        """
        x = x + self.axial_embedding(x)
        return self.fn(x)
|