diff options
Diffstat (limited to 'text_recognizer/networks/transformer/axial_attention')
4 files changed, 209 insertions, 0 deletions
# diff --git a/text_recognizer/networks/transformer/axial_attention/__init__.py b/text_recognizer/networks/transformer/axial_attention/__init__.py
# new file mode 100644
# index 0000000..e69de29
# --- /dev/null
# +++ b/text_recognizer/networks/transformer/axial_attention/__init__.py
# (empty file)
#
# diff --git a/text_recognizer/networks/transformer/axial_attention/encoder.py b/text_recognizer/networks/transformer/axial_attention/encoder.py
# new file mode 100644
# index 0000000..1cadac1
# --- /dev/null
# +++ b/text_recognizer/networks/transformer/axial_attention/encoder.py
# @@ -0,0 +1,90 @@
"""Axial transformer encoder."""

from typing import List, Optional, Type
from text_recognizer.networks.transformer.embeddings.axial import (
    AxialPositionalEmbeddingImage,
)

from torch import nn, Tensor

from text_recognizer.networks.transformer.axial_attention.self_attention import (
    SelfAttention,
)
from text_recognizer.networks.transformer.axial_attention.utils import (
    calculate_permutations,
    PermuteToForm,
    Sequential,
)
from text_recognizer.networks.transformer.norm import PreNorm


class AxialEncoder(nn.Module):
    """Axial transformer encoder.

    The encoder is a stack of ``depth`` layers. Each layer contributes two
    groups to the underlying ``Sequential``: a group of axial self-attention
    modules (one per permutation returned by ``calculate_permutations``) and
    a group of two convolutional feed-forward modules.

    Args:
        shape: Trailing sizes of the input; used together with ``dim`` as the
            normalized shape ``[dim, *shape]`` of the feed-forward LayerNorm.
        dim: Number of channels; also the in/out channel count of the
            feed-forward convolutions and the attention input size.
        depth: Number of (attention group, feed-forward group) layers.
        heads: Number of attention heads.
        dim_head: Size of each attention head.
        dim_index: Index of the embedding dimension in the input tensor,
            forwarded to ``calculate_permutations``.
        axial_embedding: Module whose output is added to the input before the
            layers are applied.
    """

    def __init__(
        self,
        shape: List[int],
        dim: int,
        depth: int,
        heads: int,
        dim_head: int,
        dim_index: int,
        axial_embedding: AxialPositionalEmbeddingImage,
    ) -> None:
        super().__init__()

        self.shape = shape
        self.dim = dim
        self.depth = depth
        self.heads = heads
        self.dim_head = dim_head
        self.dim_index = dim_index
        self.axial_embedding = axial_embedding

        self.fn = self._build()

    def _feed_forward(self) -> nn.Sequential:
        """Returns a fresh LayerNorm -> Conv -> Mish -> Conv feed-forward block."""
        hidden_dim = 4 * self.dim
        return nn.Sequential(
            nn.LayerNorm([self.dim, *self.shape]),
            nn.Conv2d(
                in_channels=self.dim,
                out_channels=hidden_dim,
                kernel_size=3,
                padding=1,
            ),
            nn.Mish(inplace=True),
            nn.Conv2d(
                in_channels=hidden_dim,
                out_channels=self.dim,
                kernel_size=3,
                padding=1,
            ),
        )

    def _build(self) -> Sequential:
        """Builds the stack of attention and feed-forward groups."""
        # Two axial dimensions are assumed here (hard-coded 2), giving one
        # permutation, and therefore one attention module, per axis.
        permutations = calculate_permutations(2, self.dim_index)

        layers = nn.ModuleList([])
        for _ in range(self.depth):
            attns = nn.ModuleList(
                [
                    PermuteToForm(
                        permutation=permutation,
                        fn=PreNorm(
                            self.dim,
                            SelfAttention(
                                dim=self.dim,
                                heads=self.heads,
                                dim_head=self.dim_head,
                            ),
                        ),
                    )
                    for permutation in permutations
                ]
            )
            convs = nn.ModuleList([self._feed_forward(), self._feed_forward()])
            layers.append(attns)
            layers.append(convs)

        return Sequential(layers)

    def forward(self, x: Tensor) -> Tensor:
        """Adds the axial embedding output to the input and applies the layers."""
        # Out-of-place add: ``x += ...`` would modify the caller's tensor and
        # can fail under autograd when ``x`` is a leaf that requires grad.
        x = x + self.axial_embedding(x)
        return self.fn(x)


# diff --git a/text_recognizer/networks/transformer/axial_attention/self_attention.py b/text_recognizer/networks/transformer/axial_attention/self_attention.py
# new file mode 100644
# index 0000000..b5e4142
# --- /dev/null
# +++ b/text_recognizer/networks/transformer/axial_attention/self_attention.py
# @@ -0,0 +1,40 @@
"""Axial self attention module."""

import torch
from torch import nn
from torch import Tensor


class SelfAttention(nn.Module):
    """Axial self attention module.

    Multi-head scaled dot-product self-attention over a rank-3 input whose
    last axis has size ``dim``.

    Args:
        dim: Size of the last axis of the input and of the output.
        dim_head: Size of each attention head.
        heads: Number of attention heads.
    """

    def __init__(
        self,
        dim: int,
        dim_head: int,
        heads: int,
    ) -> None:
        super().__init__()
        self.dim_hidden = heads * dim_head
        self.heads = heads
        self.dim_head = dim_head
        self.to_q = nn.Linear(dim, self.dim_hidden, bias=False)
        # Keys and values share one projection and are split in ``forward``.
        self.to_kv = nn.Linear(dim, 2 * self.dim_hidden, bias=False)
        self.to_out = nn.Linear(self.dim_hidden, dim)

    def forward(self, x: Tensor) -> Tensor:
        """Applies self attention.

        Args:
            x: Rank-3 tensor whose last axis has size ``dim``.

        Returns:
            Tensor with the same leading two sizes as ``x`` and last axis
            of size ``dim``.
        """
        q = self.to_q(x)
        k, v = self.to_kv(x).chunk(2, dim=-1)

        b, _, d = q.shape  # d == heads * dim_head
        h, e = self.heads, self.dim_head

        def fold_heads_into_batch(t: Tensor) -> Tensor:
            # (b, n, h * e) -> (b * h, n, e)
            return t.reshape(b, -1, h, e).transpose(1, 2).reshape(b * h, -1, e)

        q, k, v = map(fold_heads_into_batch, (q, k, v))

        # Scaled dot-product scores, normalised over the key axis.
        energy = torch.einsum("bie,bje->bij", q, k) * (e ** -0.5)
        energy = energy.softmax(dim=-1)
        attn = torch.einsum("bij,bje->bie", energy, v)

        # (b * h, n, e) -> (b, n, h * e)
        out = attn.reshape(b, h, -1, e).transpose(1, 2).reshape(b, -1, d)
        return self.to_out(out)


# diff --git a/text_recognizer/networks/transformer/axial_attention/utils.py
# b/text_recognizer/networks/transformer/axial_attention/utils.py
# new file mode 100644
# index 0000000..2f5bf7e
# --- /dev/null
# +++ b/text_recognizer/networks/transformer/axial_attention/utils.py
# @@ -0,0 +1,79 @@
"""Helper functions for axial attention."""
from operator import itemgetter
from typing import Callable, Iterable, List, Sequence, Tuple

from torch import nn, Tensor


def _map_el_ind(arr: Iterable[Sequence], ind: int) -> List:
    """Returns element ``ind`` of every item in ``arr`` as a list."""
    return list(map(itemgetter(ind), arr))


def _sort_indices(arr: Sequence[int]) -> Tuple[List[int], List[int]]:
    """Returns the sorted values of ``arr`` and the indices that sort it.

    When ``arr`` is a permutation of ``range(len(arr))`` the returned indices
    are its inverse permutation.
    """
    # Sorting (value, index) pairs breaks ties between equal values by index.
    pairs = sorted((value, index) for index, value in enumerate(arr))
    return _map_el_ind(pairs, 0), _map_el_ind(pairs, 1)


def calculate_permutations(num_dims: int, emb_dim: int) -> List[List[int]]:
    """Returns one dimension permutation per axial dimension.

    The tensor is taken to have ``num_dims + 2`` dimensions. For every
    dimension from 1 upwards other than ``emb_dim``, the permutation moves
    that axial dimension and ``emb_dim`` to the last two positions (in that
    order) and keeps the remaining dimensions in ascending order in front.

    Args:
        num_dims: Number of axial dimensions.
        emb_dim: Index of the embedding dimension; negative values count
            from the end.

    Returns:
        A list of permutations, each a list of dimension indices.
    """
    total_dims = num_dims + 2
    if emb_dim < 0:
        # Normalise so the comparisons below see the real index.
        emb_dim += total_dims

    permutations = []
    for axial_dim in range(1, total_dims):
        if axial_dim == emb_dim:
            continue
        last_two_dims = [axial_dim, emb_dim]
        # Ordered comprehension rather than a set difference: set iteration
        # order is not guaranteed by the language.
        dims_rest = [dim for dim in range(total_dims) if dim not in last_two_dims]
        permutations.append([*dims_rest, *last_two_dims])

    return permutations


class PermuteToForm(nn.Module):
    """Helper class for applying axial attention.

    Permutes the input with ``permutation``, flattens every dimension except
    the last two, applies ``fn``, and restores the original layout.

    Args:
        fn: Callable applied to the flattened rank-3 tensor; its output must
            be reshapeable to the permuted input shape.
        permutation: Dimension order passed to ``Tensor.permute``.
    """

    def __init__(
        self,
        fn: Callable,
        permutation: List[int],
    ) -> None:
        super().__init__()

        self.fn = fn
        self.permutation = permutation
        # Indices that sort the permutation are its inverse.
        _, self.inv_permutation = _sort_indices(self.permutation)

    def forward(self, x: Tensor) -> Tensor:
        """Permutes tensor, applies axial attention, permutes tensor back."""
        x = x.permute(*self.permutation).contiguous()
        permuted_shape = x.shape
        *_, t, d = permuted_shape

        # Merge all but the last two dimensions into one leading dimension.
        x = x.reshape(-1, t, d)

        # Apply attention
        x = self.fn(x)

        # Restore original shape and permutation
        x = x.reshape(*permuted_shape)
        return x.permute(*self.inv_permutation).contiguous()


class Sequential(nn.Module):
    """Applies groups of functions to the input with residual connections.

    Args:
        fns: Iterable of groups (e.g. a ``ModuleList`` of ``ModuleList``);
            every member ``f`` of every group is applied as ``x = x + f(x)``.
    """

    def __init__(self, fns: nn.ModuleList) -> None:
        super().__init__()
        self.fns = fns

    def forward(self, x: Tensor) -> Tensor:
        """Applies blocks to input."""
        for group in self.fns:
            # Each group is a pair in the encoder; iterating keeps the same
            # result for pairs while also accepting other group sizes.
            for fn in group:
                x = x + fn(x)
        return x