diff options
Diffstat (limited to 'text_recognizer/networks/transformer')
4 files changed, 205 insertions, 0 deletions
# Reconstructed from a collapsed diff. The diff adds these new files under
# text_recognizer/networks/transformer/axial_attention/:
#   __init__.py        (empty)
#   encoder.py         (first section below)
#   self_attention.py  (second section below)

# ---------------------------------------------------------------------------
# File: text_recognizer/networks/transformer/axial_attention/encoder.py
# ---------------------------------------------------------------------------
"""Axial transformer encoder."""

from typing import List

import attr
from torch import nn, Tensor

from text_recognizer.networks.transformer.axial_attention.self_attention import (
    SelfAttention,
)
from text_recognizer.networks.transformer.axial_attention.utils import (
    calculate_permutations,
    PermuteToForm,
    Sequential,
)
from text_recognizer.networks.transformer.norm import PreNorm


@attr.s(eq=False)
class AxialEncoder(nn.Module):
    """Axial transformer encoder.

    Builds ``depth`` repetitions of two groups of layers: one axial
    self-attention layer per permutation returned by
    ``calculate_permutations(2, dim_index)``, followed by two convolutional
    feed-forward layers. The groups are handed to ``Sequential``, which is
    responsible for how they are applied in ``forward``.

    Attributes:
        shape: Trailing sizes passed to ``nn.LayerNorm`` together with ``dim``.
        dim: Number of channels entering and leaving every layer.
        depth: Number of attention/feed-forward repetitions.
        heads: Number of attention heads.
        dim_head: Size of each attention head.
        dim_index: Index of the embedding dimension in the input tensor.
        fn: The assembled stack; built in ``_build`` and not an init argument.

    NOTE(review): ``nn.LayerNorm([dim, *shape])`` and ``nn.Conv2d`` suggest the
    input is laid out as (batch, dim, *shape) with ``dim_index == 1`` --
    confirm against the caller.
    """

    def __attrs_pre_init__(self) -> None:
        # Run nn.Module's initialiser before attrs assigns any field.
        super().__init__()

    shape: List[int] = attr.ib()
    dim: int = attr.ib()
    depth: int = attr.ib()
    heads: int = attr.ib()
    dim_head: int = attr.ib()
    dim_index: int = attr.ib()
    fn: Sequential = attr.ib(init=False)

    def __attrs_post_init__(self) -> None:
        self._build()

    def _feed_forward(self) -> nn.Sequential:
        """Returns a new feed-forward block: norm, 3x3 conv, Mish, 3x3 conv."""
        return nn.Sequential(
            nn.LayerNorm([self.dim, *self.shape]),
            nn.Conv2d(
                in_channels=self.dim,
                out_channels=4 * self.dim,
                kernel_size=3,
                padding=1,
            ),
            nn.Mish(inplace=True),
            nn.Conv2d(
                in_channels=4 * self.dim,
                out_channels=self.dim,
                kernel_size=3,
                padding=1,
            ),
        )

    def _axial_attention(self, permutation: List[int]) -> PermuteToForm:
        """Returns pre-normed self attention applied along one axis."""
        return PermuteToForm(
            permutation=permutation,
            fn=PreNorm(
                self.dim,
                SelfAttention(
                    dim=self.dim, heads=self.heads, dim_head=self.dim_head
                ),
            ),
        )

    def _build(self) -> None:
        """Assembles the attention and feed-forward layers into ``self.fn``."""
        permutations = calculate_permutations(2, self.dim_index)

        layers = nn.ModuleList([])
        for _ in range(self.depth):
            # Modules are created attention-first so parameter initialisation
            # order matches the layer order.
            attns = nn.ModuleList(
                [self._axial_attention(permutation) for permutation in permutations]
            )
            convs = nn.ModuleList([self._feed_forward(), self._feed_forward()])
            layers.append(attns)
            layers.append(convs)

        self.fn = Sequential(layers)

    def forward(self, x: Tensor) -> Tensor:
        """Applies fn to input."""
        return self.fn(x)


# ---------------------------------------------------------------------------
# File: text_recognizer/networks/transformer/axial_attention/self_attention.py
# ---------------------------------------------------------------------------
"""Axial self attention module."""

import attr
import torch
from torch import nn
from torch import Tensor


@attr.s(eq=False)
class SelfAttention(nn.Module):
    """Axial self attention module.

    Multi-head scaled dot-product self attention over a 3-D input.

    Attributes:
        dim: Size of the last dimension of the input and of the output.
        dim_head: Size of each attention head.
        heads: Number of attention heads.
        dim_hidden: ``heads * dim_head``; computed after init.
        to_q: Bias-free projection from ``dim`` to ``dim_hidden``.
        to_kv: Bias-free projection from ``dim`` to ``2 * dim_hidden``
            (keys and values in one matrix).
        to_out: Projection from ``dim_hidden`` back to ``dim``.
    """

    def __attrs_pre_init__(self) -> None:
        # Run nn.Module's initialiser before attrs assigns any field.
        super().__init__()

    dim: int = attr.ib()
    dim_head: int = attr.ib()
    heads: int = attr.ib()
    dim_hidden: int = attr.ib(init=False)
    to_q: nn.Linear = attr.ib(init=False)
    to_kv: nn.Linear = attr.ib(init=False)
    to_out: nn.Linear = attr.ib(init=False)

    def __attrs_post_init__(self) -> None:
        self.dim_hidden = self.heads * self.dim_head
        self.to_q = nn.Linear(self.dim, self.dim_hidden, bias=False)
        self.to_kv = nn.Linear(self.dim, 2 * self.dim_hidden, bias=False)
        self.to_out = nn.Linear(self.dim_hidden, self.dim)

    def forward(self, x: Tensor) -> Tensor:
        """Applies self attention.

        Args:
            x: A 3-D tensor whose last dimension has size ``dim``.

        Returns:
            A tensor with the same leading two dimensions as ``x`` and last
            dimension ``dim``.
        """
        q = self.to_q(x)
        k, v = self.to_kv(x).chunk(2, dim=-1)

        b, _, d = q.shape  # d == dim_hidden
        h, e = self.heads, self.dim_head

        def fold_heads(t: Tensor) -> Tensor:
            # (b, n, h * e) -> (b * h, n, e): heads become part of the batch.
            return t.reshape(b, -1, h, e).transpose(1, 2).reshape(b * h, -1, e)

        q, k, v = map(fold_heads, (q, k, v))

        energy = torch.einsum("bie,bje->bij", q, k) * (e ** -0.5)
        energy = energy.softmax(dim=-1)
        attn = torch.einsum("bij,bje->bie", energy, v)

        # (b * h, n, e) -> (b, n, h * e): undo the head folding.
        out = attn.reshape(b, h, -1, e).transpose(1, 2).reshape(b, -1, d)
        return self.to_out(out)
# Reconstructed from a collapsed diff (new file).
# ---------------------------------------------------------------------------
# File: text_recognizer/networks/transformer/axial_attention/utils.py
# ---------------------------------------------------------------------------
"""Helper functions for axial attention."""
from operator import itemgetter
from typing import Callable, List, Sequence, Tuple

import attr
from torch import nn, Tensor


def _map_el_ind(arr: Sequence[Sequence], ind: int) -> List:
    """Returns element ``ind`` of every item in ``arr`` as a list."""
    return list(map(itemgetter(ind), arr))


def _sort_indices(arr: Sequence[int]) -> Tuple[List[int], List[int]]:
    """Returns the sorted values of ``arr`` and their original positions.

    The second list is the argsort of ``arr``; when ``arr`` is a permutation
    of ``range(len(arr))`` it is the inverse permutation.
    """
    # Pairing each value with its position keeps ties ordered by position.
    pairs = sorted(zip(arr, range(len(arr))))
    return _map_el_ind(pairs, 0), _map_el_ind(pairs, 1)


def calculate_permutations(num_dims: int, emb_dim: int) -> List[List[int]]:
    """Returns permutations of tensor.

    One permutation is produced for every axial dimension, i.e. every
    dimension in ``1 .. num_dims + 1`` other than ``emb_dim``. Each
    permutation moves that axial dimension to second-to-last place and
    ``emb_dim`` to last place, keeping the other dimensions in ascending
    order.

    Args:
        num_dims: Number of axial dimensions; the tensor is taken to have
            ``num_dims + 2`` dimensions in total.
        emb_dim: Index of the embedding dimension. Negative values count
            from the end.

    Returns:
        A list of dimension orderings suitable for ``Tensor.permute``.

    Raises:
        ValueError: If ``emb_dim`` is not a valid index for a tensor with
            ``num_dims + 2`` dimensions.
    """
    total_dims = num_dims + 2
    if not -total_dims <= emb_dim < total_dims:
        raise ValueError(
            f"emb_dim {emb_dim} is out of range for {total_dims} dimensions"
        )
    # A negative index would never match an axial dim below and would leak
    # into the result as an extra entry, so normalise it first.
    emb_dim %= total_dims

    permutations = []
    for axial_dim in range(1, total_dims):
        if axial_dim == emb_dim:
            continue
        # Build the leading dims explicitly in ascending order rather than
        # relying on set iteration order.
        dims_rest = [
            dim for dim in range(total_dims) if dim not in (axial_dim, emb_dim)
        ]
        permutations.append([*dims_rest, axial_dim, emb_dim])

    return permutations


@attr.s(eq=False)
class PermuteToForm(nn.Module):
    """Helper class for applying axial attention.

    Attributes:
        fn: Callable applied to the permuted, flattened tensor; its output
            must have as many elements as its input, since the result is
            reshaped back to the permuted shape.
        permutation: Dimension ordering passed to ``Tensor.permute``.
        inv_permutation: Ordering that undoes ``permutation``; computed
            after init.
    """

    def __attrs_pre_init__(self) -> None:
        # Run nn.Module's initialiser before attrs assigns any field.
        super().__init__()

    fn: Callable = attr.ib()
    permutation: List[int] = attr.ib()
    inv_permutation: List[int] = attr.ib(init=False)

    def __attrs_post_init__(self) -> None:
        _, self.inv_permutation = _sort_indices(self.permutation)

    def forward(self, x: Tensor) -> Tensor:
        """Permutes tensor, applies axial attention, permutes tensor back."""
        x = x.permute(*self.permutation).contiguous()
        shape = x.shape
        *_, t, d = shape

        # Merge all but the last two dimensions.
        x = x.reshape(-1, t, d)

        # Apply attention.
        x = self.fn(x)

        # Restore original shape and permutation.
        x = x.reshape(*shape)
        x = x.permute(*self.inv_permutation).contiguous()
        return x


class Sequential(nn.Module):
    """Applies a list of grouped functions to input with residual connections.

    ``fns`` is iterated group by group; every function in a group is applied
    in order as ``x = x + fn(x)``. Groups are typically pairs, but any group
    size is accepted.
    """

    def __init__(self, fns: nn.ModuleList) -> None:
        super().__init__()
        self.fns = fns

    def forward(self, x: Tensor) -> Tensor:
        """Applies blocks to input."""
        for block in self.fns:
            for fn in block:
                x = x + fn(x)
        return x