Diffstat (limited to 'text_recognizer')
-rw-r--r--  text_recognizer/networks/transformer/decoder.py        77
-rw-r--r--  text_recognizer/networks/transformer/decoder_block.py  46
2 files changed, 50 insertions(+), 73 deletions(-)
diff --git a/text_recognizer/networks/transformer/decoder.py b/text_recognizer/networks/transformer/decoder.py
index efa1e89..c7da226 100644
--- a/text_recognizer/networks/transformer/decoder.py
+++ b/text_recognizer/networks/transformer/decoder.py
@@ -5,88 +5,19 @@ from typing import Optional, Type
from torch import Tensor, nn
from text_recognizer.networks.transformer.attention import Attention
+from text_recognizer.networks.transformer.decoder_block import DecoderBlock
from text_recognizer.networks.transformer.ff import FeedForward
-class DecoderBlock(nn.Module):
- """Decoder block."""
-
- def __init__(
- self,
- self_attn: Attention,
- norm: Type[nn.Module],
- ff: FeedForward,
- cross_attn: Optional[Attention] = None,
- ) -> None:
- super().__init__()
- self.layers = ("self_attn", "cross_attn", "ff")
- self.has_pos_emb = self_attn.rotary_embedding is not None
- self.blocks = self._build(self_attn, norm, ff, cross_attn)
-
- def _build(
- self,
- self_attn: Attention,
- norm: Type[nn.Module],
- ff: FeedForward,
- cross_attn: Optional[Attention],
- ) -> nn.ModuleDict:
- return nn.ModuleDict(
- {
- self.layers[0]: nn.ModuleList([norm, self_attn]),
- self.layers[1]: nn.ModuleList([deepcopy(norm), cross_attn]),
- self.layers[2]: nn.ModuleList([deepcopy(norm), ff]),
- }
- )
-
- def _apply_block(
- self,
- layer: str,
- x: Tensor,
- context: Optional[Tensor] = None,
- input_mask: Optional[Tensor] = None,
- context_mask: Optional[Tensor] = None,
- ) -> Tensor:
- """Applies block function."""
- residual = x
- norm_fn, layer_fn = self.blocks[layer]
- if layer == "self_attn":
- out = layer_fn(x=x, input_mask=input_mask)
- elif layer == "cross_attn":
- out = layer_fn(
- x=x, context=context, input_mask=input_mask, context_mask=context_mask
- )
- else:
- out = layer_fn(x)
- out += residual
- return norm_fn(out)
-
- def forward(
- self,
- x: Tensor,
- context: Optional[Tensor] = None,
- input_mask: Optional[Tensor] = None,
- context_mask: Optional[Tensor] = None,
- ) -> Tensor:
- """Applies decoder block on input signals."""
- for layer in self.layers:
- x = self._apply_block(
- layer=layer,
- x=x,
- context=context,
- input_mask=input_mask,
- context_mask=context_mask,
- )
- return x
-
-
class Decoder(nn.Module):
"""Decoder Network."""
- def __init__(self, depth: int, block: DecoderBlock) -> None:
+ def __init__(self, depth: int, dim: int, block: DecoderBlock) -> None:
super().__init__()
self.depth = depth
self.has_pos_emb = block.has_pos_emb
self.blocks = nn.ModuleList([deepcopy(block) for _ in range(self.depth)])
+ self.ln = nn.LayerNorm(dim)
def forward(
self,
@@ -100,4 +31,4 @@ class Decoder(nn.Module):
x = block(
x=x, context=context, input_mask=input_mask, context_mask=context_mask
)
- return x
+ return self.ln(x)
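(For orientation: after this change, Decoder takes a dim argument and applies a final nn.LayerNorm after the stack of deep-copied blocks. Below is a minimal, hypothetical sketch of just that stacking pattern; ToyDecoder and the stand-in block are illustrative names only, not the repo's API, and the real blocks are DecoderBlock instances built from Attention and FeedForward.)

# Hypothetical sketch of the Decoder pattern in this diff: depth deep-copies
# of a single block applied in sequence, then the newly added final LayerNorm.
from copy import deepcopy

import torch
from torch import Tensor, nn


class ToyDecoder(nn.Module):
    def __init__(self, depth: int, dim: int, block: nn.Module) -> None:
        super().__init__()
        self.blocks = nn.ModuleList([deepcopy(block) for _ in range(depth)])
        self.ln = nn.LayerNorm(dim)  # final norm added by this commit

    def forward(self, x: Tensor) -> Tensor:
        for block in self.blocks:
            x = block(x)
        return self.ln(x)


if __name__ == "__main__":
    stand_in_block = nn.Sequential(nn.LayerNorm(64), nn.Linear(64, 64))  # stand-in, not DecoderBlock
    decoder = ToyDecoder(depth=4, dim=64, block=stand_in_block)
    print(decoder(torch.randn(2, 16, 64)).shape)  # torch.Size([2, 16, 64])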
diff --git a/text_recognizer/networks/transformer/decoder_block.py b/text_recognizer/networks/transformer/decoder_block.py
new file mode 100644
index 0000000..e6e7fb8
--- /dev/null
+++ b/text_recognizer/networks/transformer/decoder_block.py
@@ -0,0 +1,46 @@
+"""Transformer decoder module."""
+from copy import deepcopy
+from typing import Optional, Type
+
+from torch import Tensor, nn
+
+from text_recognizer.networks.transformer.attention import Attention
+from text_recognizer.networks.transformer.ff import FeedForward
+
+
+class DecoderBlock(nn.Module):
+ """Decoder block."""
+
+ def __init__(
+ self,
+ self_attn: Attention,
+ norm: Type[nn.Module],
+ ff: FeedForward,
+ cross_attn: Optional[Attention] = None,
+ ) -> None:
+ super().__init__()
+ self.ln_attn = norm
+ self.attn = self_attn
+ self.ln_cross_attn = deepcopy(norm)
+ self.cross_attn = cross_attn
+ self.ln_ff = deepcopy(norm)
+ self.ff = ff
+ self.has_pos_emb = self.attn.rotary_embedding is not None
+
+ def forward(
+ self,
+ x: Tensor,
+ context: Optional[Tensor] = None,
+ input_mask: Optional[Tensor] = None,
+ context_mask: Optional[Tensor] = None,
+ ) -> Tensor:
+ """Applies decoder block on input signals."""
+ x = x + self.attn(self.ln_attn(x), input_mask=input_mask)
+ x += self.cross_attn(
+ x=self.ln_cross_attn(x),
+ context=context,
+ input_mask=input_mask,
+ context_mask=context_mask,
+ )
+ x += self.ff(self.ln_ff(x))
+ return x
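(The new DecoderBlock applies pre-norm residual sub-layers in the order self-attention, cross-attention, feed-forward. The following self-contained sketch mirrors that ordering with stand-in sub-layers; ToyDecoderBlock and nn.MultiheadAttention are used for illustration only and are not the repo's Attention and FeedForward classes, whose constructors are not part of this diff.)

# Hypothetical pre-norm block mirroring the forward pass in decoder_block.py:
#   x = x + self_attn(ln(x)); x = x + cross_attn(ln(x), context); x = x + ff(ln(x))
from copy import deepcopy
from typing import Optional

import torch
from torch import Tensor, nn


class ToyDecoderBlock(nn.Module):
    def __init__(self, dim: int, num_heads: int = 4) -> None:
        super().__init__()
        norm = nn.LayerNorm(dim)
        self.ln_attn = norm
        self.attn = nn.MultiheadAttention(dim, num_heads, batch_first=True)
        self.ln_cross_attn = deepcopy(norm)
        self.cross_attn = nn.MultiheadAttention(dim, num_heads, batch_first=True)
        self.ln_ff = deepcopy(norm)
        self.ff = nn.Sequential(nn.Linear(dim, 4 * dim), nn.GELU(), nn.Linear(4 * dim, dim))

    def forward(self, x: Tensor, context: Optional[Tensor] = None) -> Tensor:
        h = self.ln_attn(x)
        x = x + self.attn(h, h, h, need_weights=False)[0]
        if context is not None:  # cross-attention is optional, as in the diff
            h = self.ln_cross_attn(x)
            x = x + self.cross_attn(h, context, context, need_weights=False)[0]
        return x + self.ff(self.ln_ff(x))


if __name__ == "__main__":
    block = ToyDecoderBlock(dim=64)
    x, context = torch.randn(1, 16, 64), torch.randn(1, 32, 64)
    print(block(x, context).shape)  # torch.Size([1, 16, 64])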