Diffstat (limited to 'text_recognizer/networks/transformer')
-rw-r--r--  text_recognizer/networks/transformer/attention.py  |  7
-rw-r--r--  text_recognizer/networks/transformer/layers.py     |  6
2 files changed, 6 insertions(+), 7 deletions(-)
diff --git a/text_recognizer/networks/transformer/attention.py b/text_recognizer/networks/transformer/attention.py
index 9202cce..37ce29e 100644
--- a/text_recognizer/networks/transformer/attention.py
+++ b/text_recognizer/networks/transformer/attention.py
@@ -15,7 +15,7 @@ from text_recognizer.networks.transformer.positional_encodings.rotary_embedding
)
-@attr.s
+@attr.s(eq=False)
class Attention(nn.Module):
"""Standard attention."""
@@ -31,7 +31,6 @@ class Attention(nn.Module):
dropout: nn.Dropout = attr.ib(init=False)
fc: nn.Linear = attr.ib(init=False)
qkv_fn: nn.Sequential = attr.ib(init=False)
- attn_fn: F.softmax = attr.ib(init=False, default=F.softmax)
def __attrs_post_init__(self) -> None:
"""Post init configuration."""
@@ -80,7 +79,7 @@ class Attention(nn.Module):
else k_mask
)
q_mask = rearrange(q_mask, "b i -> b () i ()")
- k_mask = rearrange(k_mask, "b i -> b () () j")
+ k_mask = rearrange(k_mask, "b j -> b () () j")
return q_mask * k_mask
return
@@ -129,7 +128,7 @@ class Attention(nn.Module):
if self.causal:
energy = self._apply_causal_mask(energy, mask, mask_value, device)
- attn = self.attn_fn(energy, dim=-1)
+ attn = F.softmax(energy, dim=-1)
attn = self.dropout(attn)
out = einsum("b h i j, b h j d -> b h i d", attn, v)
out = rearrange(out, "b h n d -> b n (h d)")
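
The mask hunk above fixes the einops pattern for the key mask: every axis has to be named on both sides of a rearrange expression, so a key mask of shape (b, j) is expanded to (b, 1, 1, j) and broadcast against the query mask of shape (b, 1, i, 1). The diff also drops the stored attn_fn field and calls F.softmax directly. A minimal sketch of the mask broadcasting, with made-up sizes rather than the repository's code:

import torch
from einops import rearrange

b, i, j = 2, 4, 6                                # batch, query length, key length (made up)
q_mask = torch.ones(b, i, dtype=torch.bool)
k_mask = torch.ones(b, j, dtype=torch.bool)

q_mask = rearrange(q_mask, "b i -> b () i ()")   # (b, 1, i, 1)
k_mask = rearrange(k_mask, "b j -> b () () j")   # (b, 1, 1, j); the old pattern named j
                                                 # only on the output side, which einops rejects
mask = q_mask * k_mask                           # broadcasts to (b, 1, i, j), shared across heads
print(mask.shape)                                # torch.Size([2, 1, 4, 6])
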
diff --git a/text_recognizer/networks/transformer/layers.py b/text_recognizer/networks/transformer/layers.py
index 66c9c50..ce443e5 100644
--- a/text_recognizer/networks/transformer/layers.py
+++ b/text_recognizer/networks/transformer/layers.py
@@ -12,7 +12,7 @@ from text_recognizer.networks.transformer.positional_encodings.rotary_embedding
from text_recognizer.networks.util import load_partial_fn
-@attr.s
+@attr.s(eq=False)
class AttentionLayers(nn.Module):
"""Standard transfomer layer."""
@@ -101,11 +101,11 @@ class AttentionLayers(nn.Module):
return x
-@attr.s(auto_attribs=True)
+@attr.s(auto_attribs=True, eq=False)
class Encoder(AttentionLayers):
causal: bool = attr.ib(default=False, init=False)
-@attr.s(auto_attribs=True)
+@attr.s(auto_attribs=True, eq=False)
class Decoder(AttentionLayers):
causal: bool = attr.ib(default=True, init=False)
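
Both files make the same underlying fix: attrs' default eq=True generates an __eq__ and sets __hash__ to None, but PyTorch expects modules to be hashable (for example, the memo set used by named_modules), so every @attr.s-decorated nn.Module here opts out with eq=False. The flag has to be repeated on Encoder and Decoder because attrs regenerates the dunder methods for each decorated class. A minimal sketch with hypothetical Base/Child classes, not the repository's code:

import attr
import torch.nn as nn


@attr.s(eq=False)                        # keep nn.Module's identity-based __hash__
class Base(nn.Module):
    causal: bool = attr.ib(default=False)

    def __attrs_post_init__(self) -> None:
        super().__init__()               # attrs' generated __init__ never calls nn.Module.__init__


@attr.s(auto_attribs=True, eq=False)     # must be repeated: each @attr.s regenerates the dunders
class Child(Base):
    causal: bool = attr.ib(default=True, init=False)


child = Child()
_ = hash(child)                          # works; with attrs' default eq=True this raises TypeError
_ = dict(child.named_modules())          # module traversal hashes modules into a memo set

Calling super().__init__() inside __attrs_post_init__ mirrors the general attrs-plus-nn.Module pattern, since the generated __init__ does not invoke nn.Module.__init__ on its own.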