path: root/text_recognizer/network/transformer/embedding
Diffstat (limited to 'text_recognizer/network/transformer/embedding')
-rw-r--r--  text_recognizer/network/transformer/embedding/__init__.py   1
-rw-r--r--  text_recognizer/network/transformer/embedding/absolute.py  28
-rw-r--r--  text_recognizer/network/transformer/embedding/l2_norm.py    9
-rw-r--r--  text_recognizer/network/transformer/embedding/sincos.py    13
-rw-r--r--  text_recognizer/network/transformer/embedding/token.py     18
5 files changed, 69 insertions, 0 deletions
diff --git a/text_recognizer/network/transformer/embedding/__init__.py b/text_recognizer/network/transformer/embedding/__init__.py
new file mode 100644
index 0000000..bb3f904
--- /dev/null
+++ b/text_recognizer/network/transformer/embedding/__init__.py
@@ -0,0 +1 @@
+"""Positional encodings for transformers."""
diff --git a/text_recognizer/network/transformer/embedding/absolute.py b/text_recognizer/network/transformer/embedding/absolute.py
new file mode 100644
index 0000000..08b2c2a
--- /dev/null
+++ b/text_recognizer/network/transformer/embedding/absolute.py
@@ -0,0 +1,28 @@
+from typing import Optional
+
+import torch
+from torch import nn, Tensor
+from text_recognizer.network.transformer.embedding.l2_norm import l2_norm
+
+
+class AbsolutePositionalEmbedding(nn.Module):
+ def __init__(self, dim: int, max_length: int, use_l2: bool = False) -> None:
+ super().__init__()
+ self.scale = dim**-0.5 if not use_l2 else 1.0
+ self.max_length = max_length
+ self.use_l2 = use_l2
+ self.to_embedding = nn.Embedding(max_length, dim)
+ if self.use_l2:
+ nn.init.normal_(self.to_embedding.weight, std=1e-5)
+
+ def forward(self, x: Tensor, pos: Optional[Tensor] = None) -> Tensor:
+ n, device = x.shape[1], x.device
+ assert (
+ n <= self.max_length
+ ), f"Sequence length {n} is greater than the maximum positional embedding {self.max_length}"
+
+ if pos is None:
+ pos = torch.arange(n, device=device)
+
+ embedding = self.to_embedding(pos) * self.scale
+ return l2_norm(embedding) if self.use_l2 else embedding
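A minimal usage sketch for AbsolutePositionalEmbedding (the dim, max_length, and batch sizes below are illustrative, not taken from this commit). The module returns one embedding row per position, so the result broadcasts over the batch when added to token embeddings:

    import torch
    from text_recognizer.network.transformer.embedding.absolute import AbsolutePositionalEmbedding

    pos_emb = AbsolutePositionalEmbedding(dim=256, max_length=512)  # illustrative sizes
    tokens = torch.randn(2, 100, 256)  # (batch, seq_len, dim)
    x = tokens + pos_emb(tokens)       # pos_emb(tokens) has shape (100, 256), broadcasts over batch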
diff --git a/text_recognizer/network/transformer/embedding/l2_norm.py b/text_recognizer/network/transformer/embedding/l2_norm.py
new file mode 100644
index 0000000..0e48bca
--- /dev/null
+++ b/text_recognizer/network/transformer/embedding/l2_norm.py
@@ -0,0 +1,9 @@
+from einops import rearrange
+import torch.nn.functional as F
+from torch import Tensor
+
+
+def l2_norm(t: Tensor, groups=1) -> Tensor:
+ t = rearrange(t, "... (g d) -> ... g d", g=groups)
+ t = F.normalize(t, p=2, dim=-1)
+ return rearrange(t, "... g d -> ... (g d)")
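l2_norm normalizes the last dimension to unit length, optionally in equal-sized groups. A quick check with illustrative shapes (not part of the diff):

    import torch
    from text_recognizer.network.transformer.embedding.l2_norm import l2_norm

    t = torch.randn(2, 8)
    print(l2_norm(t).norm(dim=-1))     # each row has unit L2 norm
    print(l2_norm(t, groups=2).shape)  # torch.Size([2, 8]); each group of 4 features normalized separately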
diff --git a/text_recognizer/network/transformer/embedding/sincos.py b/text_recognizer/network/transformer/embedding/sincos.py
new file mode 100644
index 0000000..ed6b0ab
--- /dev/null
+++ b/text_recognizer/network/transformer/embedding/sincos.py
@@ -0,0 +1,13 @@
+import torch
+
+
+def sincos_2d(h, w, dim, temperature: int = 10000, dtype=torch.float32):
+ y, x = torch.meshgrid(torch.arange(h), torch.arange(w), indexing="ij")
+ assert (dim % 4) == 0, "feature dimension must be multiple of 4 for sincos emb"
+ omega = torch.arange(dim // 4) / (dim // 4 - 1)
+ omega = 1.0 / (temperature**omega)
+
+ y = y.flatten()[:, None] * omega[None, :]
+ x = x.flatten()[:, None] * omega[None, :]
+ pe = torch.cat((x.sin(), x.cos(), y.sin(), y.cos()), dim=1)
+ return pe.type(dtype)
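sincos_2d builds fixed (non-learned) 2D sine-cosine position embeddings for a grid of patches, one row per grid cell. A sketch with illustrative sizes (an 8x16 patch grid, dim=256):

    import torch
    from text_recognizer.network.transformer.embedding.sincos import sincos_2d

    pe = sincos_2d(h=8, w=16, dim=256)  # shape (8 * 16, 256)
    patches = torch.randn(2, 128, 256)  # (batch, num_patches, dim)
    patches = patches + pe              # broadcasts over the batch dimension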
diff --git a/text_recognizer/network/transformer/embedding/token.py b/text_recognizer/network/transformer/embedding/token.py
new file mode 100644
index 0000000..1df2fd6
--- /dev/null
+++ b/text_recognizer/network/transformer/embedding/token.py
@@ -0,0 +1,18 @@
+from torch import nn, Tensor
+
+from text_recognizer.network.transformer.embedding.l2_norm import l2_norm
+
+
+class TokenEmbedding(nn.Module):
+ def __init__(self, num_tokens: int, dim: int, use_l2: bool = True) -> None:
+ super().__init__()
+ self.use_l2 = use_l2
+ self.to_embedding = nn.Embedding(num_tokens, dim)
+ if self.use_l2:
+ nn.init.normal_(self.to_embedding.weight, std=1e-5)
+ else:
+ nn.init.kaiming_normal_(self.to_embedding.weight)
+
+ def forward(self, x: Tensor) -> Tensor:
+ embedding = self.to_embedding(x)
+ return l2_norm(embedding) if self.use_l2 else embedding
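A usage sketch for TokenEmbedding with an illustrative vocabulary size (not from this commit); with use_l2=True each embedding vector comes out l2-normalized:

    import torch
    from text_recognizer.network.transformer.embedding.token import TokenEmbedding

    token_emb = TokenEmbedding(num_tokens=58, dim=256, use_l2=True)  # illustrative sizes
    ids = torch.randint(0, 58, (2, 100))  # (batch, seq_len) of token indices
    x = token_emb(ids)                    # (2, 100, 256), rows l2-normalized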