1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
|
"""A Transformer with a cnn backbone.
The network encodes an image with a convolutional backbone to a latent representation,
i.e. feature maps. A 2d positional encoding is applied to the feature maps for
spatial information. The resulting features are then sent to a transformer decoder
together with the target tokens.
TODO: Local attention for transformer.
"""
import math
from typing import Any, Dict, List, Optional, Sequence, Tuple, Type

from einops import rearrange
import torch
from torch import nn
from torch import Tensor
import torchvision

from text_recognizer.data.emnist import emnist_mapping
from text_recognizer.networks.transformer import (
    Decoder,
    DecoderLayer,
    PositionalEncoding,
    PositionalEncoding2D,
    target_padding_mask,
)
class ImageTransformer(nn.Module):
    """Image-to-sequence model: backbone features decoded by a transformer decoder.

    The backbone turns an image into feature maps, a 2d positional encoding is
    applied to them, and the flattened features act as the memory of a
    transformer decoder that predicts target tokens.
    """

    def __init__(
        self,
        input_shape: Sequence[int],
        output_shape: Sequence[int],
        backbone: nn.Module,
        mapping: Optional[List[str]] = None,
        num_decoder_layers: int = 4,
        hidden_dim: int = 256,
        num_heads: int = 4,
        expansion_dim: int = 4,
        dropout_rate: float = 0.1,
        transformer_activation: str = "glu",
    ) -> None:
        """Builds the network.

        Args:
            input_shape (Sequence[int]): Input shape; indices 1 and 2 are used as
                the maximum height and width of the 2d positional encoding.
            output_shape (Sequence[int]): Output shape; index 0 is used as the
                maximum output sequence length.
            backbone (nn.Module): Module called on the image to extract features.
            mapping (Optional[List[str]]): Index-to-token list. Must contain the
                tokens "<s>", "<e>" and "<p>". Defaults to the EMNIST mapping.
            num_decoder_layers (int): Number of decoder layers.
            hidden_dim (int): Hidden dimension of embeddings and decoder.
            num_heads (int): Number of attention heads in each decoder layer.
            expansion_dim (int): Expansion dimension passed to each decoder layer.
            dropout_rate (float): Dropout rate.
            transformer_activation (str): Activation passed to each decoder layer.
        """
        # nn.Module must be initialised before any sub-module is assigned.
        super().__init__()

        # Configure mapping
        mapping, inverse_mapping = self._configure_mapping(mapping)
        self.vocab_size = len(mapping)
        self.hidden_dim = hidden_dim
        self.max_output_length = output_shape[0]
        self.start_index = inverse_mapping["<s>"]
        self.end_index = inverse_mapping["<e>"]
        self.pad_index = inverse_mapping["<p>"]

        # Image backbone
        self.backbone = backbone
        self.latent_encoding = PositionalEncoding2D(
            hidden_dim=hidden_dim, max_h=input_shape[1], max_w=input_shape[2]
        )

        # Target token embedding
        self.trg_embedding = nn.Embedding(self.vocab_size, hidden_dim)
        self.trg_position_encoding = PositionalEncoding(hidden_dim, dropout_rate)

        # Transformer decoder
        self.decoder = Decoder(
            decoder_layer=DecoderLayer(
                hidden_dim=hidden_dim,
                num_heads=num_heads,
                expansion_dim=expansion_dim,
                dropout_rate=dropout_rate,
                activation=transformer_activation,
            ),
            num_layers=num_decoder_layers,
            norm=nn.LayerNorm(hidden_dim),
        )

        # Classification head
        self.head = nn.Linear(hidden_dim, self.vocab_size)

        # Initialize weights
        self._init_weights()

    def _init_weights(self) -> None:
        """Initialize network weights."""
        self.trg_embedding.weight.data.uniform_(-0.1, 0.1)
        self.head.bias.data.zero_()
        self.head.weight.data.uniform_(-0.1, 0.1)

        # The latent encoding is only initialised when it actually exposes a
        # learnable weight; a weight-less positional encoding is left untouched
        # instead of raising AttributeError.
        weight = getattr(self.latent_encoding, "weight", None)
        if weight is None:
            return
        nn.init.kaiming_normal_(weight.data, a=0, mode="fan_out", nonlinearity="relu")

        bias = getattr(self.latent_encoding, "bias", None)
        if bias is not None:
            _, fan_out = nn.init._calculate_fan_in_and_fan_out(weight.data)
            bound = 1 / math.sqrt(fan_out)
            # NOTE(review): normal_ takes (mean, std), so this samples with
            # mean=-bound and std=bound; uniform_(-bound, bound) may have been
            # intended. Kept as-is to preserve the existing initialisation.
            nn.init.normal_(bias, -bound, bound)

    def _configure_mapping(
        self, mapping: Optional[List[str]]
    ) -> Tuple[List[str], Dict[str, int]]:
        """Configures mapping.

        Args:
            mapping (Optional[List[str]]): Index-to-token list, or None to use
                the EMNIST mapping.

        Returns:
            Tuple[List[str], Dict[str, int]]: The index-to-token list and the
                token-to-index dictionary.
        """
        if mapping is None:
            mapping, inverse_mapping, _ = emnist_mapping()
        else:
            # Derive the inverse lookup from the user supplied mapping.
            inverse_mapping = {token: index for index, token in enumerate(mapping)}
        return mapping, inverse_mapping

    def encode(self, image: Tensor) -> Tensor:
        """Extracts image features with backbone.

        Args:
            image (Tensor): Image(s) of handwritten text.

        Returns:
            Tensor: Image features.

        Shapes:
            - image: :math: `(B, C, H, W)`
            - latent: :math: `(B, T, C)`
        """
        # Extract image features.
        latent = self.backbone(image)

        # Add 2d encoding to the feature maps.
        latent = self.latent_encoding(latent)

        # Collapse features maps height and width.
        latent = rearrange(latent, "b c h w -> b (h w) c")
        return latent

    def decode(self, memory: Tensor, trg: Tensor) -> Tensor:
        """Decodes image features with transformer decoder.

        Args:
            memory (Tensor): Image features, as returned by `encode`.
            trg (Tensor): Target token indices.

        Returns:
            Tensor: Logits over the vocabulary for every target position.
        """
        trg_mask = target_padding_mask(trg=trg, pad_index=self.pad_index)
        # Scale the embeddings by sqrt(hidden_dim) before adding positions.
        trg = self.trg_embedding(trg) * math.sqrt(self.hidden_dim)
        trg = self.trg_position_encoding(trg)
        out = self.decoder(trg=trg, memory=memory, trg_mask=trg_mask, memory_mask=None)
        logits = self.head(out)
        return logits

    def predict(self, image: Tensor) -> Tensor:
        """Transcribes text in image(s).

        Tokens are generated greedily, one position at a time, starting from
        the start token. Everything after an end token is replaced by padding.

        Args:
            image (Tensor): Image(s) of handwritten text.

        Returns:
            Tensor: Predicted token indices of shape (B, max_output_length).
        """
        bsz = image.shape[0]
        image_features = self.encode(image)

        # Start from an all-padding sequence on the same device as the image.
        output_tokens = torch.full(
            (bsz, self.max_output_length),
            self.pad_index,
            dtype=torch.long,
            device=image.device,
        )
        output_tokens[:, 0] = self.start_index

        for i in range(1, self.max_output_length):
            trg = output_tokens[:, :i]
            output = self.decode(image_features, trg)
            output = torch.argmax(output, dim=-1)
            # Targets are fed batch-first (B, i), so the newest prediction is
            # the last time step of every row (assumes the decoder keeps the
            # batch-first layout of its input).
            output_tokens[:, i] = output[:, -1]

        # Set all tokens after end token to be padding.
        for i in range(1, self.max_output_length):
            previous = output_tokens[:, i - 1]
            # Each comparison is parenthesised: `|` binds tighter than `==`.
            finished = (previous == self.end_index) | (previous == self.pad_index)
            output_tokens[finished, i] = self.pad_index

        return output_tokens
|