blob: f32cb83506aff0a198f158c2b801369f1c57a381 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
|
# Configuration for text_recognizer.network.vit.VisionTransformer.
# Each `_target_` names the dotted path of the class this mapping configures
# (presumably resolved by Hydra's instantiate mechanism -- confirm with the
# loader that consumes this file).
#
# NOTE(review): nesting was reconstructed from a copy of this file that had
# lost its indentation. Keys were grouped under the nearest preceding section
# header; `tie_embeddings` and `pad_index` are assumed to be top-level
# arguments rather than part of `pos_embedding` -- verify against the
# VisionTransformer constructor.
_target_: text_recognizer.network.vit.VisionTransformer

# Input image and patch geometry. 56 / 28 = 2 and 1024 / 32 = 32, so both
# image dimensions are exact multiples of the patch dimensions.
image_height: 56
image_width: 1024
patch_height: 28
patch_width: 32

# Shared values. The anchors are reused below through `*dim` and
# `*num_classes` so the sub-modules cannot drift out of sync.
dim: &dim 1024
num_classes: &num_classes 58

# Encoder stack. heads * dim_head = 16 * 64 = 1024, which equals `dim`.
encoder:
  _target_: text_recognizer.network.transformer.encoder.Encoder
  dim: *dim
  inner_dim: 2048
  heads: 16
  dim_head: 64
  depth: 6
  dropout_rate: 0.0

# Decoder stack; same hyperparameter values as the encoder.
decoder:
  _target_: text_recognizer.network.transformer.decoder.Decoder
  dim: *dim
  inner_dim: 2048
  heads: 16
  dim_head: 64
  depth: 6
  dropout_rate: 0.0

# Token embedding; `num_tokens` is tied to `num_classes` via the alias.
token_embedding:
  # The trailing backslash is an escaped line break inside a double-quoted
  # scalar: the two lines join with no space between them.
  _target_: "text_recognizer.network.transformer.embedding.token.\
    TokenEmbedding"
  num_tokens: *num_classes
  dim: *dim
  use_l2: true

# Absolute positional embedding.
pos_embedding:
  _target_: "text_recognizer.network.transformer.embedding.absolute.\
    AbsolutePositionalEmbedding"
  dim: *dim
  # presumably the longest supported token sequence -- TODO confirm
  max_length: 89
  use_l2: true

tie_embeddings: false
# presumably the index of the padding token in the vocabulary -- TODO confirm
pad_index: 3
|