# Extracted from git blob 638dae16adca91577613f2a401c499730764bbd7 (plain view).
---
# Configuration for the Convformer text-recognition network.
#
# Every `_target_` value is the dotted import path of the class that the
# enclosing mapping configures (presumably resolved by Hydra's `instantiate`;
# confirm against the code that loads this file).
#
# NOTE(review): nesting is required here. With all keys at one level,
# `_target_`, `dim`, `heads`, ... would be duplicate keys and most parsers
# would silently keep only the last value. The placement of `use_l2` (under
# `token_embedding`) and of `tie_embeddings`, `pad_index` and `channels` (top
# level) is assumed from key order; verify against the constructors.
_target_: text_recognizer.network.convformer.Convformer

# Input image size and patch size.
image_height: 7
image_width: 128
patch_height: 7
patch_width: 1

# Anchored so the sub-modules below reuse the same values via aliases.
dim: &dim 768
num_classes: &num_classes 58

encoder:
  _target_: text_recognizer.network.transformer.encoder.Encoder
  dim: *dim
  inner_dim: 3072
  ff_mult: 4
  heads: 12
  dim_head: 64
  depth: 4
  dropout_rate: 0.1

# Same settings as the encoder except for `depth`.
decoder:
  _target_: text_recognizer.network.transformer.decoder.Decoder
  dim: *dim
  inner_dim: 3072
  ff_mult: 4
  heads: 12
  dim_head: 64
  depth: 6
  dropout_rate: 0.1

token_embedding:
  _target_: text_recognizer.network.transformer.embedding.token.TokenEmbedding
  num_tokens: *num_classes
  dim: *dim
  use_l2: true

tie_embeddings: true
pad_index: 3
channels: 64

stem:
  _target_: text_recognizer.network.convnext.convnext.ConvNext
  dim: 8
  dim_mults: [2, 8, 8]
  depths: [2, 2, 2]
  # Attention in the stem is disabled. The commented-out mapping below is an
  # alternative value for `attn`; to enable it, drop `null` and uncomment.
  attn: null
  #   _target_: text_recognizer.network.convnext.transformer.Transformer
  #   attn:
  #     _target_: text_recognizer.network.convnext.transformer.Attention
  #     dim: 64
  #     heads: 4
  #     dim_head: 64
  #     scale: 8
  #   ff:
  #     _target_: text_recognizer.network.convnext.transformer.FeedForward
  #     dim: 64
  #     mult: 4