blob: 75fcccb17a4ef8755b0188ae1ea7f15380e4502f (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
|
# Model configuration for text_recognizer.network.mammut.MaMMUT.
#
# Each `_target_` key names the class that the surrounding mapping configures
# (Hydra-style instantiation is assumed - confirm against the loader).
#
# NOTE(review): this file had lost its indentation, which left duplicate keys
# (`_target_`, `encoder`, `dim`, `heads`, ...) in a single flat mapping. The
# nesting below is reconstructed so that every `_target_` heads its own
# mapping and no mapping repeats a key; please verify it against the
# constructor signatures of the target classes.
_target_: text_recognizer.network.mammut.MaMMUT

# Image encoder.
encoder:
  _target_: text_recognizer.network.cvit.CVit
  image_height: 7
  image_width: 128
  patch_height: 7
  patch_width: 1
  # Anchor for the shared model width; every `*dim` alias below resolves to it.
  dim: &dim 512
  encoder:
    _target_: text_recognizer.network.transformer.encoder.Encoder
    dim: *dim
    heads: 8
    dim_head: 64
    ff_mult: 4
    depth: 2
    dropout_rate: 0.5
    use_rotary_emb: true
    one_kv_head: true
  stem:
    _target_: text_recognizer.network.convnext.convnext.ConvNext
    dim: 16
    dim_mults: [2, 8, 32]
    depths: [2, 2, 4]
    attn: null
  # NOTE(review): placed on CVit rather than on the ConvNext stem; 512 equals
  # stem dim (16) times the last dim_mult (32), so this is presumably the
  # stem's output channel count - confirm which class takes `channels`.
  channels: 512

# NOTE(review): assumed to be a top-level MaMMUT argument rather than a CVit
# argument - confirm.
image_attn_pool:
  _target_: text_recognizer.network.transformer.attention.Attention
  dim: *dim
  heads: 8
  causal: false
  dim_head: 64
  ff_mult: 4
  dropout_rate: 0.0
  use_flash: true
  norm_context: true
  use_rotary_emb: false
  one_kv_head: true

decoder:
  _target_: text_recognizer.network.transformer.decoder.Decoder
  dim: *dim
  ff_mult: 4
  heads: 8
  dim_head: 64
  depth: 6
  dropout_rate: 0.5
  one_kv_head: true

# Top-level MaMMUT arguments. `decoder` already defines its own `dim`, so the
# keys from here on cannot belong to it.
dim: *dim
dim_latent: *dim
num_tokens: 57
pad_index: 3
num_image_queries: 64
|