summary | refs | log | tree | commit | diff
path: root/training/conf/network/vit_lines.yaml
diff options
context:
space:
mode:
author Gustaf Rydholm <gustaf.rydholm@gmail.com> 2024-04-15 21:48:18 +0200
committer Gustaf Rydholm <gustaf.rydholm@gmail.com> 2024-04-15 21:48:18 +0200
commit e643e0c61ab33ce1bb8cfdebc92fc0670c82afda (patch)
tree 10876c66ed249429e42f1660231bb7ff84c10769 /training/conf/network/vit_lines.yaml
parent 9ce21f569ecac03f15f2ad669fde3dd4a512f8cc (diff)
Update configs
Diffstat (limited to 'training/conf/network/vit_lines.yaml')
-rw-r--r-- training/conf/network/vit_lines.yaml 56
1 file changed, 22 insertions, 34 deletions
diff --git a/training/conf/network/vit_lines.yaml b/training/conf/network/vit_lines.yaml
index 638dae1..a8045c2 100644
--- a/training/conf/network/vit_lines.yaml
+++ b/training/conf/network/vit_lines.yaml
@@ -1,51 +1,39 @@
-_target_: text_recognizer.network.convformer.Convformer
-image_height: 7
-image_width: 128
-patch_height: 7
-patch_width: 1
+_target_: text_recognizer.network.transformer.transformer.Transformer
dim: &dim 768
-num_classes: &num_classes 58
+num_classes: &num_classes 57
encoder:
- _target_: text_recognizer.network.transformer.encoder.Encoder
+ _target_: text_recognizer.network.transformer.vit.Vit
+ image_height: 56
+ image_width: 1024
+ patch_height: 56
+ patch_width: 8
dim: *dim
- inner_dim: 3072
- ff_mult: 4
- heads: 12
- dim_head: 64
- depth: 4
- dropout_rate: 0.1
+ encoder:
+ _target_: text_recognizer.network.transformer.encoder.Encoder
+ dim: *dim
+ heads: 16
+ dim_head: 64
+ ff_mult: 4
+ depth: 6
+ dropout_rate: 0.
+ use_rotary_emb: true
+ one_kv_head: false
+ channels: 1
+ patch_dropout: 0.4
decoder:
_target_: text_recognizer.network.transformer.decoder.Decoder
dim: *dim
- inner_dim: 3072
ff_mult: 4
heads: 12
dim_head: 64
depth: 6
- dropout_rate: 0.1
+ dropout_rate: 0.
+ one_kv_head: false
token_embedding:
_target_: "text_recognizer.network.transformer.embedding.token.\
TokenEmbedding"
num_tokens: *num_classes
dim: *dim
use_l2: true
-tie_embeddings: true
+tie_embeddings: false
pad_index: 3
-channels: 64
-stem:
- _target_: text_recognizer.network.convnext.convnext.ConvNext
- dim: 8
- dim_mults: [2, 8, 8]
- depths: [2, 2, 2]
- attn: null
- # _target_: text_recognizer.network.convnext.transformer.Transformer
- # attn:
- # _target_: text_recognizer.network.convnext.transformer.Attention
- # dim: 64
- # heads: 4
- # dim_head: 64
- # scale: 8
- # ff:
- # _target_: text_recognizer.network.convnext.transformer.FeedForward
- # dim: 64
- # mult: 4