diff options
author | Gustaf Rydholm <gustaf.rydholm@gmail.com> | 2024-04-15 21:48:18 +0200 |
---|---|---|
committer | Gustaf Rydholm <gustaf.rydholm@gmail.com> | 2024-04-15 21:48:18 +0200 |
commit | e643e0c61ab33ce1bb8cfdebc92fc0670c82afda (patch) | |
tree | 10876c66ed249429e42f1660231bb7ff84c10769 /training/conf/network/vit_lines.yaml | |
parent | 9ce21f569ecac03f15f2ad669fde3dd4a512f8cc (diff) |
Update configs
Diffstat (limited to 'training/conf/network/vit_lines.yaml')
-rw-r--r-- | training/conf/network/vit_lines.yaml | 56 |
1 file changed, 22 insertions, 34 deletions
```diff
diff --git a/training/conf/network/vit_lines.yaml b/training/conf/network/vit_lines.yaml
index 638dae1..a8045c2 100644
--- a/training/conf/network/vit_lines.yaml
+++ b/training/conf/network/vit_lines.yaml
@@ -1,51 +1,39 @@
-_target_: text_recognizer.network.convformer.Convformer
-image_height: 7
-image_width: 128
-patch_height: 7
-patch_width: 1
+_target_: text_recognizer.network.transformer.transformer.Transformer
 dim: &dim 768
-num_classes: &num_classes 58
+num_classes: &num_classes 57
 encoder:
-  _target_: text_recognizer.network.transformer.encoder.Encoder
+  _target_: text_recognizer.network.transformer.vit.Vit
+  image_height: 56
+  image_width: 1024
+  patch_height: 56
+  patch_width: 8
   dim: *dim
-  inner_dim: 3072
-  ff_mult: 4
-  heads: 12
-  dim_head: 64
-  depth: 4
-  dropout_rate: 0.1
+  encoder:
+    _target_: text_recognizer.network.transformer.encoder.Encoder
+    dim: *dim
+    heads: 16
+    dim_head: 64
+    ff_mult: 4
+    depth: 6
+    dropout_rate: 0.
+    use_rotary_emb: true
+    one_kv_head: false
+  channels: 1
+  patch_dropout: 0.4
 decoder:
   _target_: text_recognizer.network.transformer.decoder.Decoder
   dim: *dim
-  inner_dim: 3072
   ff_mult: 4
   heads: 12
   dim_head: 64
   depth: 6
-  dropout_rate: 0.1
+  dropout_rate: 0.
+  one_kv_head: false
 token_embedding:
   _target_: "text_recognizer.network.transformer.embedding.token.\
     TokenEmbedding"
   num_tokens: *num_classes
   dim: *dim
   use_l2: true
-tie_embeddings: true
+tie_embeddings: false
 pad_index: 3
-channels: 64
-stem:
-  _target_: text_recognizer.network.convnext.convnext.ConvNext
-  dim: 8
-  dim_mults: [2, 8, 8]
-  depths: [2, 2, 2]
-  attn: null
-  # _target_: text_recognizer.network.convnext.transformer.Transformer
-  # attn:
-  # _target_: text_recognizer.network.convnext.transformer.Attention
-  # dim: 64
-  # heads: 4
-  # dim_head: 64
-  # scale: 8
-  # ff:
-  # _target_: text_recognizer.network.convnext.transformer.FeedForward
-  # dim: 64
-  # mult: 4
```