From e643e0c61ab33ce1bb8cfdebc92fc0670c82afda Mon Sep 17 00:00:00 2001
From: Gustaf Rydholm
Date: Mon, 15 Apr 2024 21:48:18 +0200
Subject: Update configs

---
Notes (commentary only; this text sits between the "---" marker and the diff
and is not part of the change that gets applied):

  * The patch creates one new 51-line file whose root `_target_` is
    text_recognizer.network.mammut.MaMMUT, with three configured
    sub-objects: `encoder`, `image_attn_pool` and `decoder`.
  * The anchor `&dim` (512) is declared on `encoder.dim` and reused via
    `*dim` by the nested transformer encoder, the attention pool, the
    decoder, and the root-level `dim` / `dim_latent` keys.
  * NOTE(review): the nesting below was restored from a copy in which
    indentation had been collapsed. Placement of `stem` and of
    `channels: 512` as direct children of the CVit `encoder` is an
    assumption (16 * 32 = 512 matches the stem's widest stage) -- confirm
    against the repository before applying.

 training/conf/network/mammut_cvit_lines.yaml | 51 ++++++++++++++++++++++++++++
 1 file changed, 51 insertions(+)
 create mode 100644 training/conf/network/mammut_cvit_lines.yaml

(limited to 'training/conf/network/mammut_cvit_lines.yaml')

diff --git a/training/conf/network/mammut_cvit_lines.yaml b/training/conf/network/mammut_cvit_lines.yaml
new file mode 100644
index 0000000..75fcccb
--- /dev/null
+++ b/training/conf/network/mammut_cvit_lines.yaml
@@ -0,0 +1,51 @@
+_target_: text_recognizer.network.mammut.MaMMUT
+encoder:
+  _target_: text_recognizer.network.cvit.CVit
+  image_height: 7
+  image_width: 128
+  patch_height: 7
+  patch_width: 1
+  dim: &dim 512
+  encoder:
+    _target_: text_recognizer.network.transformer.encoder.Encoder
+    dim: *dim
+    heads: 8
+    dim_head: 64
+    ff_mult: 4
+    depth: 2
+    dropout_rate: 0.5
+    use_rotary_emb: true
+    one_kv_head: true
+  stem:
+    _target_: text_recognizer.network.convnext.convnext.ConvNext
+    dim: 16
+    dim_mults: [2, 8, 32]
+    depths: [2, 2, 4]
+    attn: null
+  channels: 512
+image_attn_pool:
+  _target_: text_recognizer.network.transformer.attention.Attention
+  dim: *dim
+  heads: 8
+  causal: false
+  dim_head: 64
+  ff_mult: 4
+  dropout_rate: 0.0
+  use_flash: true
+  norm_context: true
+  use_rotary_emb: false
+  one_kv_head: true
+decoder:
+  _target_: text_recognizer.network.transformer.decoder.Decoder
+  dim: *dim
+  ff_mult: 4
+  heads: 8
+  dim_head: 64
+  depth: 6
+  dropout_rate: 0.5
+  one_kv_head: true
+dim: *dim
+dim_latent: *dim
+num_tokens: 57
+pad_index: 3
+num_image_queries: 64
-- 
cgit v1.2.3-70-g09d2