# Configuration for text_recognizer.network.mammut.MaMMUT.
# Every `_target_` key holds the dotted path of the class that node configures.
# The shared model width is declared once as the `&dim` anchor (on the image
# encoder) and referenced everywhere else through the `*dim` alias.
_target_: text_recognizer.network.mammut.MaMMUT

# Image encoder.
encoder:
  _target_: text_recognizer.network.cvit.CVit
  image_height: 7
  image_width: 128
  patch_height: 7
  patch_width: 1
  dim: &dim 512  # anchor: aliased below as *dim
  encoder:
    _target_: text_recognizer.network.transformer.encoder.Encoder
    dim: *dim
    heads: 8
    dim_head: 64
    ff_mult: 4
    depth: 2
    dropout_rate: 0.5
    use_rotary_emb: true
    one_kv_head: true
  # NOTE(review): `stem` is nested under the CVit node here; the nesting was
  # reconstructed from a flattened source -- confirm against CVit's signature.
  stem:
    _target_: text_recognizer.network.convnext.convnext.ConvNext
    dim: 16
    dim_mults: [2, 8, 32]
    depths: [2, 2, 4]
    attn: null
  # NOTE(review): assumed to be a CVit argument rather than a ConvNext one.
  # 512 equals the stem's dim (16) times its last dim_mults entry (32);
  # presumably the stem's output channel count -- TODO confirm.
  channels: 512

# Attention module configured for pooling the image features.
image_attn_pool:
  _target_: text_recognizer.network.transformer.attention.Attention
  dim: *dim
  heads: 8
  causal: false
  dim_head: 64
  ff_mult: 4
  dropout_rate: 0.0
  use_flash: true
  norm_context: true
  use_rotary_emb: false
  one_kv_head: true

# Text decoder.
decoder:
  _target_: text_recognizer.network.transformer.decoder.Decoder
  dim: *dim
  ff_mult: 4
  heads: 8
  dim_head: 64
  depth: 6
  dropout_rate: 0.5
  one_kv_head: true

# Arguments of the top-level MaMMUT node itself.
dim: *dim
dim_latent: *dim
num_tokens: 57
pad_index: 3
num_image_queries: 64