# Network configuration for text_recognizer.network.mammut.MaMMUT.
# Every `_target_` value is the dotted import path of the class that the
# sibling keys are passed to (Hydra-style instantiation).
_target_: text_recognizer.network.mammut.MaMMUT

# Image encoder (ViT) operating on 56 x 1024 single-channel inputs.
encoder:
  _target_: text_recognizer.network.vit.Vit
  image_height: 56
  image_width: 1024
  # Patches span the full image height and are 2 pixels wide.
  patch_height: 56
  patch_width: 2
  # Shared model width; anchored here and reused below through `*dim`.
  dim: &dim 512
  # Transformer stack nested inside the ViT.
  encoder:
    _target_: text_recognizer.network.transformer.encoder.Encoder
    dim: *dim
    heads: 16
    dim_head: 64
    ff_mult: 4
    depth: 6
    dropout_rate: 0.0
    use_rotary_emb: true
    one_kv_head: true
  # NOTE(review): `channels` and `patch_dropout` are placed on the Vit node
  # (not the inner Encoder) -- confirm against the Vit constructor signature.
  channels: 1
  patch_dropout: 0.5

# Attention module configured under the `image_attn_pool` argument.
image_attn_pool:
  _target_: text_recognizer.network.transformer.attention.Attention
  dim: *dim
  heads: 8
  causal: false
  dim_head: 64
  ff_mult: 4
  dropout_rate: 0.0
  use_flash: true
  norm_context: true
  use_rotary_emb: false
  one_kv_head: true

# Transformer decoder configuration.
decoder:
  _target_: text_recognizer.network.transformer.decoder.Decoder
  dim: *dim
  ff_mult: 4
  heads: 12
  dim_head: 64
  depth: 6
  dropout_rate: 0.0
  one_kv_head: true

# Top-level MaMMUT arguments; both widths resolve to the anchored value 512.
dim: *dim
dim_latent: *dim
num_tokens: 57
pad_index: 3
num_image_queries: 128