# Configuration for the MaMMUT network: an image encoder, an attention
# pooling layer over the image features, and a decoder.
# Each `_target_` is a dotted import path; this looks like the Hydra
# `instantiate` convention -- confirm against the code that loads this file.
_target_: text_recognizer.network.mammut.MaMMUT

# Image encoder (ViT).
encoder:
  _target_: text_recognizer.network.vit.Vit
  image_height: 56
  image_width: 1024
  # Patch height equals the image height and 1024 / 8 = 128, so the image
  # would split into a single row of 128 patches -- assuming patches tile
  # the image without overlap; TODO confirm against the Vit implementation.
  patch_height: 56
  patch_width: 8
  # `&dim` anchors the shared model width; every `*dim` alias further down
  # resolves to this value, so change it here only.
  dim: &dim 512
  # Transformer stack nested inside the ViT.
  encoder:
    _target_: text_recognizer.network.transformer.encoder.Encoder
    dim: *dim
    heads: 12
    dim_head: 64
    ff_mult: 4
    depth: 6
    dropout_rate: 0.1
  channels: 1

# Attention layer used for pooling image features.
image_attn_pool:
  _target_: text_recognizer.network.transformer.attention.Attention
  dim: *dim
  heads: 8
  causal: false
  dim_head: 64
  ff_mult: 4
  dropout_rate: 0.0
  use_flash: true
  norm_context: true
  rotary_emb: null

# Decoder; its width, head count, head size, and depth match the
# encoder's inner transformer stack.
decoder:
  _target_: text_recognizer.network.transformer.decoder.Decoder
  dim: *dim
  ff_mult: 4
  heads: 12
  dim_head: 64
  depth: 6
  dropout_rate: 0.1

# Top-level MaMMUT arguments.
dim: *dim
dim_latent: *dim
num_tokens: 58
pad_index: 3
num_image_queries: 256