_target_: text_recognizer.networks.ConvTransformer
input_dims: [1, 1, 576, 640]
hidden_dim: &hidden_dim 128
num_classes: 58
pad_index: 3
encoder:
  _target_: text_recognizer.networks.convnext.ConvNext
  dim: 16
  dim_mults: [2, 4, 8]
  depths: [3, 3, 6]
  downsampling_factors: [[2, 2], [2, 2], [2, 2]]
decoder:
  _target_: text_recognizer.networks.transformer.Decoder
  dim: *hidden_dim
  depth: 10
  block:
    _target_: text_recognizer.networks.transformer.decoder_block.DecoderBlock
    self_attn:
      _target_: text_recognizer.networks.transformer.Attention
      dim: *hidden_dim
      num_heads: 12
      dim_head: 64
      dropout_rate: &dropout_rate 0.2
      causal: true
      rotary_embedding:
        _target_: text_recognizer.networks.transformer.RotaryEmbedding
        dim: 64
    cross_attn:
      _target_: text_recognizer.networks.transformer.Attention
      dim: *hidden_dim
      num_heads: 12
      dim_head: 64
      dropout_rate: *dropout_rate
      causal: false
    norm:
      _target_: text_recognizer.networks.transformer.RMSNorm
      dim: *hidden_dim
    ff:
      _target_: text_recognizer.networks.transformer.FeedForward
      dim: *hidden_dim
      dim_out: null
      expansion_factor: 2
      glu: true
      dropout_rate: *dropout_rate
pixel_embedding:
  _target_: "text_recognizer.networks.transformer.embeddings.axial.\
    AxialPositionalEmbeddingImage"
  dim: *hidden_dim
  axial_shape: [7, 128]
  axial_dims: [64, 64]
token_pos_embedding:
  _target_: "text_recognizer.networks.transformer.embeddings.fourier.\
    PositionalEncoding"
  dim: *hidden_dim
  dropout_rate: 0.1
  max_len: 89
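
# A minimal instantiation sketch, assuming this file is consumed as a
# Hydra/OmegaConf config via hydra.utils.instantiate; the filename
# "conv_transformer.yaml" and the surrounding entrypoint are assumptions,
# not part of this file:
#
#   from hydra.utils import instantiate
#   from omegaconf import OmegaConf
#
#   cfg = OmegaConf.load("conv_transformer.yaml")  # hypothetical path
#   network = instantiate(cfg)  # recursively builds every _target_ above
#
# Note on the YAML anchors: &hidden_dim and &dropout_rate are defined once
# and reused via the *hidden_dim / *dropout_rate aliases, so changing
# hidden_dim or the self-attention dropout here propagates to the decoder,
# both attention layers, the norm, the feed-forward, and both embeddings.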