---
# Hydra instantiation config: ConvTransformer for handwritten-text recognition.
# A ConvNext CNN encodes the image; a transformer decoder emits character tokens.
_target_: text_recognizer.network.ConvTransformer
encoder:
  _target_: text_recognizer.network.image_encoder.ImageEncoder
  encoder:
    _target_: text_recognizer.network.convnext.ConvNext
    dim: 16
    dim_mults: [2, 4, 8]
    depths: [3, 3, 6]
    # One [2, 2] (H, W) downsampling per stage: 8x total spatial reduction.
    downsampling_factors: [[2, 2], [2, 2], [2, 2]]
  pixel_embedding:
    _target_: text_recognizer.network.transformer.embeddings.axial.AxialPositionalEmbeddingImage
    # &hidden_dim is aliased throughout the decoder so that encoder embedding
    # width and decoder width stay in sync from a single definition.
    dim: &hidden_dim 128
    axial_shape: [7, 128]
    axial_dims: [64, 64]  # per-axis dims; sum equals hidden_dim (128)
decoder:
  _target_: text_recognizer.network.text_decoder.TextDecoder
  hidden_dim: *hidden_dim
  num_classes: 58
  pad_index: 3
  decoder:
    _target_: text_recognizer.network.transformer.Decoder
    dim: *hidden_dim
    depth: 10
    block:
      _target_: text_recognizer.network.transformer.decoder_block.DecoderBlock
      self_attn:
        _target_: text_recognizer.network.transformer.Attention
        dim: *hidden_dim
        num_heads: 12
        dim_head: 64
        # &dropout_rate is reused by cross_attn and ff below.
        dropout_rate: &dropout_rate 0.2
        causal: true  # autoregressive: token i may not attend to tokens > i
      cross_attn:
        _target_: text_recognizer.network.transformer.Attention
        dim: *hidden_dim
        num_heads: 12
        dim_head: 64
        dropout_rate: *dropout_rate
        causal: false  # attends over the full encoder output
      norm:
        _target_: text_recognizer.network.transformer.RMSNorm
        dim: *hidden_dim
      ff:
        _target_: text_recognizer.network.transformer.FeedForward
        dim: *hidden_dim
        dim_out: null
        expansion_factor: 2
        glu: true
        dropout_rate: *dropout_rate
      rotary_embedding:
        _target_: text_recognizer.network.transformer.RotaryEmbedding
        dim: 64  # matches dim_head of the attention layers
  # NOTE(review): token_pos_embedding placed as a TextDecoder argument (sibling
  # of its inner `decoder`) — confirm against the TextDecoder signature.
  token_pos_embedding:
    _target_: text_recognizer.network.transformer.embeddings.fourier.PositionalEncoding
    dim: *hidden_dim
    dropout_rate: 0.1
    max_len: 89