# Network configuration for text_recognizer.networks.ConvPerceiver.
#
# Each `_target_` key names the class the enclosing mapping is meant to
# configure; the remaining keys in that mapping are its settings.
# NOTE(review): the nesting below was reconstructed from the `_target_`
# boundaries; the four component mappings (encoder, decoder,
# pixel_embedding, query_pos_emb) are assumed to be siblings under the
# root -- confirm against the ConvPerceiver constructor.
_target_: text_recognizer.networks.ConvPerceiver
# presumably (batch, channels, height, width) -- TODO confirm
input_dims: [1, 1, 576, 640]

# Shared values. Anchors (&name) are re-used below through aliases (*name)
# so that dependent settings stay in sync.
hidden_dim: &hidden_dim 128
num_classes: &num_classes 58
max_length: &max_length 89
num_queries: *max_length
# NOTE(review): the &queries_dim anchor is never aliased in this file, and
# decoder.queries_dim below is hard-coded to 128 rather than this value --
# confirm that the difference is intentional.
queries_dim: &queries_dim 64
pad_index: 3

encoder:
  _target_: text_recognizer.networks.EfficientNet
  arch: b0
  stochastic_dropout_rate: 0.2
  bn_momentum: 0.99
  bn_eps: 1.0e-3
  depth: 5
  # Tied to the root-level hidden_dim.
  out_channels: *hidden_dim

decoder:
  _target_: text_recognizer.networks.perceiver.PerceiverIO
  # NOTE(review): 192 equals hidden_dim (128) + pixel_embedding.dim (64);
  # presumably that relationship is required, but it is a hard-coded
  # literal here -- keep the three values in sync when changing any of them.
  dim: 192
  cross_heads: 1
  cross_head_dim: 64
  num_latents: 256
  latent_dim: 512
  latent_heads: 8
  depth: 6
  queries_dim: 128
  # Tied to the root-level num_classes.
  logits_dim: *num_classes

pixel_embedding:
  _target_: text_recognizer.networks.transformer.embeddings.axial.AxialPositionalEmbeddingImage
  dim: 64
  axial_shape: [3, 64]
  axial_dims: [32, 32]

query_pos_emb:
  _target_: text_recognizer.networks.transformer.embeddings.absolute.AbsolutePositionalEmbedding
  dim: 64
  # Tied to the root-level max_length.
  max_seq_len: *max_length
  l2norm_embed: true