# Network configuration for text_recognizer.networks.ConvPerceiver.
# Each `_target_` key holds the dotted path of the class that the surrounding
# mapping configures (Hydra-style instantiate config).
#
# NOTE(review): the block structure below was reconstructed from the key
# sequence; the repeated keys (`_target_`, `depth`, `dim`, `queries_dim`)
# cannot share one mapping, so they must live in nested sections.
# `pixel_embedding` is assumed to be a top-level sibling of `encoder` and
# `decoder` - confirm against the ConvPerceiver constructor.
_target_: text_recognizer.networks.ConvPerceiver
input_dims: [1, 1, 576, 640]

# Anchors defined here are aliased in the sub-sections below, so the shared
# dimensions only need to be changed in one place.
hidden_dim: &hidden_dim 144
num_classes: &num_classes 58
queries_dim: &queries_dim 16

max_length: 89
pad_index: 3

encoder:
  _target_: text_recognizer.networks.EfficientNet
  arch: b0
  stochastic_dropout_rate: 0.2
  bn_momentum: 0.99
  bn_eps: 1.0e-3
  depth: 5
  # Tied to the top-level hidden_dim via alias.
  out_channels: *hidden_dim

decoder:
  _target_: text_recognizer.networks.perceiver.PerceiverIO
  # Tied to the top-level hidden_dim (same value as encoder.out_channels).
  dim: *hidden_dim
  cross_heads: 1
  cross_head_dim: 64
  num_latents: 256
  latent_dim: 512
  latent_heads: 8
  depth: 6
  queries_dim: *queries_dim
  # Tied to the top-level num_classes via alias.
  logits_dim: *num_classes

pixel_embedding:
  _target_: text_recognizer.networks.transformer.AxialPositionalEmbedding
  # Tied to the top-level hidden_dim via alias.
  dim: *hidden_dim
  shape: [3, 64]