diff options
Diffstat (limited to 'training/conf/network')
-rw-r--r-- | training/conf/network/conv_perceiver.yaml | 37 | ||||
-rw-r--r-- | training/conf/network/vq_transformer.yaml | 65 |
2 files changed, 0 insertions, 102 deletions
diff --git a/training/conf/network/conv_perceiver.yaml b/training/conf/network/conv_perceiver.yaml deleted file mode 100644 index 2e12db9..0000000 --- a/training/conf/network/conv_perceiver.yaml +++ /dev/null @@ -1,37 +0,0 @@ -_target_: text_recognizer.networks.ConvPerceiver -input_dims: [1, 1, 576, 640] -hidden_dim: &hidden_dim 128 -num_classes: &num_classes 58 -max_length: &max_length 89 -num_queries: *max_length -queries_dim: &queries_dim 64 -pad_index: 3 -encoder: - _target_: text_recognizer.networks.EfficientNet - arch: b0 - stochastic_dropout_rate: 0.2 - bn_momentum: 0.99 - bn_eps: 1.0e-3 - depth: 5 - out_channels: *hidden_dim -decoder: - _target_: text_recognizer.networks.perceiver.PerceiverIO - dim: 192 - cross_heads: 1 - cross_head_dim: 64 - num_latents: 256 - latent_dim: 512 - latent_heads: 8 - depth: 6 - queries_dim: 128 - logits_dim: *num_classes -pixel_embedding: - _target_: text_recognizer.networks.transformer.embeddings.axial.AxialPositionalEmbeddingImage - dim: 64 - axial_shape: [3, 64] - axial_dims: [32, 32] -query_pos_emb: - _target_: text_recognizer.networks.transformer.embeddings.absolute.AbsolutePositionalEmbedding - dim: 64 - max_seq_len: *max_length - l2norm_embed: true diff --git a/training/conf/network/vq_transformer.yaml b/training/conf/network/vq_transformer.yaml deleted file mode 100644 index d62a4b7..0000000 --- a/training/conf/network/vq_transformer.yaml +++ /dev/null @@ -1,65 +0,0 @@ -_target_: text_recognizer.networks.VqTransformer -input_dims: [1, 1, 576, 640] -hidden_dim: &hidden_dim 144 -num_classes: 58 -pad_index: 3 -encoder: - _target_: text_recognizer.networks.EfficientNet - arch: b0 - stochastic_dropout_rate: 0.2 - bn_momentum: 0.99 - bn_eps: 1.0e-3 - depth: 5 - out_channels: *hidden_dim -decoder: - _target_: text_recognizer.networks.transformer.Decoder - depth: 6 - block: - _target_: text_recognizer.networks.transformer.DecoderBlock - self_attn: - _target_: text_recognizer.networks.transformer.Attention - dim: *hidden_dim - num_heads: 8 - dim_head: 64 - dropout_rate: &dropout_rate 0.4 - causal: true - rotary_embedding: - _target_: text_recognizer.networks.transformer.RotaryEmbedding - dim: 64 - cross_attn: - _target_: text_recognizer.networks.transformer.Attention - dim: *hidden_dim - num_heads: 8 - dim_head: 64 - dropout_rate: *dropout_rate - causal: false - norm: - _target_: text_recognizer.networks.transformer.RMSNorm - dim: *hidden_dim - ff: - _target_: text_recognizer.networks.transformer.FeedForward - dim: *hidden_dim - dim_out: null - expansion_factor: 2 - glu: true - dropout_rate: *dropout_rate -pixel_embedding: - _target_: text_recognizer.networks.transformer.AxialPositionalEmbedding - dim: *hidden_dim - shape: [18, 79] -quantizer: - _target_: text_recognizer.networks.quantizer.VectorQuantizer - input_dim: *hidden_dim - codebook: - _target_: text_recognizer.networks.quantizer.CosineSimilarityCodebook - dim: 16 - codebook_size: 64 - kmeans_init: true - kmeans_iters: 10 - decay: 0.8 - eps: 1.0e-5 - threshold_dead: 2 - temperature: 0.0 - commitment: 0.25 - ort_reg_weight: 10 - ort_reg_max_codes: 64 |