---
# Constructor arguments for the ConvNext network, with a nested
# TransformerBlock passed as its `attn` argument.
# NOTE(review): the `_target_` keys look like Hydra instantiate targets
# (dotted path of the class to build) - confirm against the loader.
_target_: text_recognizer.networks.convnext.ConvNext
dim: 16
# Three entries each in dim_mults, depths and downsampling_factors -
# presumably one per stage; verify against the ConvNext signature.
dim_mults: [2, 4, 8]
depths: [3, 3, 6]
downsampling_factors:
  - [2, 2]
  - [2, 2]
  - [2, 2]
attn:
  _target_: text_recognizer.networks.convnext.TransformerBlock
  attn:
    _target_: text_recognizer.networks.convnext.Attention
    # 128 equals dim * largest dim_mults entry (16 * 8); presumably it must
    # track the final stage width - TODO confirm before changing either.
    dim: 128
    heads: 4
    dim_head: 64
    scale: 8
  ff:
    _target_: text_recognizer.networks.convnext.FeedForward
    # Same width as the attention `dim` above.
    dim: 128
    mult: 4