In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image
import torch.nn.functional as F
import torch
from torch import nn
from torchsummary import summary
from importlib.util import find_spec
# When the `text_recognizer` package cannot be located on the current import
# path, add the parent directory to `sys.path` so the imports in later cells
# can resolve it (presumably the notebook lives one level below the repo root
# -- confirm against the project layout).
if find_spec("text_recognizer") is None:
    import sys
    sys.path.append('..')

In [3]:
from text_recognizer.networks.transformer.nystromer.nystromer import Nystromer

In [14]:
from functools import partial

In [27]:
# Keyword arguments pre-filled for every Nystromer built through the factory.
nystromer_kwargs = dict(dim=512, depth=12, num_heads=8, num_landmarks=256)

# Factory: calling `efficient_transformer(...)` constructs a Nystromer with the
# arguments above already bound; extra keywords can still be passed at call time.
efficient_transformer = partial(Nystromer, **nystromer_kwargs)

In [3]:
from text_recognizer.networks.encoders.efficientnet import EfficientNet

In [4]:
en = EfficientNet()

In [9]:
summary(en, (1, 576, 640))

Layer (type:depth-idx)                   Output Shape              Param #
├─Sequential: 1-1                        [-1, 256, 18, 20]         --
|    └─ConvNorm: 2-1                     [-1, 32, 288, 320]        --
|    |    └─Sequential: 3-1              [-1, 32, 288, 320]        352
|    └─InvertedResidulaBlock: 2-2        [-1, 16, 288, 320]        --
|    |    └─Sequential: 3-2              [-1, 16, 288, 320]        1,448
|    └─InvertedResidulaBlock: 2-3        [-1, 24, 144, 160]        --
|    |    └─ConvNorm: 3-3                [-1, 96, 288, 320]        14,016
|    |    └─Sequential: 3-4              [-1, 24, 144, 160]        4,276
|    └─InvertedResidulaBlock: 2-4        [-1, 24, 144, 160]        --
|    |    └─ConvNorm: 3-5                [-1, 144, 144, 160]       31,392
|    |    └─Sequential: 3-6              [-1, 24, 144, 160]        6,966
|    └─InvertedResidulaBlock: 2-5        [-1, 40, 72, 80]          --
|    |    └─ConvNorm: 3-7                [-1, 144, 144, 160]       

Layer (type:depth-idx)                   Output Shape              Param #
├─Sequential: 1-1                        [-1, 256, 18, 20]         --
|    └─ConvNorm: 2-1                     [-1, 32, 288, 320]        --
|    |    └─Sequential: 3-1              [-1, 32, 288, 320]        352
|    └─InvertedResidulaBlock: 2-2        [-1, 16, 288, 320]        --
|    |    └─Sequential: 3-2              [-1, 16, 288, 320]        1,448
|    └─InvertedResidulaBlock: 2-3        [-1, 24, 144, 160]        --
|    |    └─ConvNorm: 3-3                [-1, 96, 288, 320]        14,016
|    |    └─Sequential: 3-4              [-1, 24, 144, 160]        4,276
|    └─InvertedResidulaBlock: 2-4        [-1, 24, 144, 160]        --
|    |    └─ConvNorm: 3-5                [-1, 144, 144, 160]       31,392
|    |    └─Sequential: 3-6              [-1, 24, 144, 160]        6,966
|    └─InvertedResidulaBlock: 2-5        [-1, 40, 72, 80]          --
|    |    └─ConvNorm: 3-7                [-1, 144, 144, 160]       

In [28]:
type(efficient_transformer)

functools.partial

In [24]:
# Bind the constructed network to its own name. Reassigning
# `efficient_transformer` here would replace the partial factory with an
# instance, so this cell could not be re-run and the later
# `efficient_transformer()` call would no longer build a new network.
nystromer = efficient_transformer(num_landmarks=256)

In [29]:
efficient_transformer()

Nystromer(
  (layers): ModuleList(
    (0): ModuleList(
      (0): PreNorm(
        (norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (fn): NystromAttention(
          (qkv_fn): Linear(in_features=512, out_features=1536, bias=False)
          (fc_out): Sequential(
            (0): Linear(in_features=512, out_features=512, bias=True)
            (1): Dropout(p=0.0, inplace=False)
          )
          (residual): Conv2d(8, 8, kernel_size=(33, 1), stride=(1, 1), padding=(16, 0), groups=8, bias=False)
        )
      )
      (1): PreNorm(
        (norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (fn): FeedForward(
          (mlp): Sequential(
            (0): GEGLU(
              (fc): Linear(in_features=512, out_features=4096, bias=True)
            )
            (1): Dropout(p=0.0, inplace=False)
            (2): Linear(in_features=2048, out_features=512, bias=True)
          )
        )
      )
    )
    (1): ModuleList(
      (0): PreNorm(
     

In [2]:
from omegaconf import OmegaConf

In [None]:
path = "../training/configs/vqvae.yaml"

In [None]:
conf = OmegaConf.load(path)

In [None]:
print(OmegaConf.to_yaml(conf))

In [None]:
from text_recognizer.networks import VQVAE

In [None]:
vae = VQVAE(**conf.network.args)

In [None]:
vae

In [None]:
# Random stand-in input of shape (2, 1, 576, 640)
# (presumably batch, channel, height, width -- confirm against the network).
datum = torch.randn(2, 1, 576, 640)

In [None]:
vae.encoder(datum)[0].shape

In [None]:
vae(datum)[0].shape

In [None]:
# NOTE(review): an earlier cell imports EfficientNet from
# `text_recognizer.networks.encoders.efficientnet`, while this cell uses
# `text_recognizer.networks.backbones.efficientnet` -- confirm which module
# path is current and keep a single import.
from text_recognizer.networks.backbones.efficientnet import EfficientNet

In [None]:
en = EfficientNet()

In [3]:
# Random stand-in input of shape (2, 1, 576, 640)
# (presumably batch, channel, height, width -- confirm against the network).
datum = torch.randn(2, 1, 576, 640)

In [4]:
# Random integer targets drawn from [0, 1000) with shape (2, 682).
trg = torch.randint(low=0, high=1000, size=(2, 682))

In [None]:
trg.shape

In [None]:
datum = torch.randn([2, 1, 224, 224])

In [None]:
en(datum).shape

In [5]:
path = "../training/configs/cnn_transformer.yaml"

In [6]:
conf = OmegaConf.load(path)

In [7]:
print(OmegaConf.to_yaml(conf))

seed: 4711
network:
  desc: Configuration of the PyTorch neural network.
  type: CNNTransformer
  args:
    encoder:
      type: EfficientNet
      args: null
    num_decoder_layers: 4
    hidden_dim: 256
    num_heads: 4
    expansion_dim: 1024
    dropout_rate: 0.1
    transformer_activation: glu
model:
  desc: Configuration of the PyTorch Lightning model.
  type: LitTransformerModel
  args:
    optimizer:
      type: MADGRAD
      args:
        lr: 0.001
        momentum: 0.9
        weight_decay: 0
        eps: 1.0e-06
    lr_scheduler:
      type: OneCycleLR
      args:
        interval: step
        max_lr: 0.001
        three_phase: true
        epochs: 512
        steps_per_epoch: 1246
    criterion:
      type: CrossEntropyLoss
      args:
        weight: None
        ignore_index: -100
        reduction: mean
    monitor: val_loss
    mapping: sentence_piece
data:
  desc: Configuration of the training/test data.
  type: IAMExtendedParagraphs
  args:
    batch_size: 16
    num

In [8]:
from text_recognizer.networks.cnn_transformer import CNNTransformer

In [9]:
# Build the network from the `network.args` section of the loaded config; the
# input and output shapes are passed explicitly here rather than read from YAML.
t = CNNTransformer(
    input_shape=(1, 576, 640),
    output_shape=(682, 1),
    **conf.network.args,
)

In [None]:
t.encode(datum).shape

In [None]:
trg.shape

In [10]:
t(datum, trg).shape

torch.Size([2, 682, 1004])

In [7]:
# Dimensions of the boolean tensor built in the following cells.
b = 16
n = 128
# Device string handed to `torch.ones` in the following cells.
device = "cpu"

In [8]:
def x():
    """Return a fresh (b, n) boolean tensor filled with True, created on `device`."""
    return torch.ones((b, n), device=device).bool()

In [10]:
x().shape

torch.Size([16, 128])

In [12]:
torch.ones((b, n), device=device).bool().shape

torch.Size([16, 128])

In [2]:
x = torch.randn(1, 1, 576, 640)

In [30]:
# Input height divided by 32: matches the 18 rows that the EfficientNet summary
# above reports for a (1, 576, 640) input.
576 // 32

18

In [31]:
# Input width divided by 32: matches the 20 columns that the EfficientNet
# summary above reports for a (1, 576, 640) input.
640 // 32

20

In [32]:
# Number of spatial positions in an 18 x 20 feature map.
18 * 20

360

In [31]:
# Keep the 144 x 160 tensor (a quarter of 576 x 640 per side) under its own
# name so it does not clobber the 576 x 640 `x` consumed by the patch
# rearrangement below; the recorded [1, 1440, 256] result corresponds to
# 36 x 40 patches of the full-size input, not to the 9 x 10 patches this
# tensor would produce.
x_small = torch.randn(1, 1, 144, 160)

In [4]:
from einops import rearrange

In [5]:
patch_size = 16
# Cut the image into non-overlapping patch_size x patch_size tiles and flatten
# each tile (together with its channels) into a single vector:
# (b, c, H, W) -> (b, number_of_tiles, values_per_tile).
patch_pattern = 'b c (h p1) (w p2) -> b (h w) (p1 p2 c)'
p = rearrange(x, patch_pattern, p1=patch_size, p2=patch_size)

In [6]:
p.shape

torch.Size([1, 1440, 256])