In [5]:
%load_ext autoreload
%autoreload 2

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image
import torch.nn.functional as F
import torch
from torch import nn
from torchsummary import summary
from importlib.util import find_spec
# If the text_recognizer package cannot be located, add the parent directory ('..')
# to sys.path so the project imports below can resolve.
if find_spec("text_recognizer") is None:
 import sys
 sys.path.append('..')

from text_recognizer.networks.transformer.vit import ViT
from text_recognizer.networks.transformer.transformer import Transformer
from text_recognizer.networks.transformer.layers import Decoder
from text_recognizer.networks.transformer.nystromer.nystromer import Nystromer

The autoreload extension is already loaded. To reload it, use:
 %reload_ext autoreload


In [6]:
from text_recognizer.networks.encoders.efficientnet.efficientnet import EfficientNet

In [7]:
en = EfficientNet("b0")

In [9]:
def generate_square_subsequent_mask(size: int) -> torch.Tensor:
    """Build an additive causal mask of shape (size, size).

    Entries on and below the main diagonal are 0.0 and entries above it are
    -inf, so row ``i`` has 0.0 in columns ``0..i`` only.

    Args:
        size: Number of rows and columns of the square mask.

    Returns:
        A float32 tensor of shape (size, size).
    """
    blocked = torch.full((size, size), float("-inf"), dtype=torch.float32)
    # triu(diagonal=1) keeps the strictly-upper triangle and zeroes everything else.
    return torch.triu(blocked, diagonal=1)

In [10]:
generate_square_subsequent_mask(4)

tensor([[0., -inf, -inf, -inf],
 [0., 0., -inf, -inf],
 [0., 0., 0., -inf],
 [0., 0., 0., 0.]])

In [12]:
from torch import Tensor

In [52]:
tgt = torch.randint(0, 4, (1, 4))
tgt_mask = torch.ones_like(tgt).bool()

In [53]:
tgt_mask

tensor([[True, True, True, True]])

In [13]:
def target_padding_mask(trg: Tensor, pad_index: int) -> Tensor:
    """Combine the padding mask and the causal mask for a batch of targets.

    Args:
        trg: Target tensor indexed as (batch, length).
        pad_index: Value that marks padding positions in ``trg``.

    Returns:
        Boolean tensor of shape (batch, 1, length, length). Entry
        ``[b, 0, i, j]`` is True when ``j <= i`` and ``trg[b, j] != pad_index``.
    """
    not_padding = trg.ne(pad_index)
    seq_len = trg.shape[1]
    positions = torch.arange(seq_len, device=trg.device)
    # Lower-triangular (diagonal included): row i is True for columns j <= i.
    causal = positions.unsqueeze(0) <= positions.unsqueeze(1)
    # (batch, 1, 1, length) & (length, length) broadcasts to (batch, 1, length, length).
    return not_padding[:, None, None] & causal

In [54]:
# NOTE(review): shape (0, 4) yields a tensor with zero rows, and `t` is reassigned in the
# next cell — confirm whether this cell is still needed.
t = torch.randint(0, 6, (0, 4))

In [55]:
# Example token ids where 3 acts as the padding value in the cells below.
# torch.tensor infers an integer dtype from the literals; the legacy torch.Tensor
# constructor would silently turn these ids into floating-point values.
t = torch.tensor([[0, 0, 0, 3, 3, 3]])

In [58]:
tt = t != 3

In [59]:
tt

tensor([[ True, True, True, False, False, False]])

In [43]:
# Stack `t` on itself along dim 0 to get a two-row batch.
# NOTE: rebinding `t` to its own concatenation doubles the batch again every time this cell is re-run.
t = torch.cat((t, t))

In [44]:
t.shape

torch.Size([2, 6])

In [45]:
target_padding_mask(t, 3)

tensor([[[[ True, False, False, False, False, False],
 [ True, True, False, False, False, False],
 [ True, True, True, False, False, False],
 [ True, True, True, False, False, False],
 [ True, True, True, False, False, False],
 [ True, True, True, False, False, False]]],


 [[[ True, False, False, False, False, False],
 [ True, True, False, False, False, False],
 [ True, True, True, False, False, False],
 [ True, True, True, False, False, False],
 [ True, True, True, False, False, False],
 [ True, True, True, False, False, False]]]])

In [None]:
# target_padding_mask requires both the target tensor and the padding index;
# calling it with no arguments raises TypeError.
target_padding_mask(t, pad_index=3)

In [None]:
summary(en, (1, 224, 224));

In [None]:
torch.cuda.is_available()

In [None]:
decoder = Decoder(dim=128, depth=2, num_heads=8, ff_kwargs={}, attn_kwargs={}, cross_attend=True)

In [None]:
# Move to the GPU when one is present; fall back to the CPU so the cell does not crash without CUDA.
decoder.to("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
transformer_decoder = Transformer(num_tokens=1003, max_seq_len=451, attn_layers=decoder, emb_dim=128, emb_dropout=0.1)

In [None]:
# Move to the GPU when one is present; fall back to the CPU so the cell does not crash without CUDA.
transformer_decoder.to("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
efficient_transformer = Nystromer(
 dim = 64,
 depth = 4,
 num_heads = 8,
 num_landmarks = 64
)

In [None]:
# ViT over 576x640 inputs cut into 32x32 patches, using the Nystromer defined above
# as its transformer. Placed on the GPU when one is present, otherwise kept on the
# CPU so the cell does not crash without CUDA.
v = ViT(
    dim=64,
    image_size=(576, 640),
    patch_size=(32, 32),
    transformer=efficient_transformer,
).to("cuda" if torch.cuda.is_available() else "cpu")

In [6]:
# Random batch of four single-channel 576x640 inputs.
# Use the GPU when one is present; otherwise keep the batch on the CPU so the cell still runs.
t = torch.randn(4, 1, 576, 640).to("cuda" if torch.cuda.is_available() else "cpu")

In [7]:
# Move to the GPU when one is present; fall back to the CPU so the cell does not crash without CUDA.
en.to("cuda" if torch.cuda.is_available() else "cpu")

EfficientNet(
 (_conv_stem): Sequential(
 (0): ZeroPad2d(padding=(0, 1, 0, 1), value=0.0)
 (1): Conv2d(1, 32, kernel_size=(3, 3), stride=(2, 2), bias=False)
 (2): BatchNorm2d(32, eps=0.001, momentum=0.99, affine=True, track_running_stats=True)
 (3): Mish(inplace=True)
 )
 (_blocks): ModuleList(
 (0): MBConvBlock(
 (_depthwise): Sequential(
 (0): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), groups=32, bias=False)
 (1): BatchNorm2d(32, eps=0.001, momentum=0.99, affine=True, track_running_stats=True)
 (2): Mish(inplace=True)
 )
 (_squeeze_excite): Sequential(
 (0): Conv2d(32, 8, kernel_size=(1, 1), stride=(1, 1))
 (1): Mish(inplace=True)
 (2): Conv2d(8, 32, kernel_size=(1, 1), stride=(1, 1))
 )
 (_pointwise): Sequential(
 (0): Conv2d(32, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
 (1): BatchNorm2d(16, eps=0.001, momentum=0.99, affine=True, track_running_stats=True)
 )
 )
 (1): MBConvBlock(
 (_inverted_bottleneck): Sequential(
 (0): Conv2d(16, 96, kernel_size=(1, 1), stride=(1

In [8]:
en(t).shape

torch.Size([4, 1280, 18, 20])

In [None]:
o = v(t)

In [None]:
# Random integer ids in [0, 90) with shape (16, 690).
# Use the GPU when one is present; otherwise keep the tensor on the CPU so the cell still runs.
caption = torch.randint(0, 90, (16, 690)).to("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
o.shape

In [None]:
caption.shape

In [None]:
# Random stand-in context of shape (16, 20 * 18, 128) = (16, 360, 128).
# Use the GPU when one is present; otherwise keep the tensor on the CPU so the cell still runs.
o = torch.randn(16, 20 * 18, 128).to("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Random integer ids in [0, 1000) with shape (16, 200).
# Use the GPU when one is present; otherwise keep the tensor on the CPU so the cell still runs.
caption = torch.randint(0, 1000, (16, 200)).to("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
transformer_decoder(caption, context = o).shape # presumably (16, 200, 1003) = (batch, seq_len, num_tokens) given the inputs above — TODO confirm

In [None]:
# NOTE(review): `efficient_transformer` was bound to a Nystromer instance in an earlier cell,
# so this calls that instance with a `num_landmarks` keyword rather than constructing a new
# model, and then rebinds the name to whatever the call returns — confirm intent.
efficient_transformer = efficient_transformer(num_landmarks=256)

In [None]:
# NOTE(review): invoked with no arguments — confirm this scratch call is intended.
efficient_transformer()

In [None]:
from omegaconf import OmegaConf

In [None]:
path = "../training/configs/vqvae.yaml"

In [None]:
conf = OmegaConf.load(path)

In [None]:
print(OmegaConf.to_yaml(conf))

In [None]:
from text_recognizer.networks import VQVAE

In [None]:
vae = VQVAE(**conf.network.args)

In [None]:
vae

In [None]:
datum = torch.randn([2, 1, 576, 640])

In [None]:
vae.encoder(datum)[0].shape

In [None]:
vae(datum)[0].shape

In [None]:
datum = torch.randn([2, 1, 576, 640])

In [None]:
trg = torch.randint(0, 1000, [2, 682])

In [None]:
trg.shape

In [None]:
datum = torch.randn([2, 1, 224, 224])

In [None]:
# NOTE(review): `datum` was just re-created with a 224x224 shape, but this cell still feeds `t`
# to the encoder — confirm which input is intended.
en(t).shape

In [None]:
path = "../training/configs/cnn_transformer.yaml"

In [None]:
conf = OmegaConf.load(path)

In [None]:
print(OmegaConf.to_yaml(conf))

In [None]:
from text_recognizer.networks.cnn_transformer import CNNTransformer

In [None]:
# Build the CNNTransformer from the loaded config's network args.
# NOTE(review): this rebinds `t`, which earlier cells use as an input tensor, to the model.
t = CNNTransformer(input_shape=(1, 576, 640), output_shape=(682, 1), **conf.network.args)

In [None]:
t.encode(datum).shape

In [None]:
trg.shape

In [None]:
t(datum, trg).shape

In [None]:
b, n = 16, 128
device = "cpu"

In [None]:
x = lambda: torch.ones((b, n), device=device).bool()

In [None]:
x().shape

In [None]:
torch.ones((b, n), device=device).bool().shape

In [None]:
x = torch.randn(1, 1, 576, 640)

In [None]:
576 // 32

In [None]:
640 // 32

In [None]:
18 * 20

In [None]:
x = torch.randn(1, 1, 144, 160)

In [None]:
from einops import rearrange

In [None]:
# Cut `x` into non-overlapping patch_size x patch_size patches and flatten each patch
# (together with its channels) into one vector: (b, c, H, W) -> (b, h * w, p1 * p2 * c).
patch_size = 16
p = rearrange(
    x,
    "b c (h p1) (w p2) -> b (h w) (p1 p2 c)",
    p1=patch_size,
    p2=patch_size,
)

In [None]:
p.shape