1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
|
"""PyTorch Lightning model for base Transformers."""
from typing import Tuple
import attr
from torch import Tensor
from text_recognizer.models.base import BaseLitModel
from text_recognizer.criterions.vqgan_loss import VQGANLoss
@attr.s(auto_attribs=True, eq=False)
class VQGANLitModel(BaseLitModel):
    """A PyTorch Lightning model for training a VQGAN.

    Trains with two alternating optimizers (Lightning's multi-optimizer
    scheme): index 0 for the generator/autoencoder objective and index 1
    for the discriminator objective.
    """

    loss_fn: VQGANLoss = attr.ib()
    # Weight for the commitment (latent) loss term.
    # NOTE(review): not referenced in this file — presumably consumed by
    # `loss_fn` or configuration elsewhere; confirm before removing.
    latent_loss_weight: float = attr.ib(default=0.25)

    def forward(self, data: Tensor) -> Tensor:
        """Forward pass through the VQGAN network.

        Returns whatever ``self.network`` produces; the steps below unpack
        it as ``(reconstructions, commitment_loss)``.
        """
        return self.network(data)

    def _compute_loss(
        self,
        data: Tensor,
        reconstructions: Tensor,
        commitment_loss: Tensor,
        optimizer_idx: int,
        stage: str,
    ) -> Tuple[Tensor, dict]:
        """Evaluate the VQGAN loss for one optimizer and stage.

        Centralizes the repeated ``loss_fn`` invocation used by the
        training/validation/test steps. Returns ``(loss, log_dict)``.
        """
        return self.loss_fn(
            data=data,
            reconstructions=reconstructions,
            commitment_loss=commitment_loss,
            # Last decoder layer's weight is used by the loss for its
            # adaptive generator/discriminator weighting.
            decoder_last_layer=self.network.decoder.decoder[-1].weight,
            optimizer_idx=optimizer_idx,
            global_step=self.global_step,
            stage=stage,
        )

    def training_step(
        self, batch: Tuple[Tensor, Tensor], batch_idx: int, optimizer_idx: int
    ) -> Tensor:
        """Training step for either the generator (0) or discriminator (1).

        Returns the loss for the active optimizer, or ``None`` for any
        other optimizer index (matching the original fall-through).
        """
        # Metric name per optimizer; labels are intentionally unused.
        loss_names = {0: "train/loss", 1: "train/discriminator_loss"}
        if optimizer_idx not in loss_names:
            # Only two optimizers are configured; anything else is a no-op.
            return None
        data, _ = batch
        reconstructions, commitment_loss = self(data)
        loss, log = self._compute_loss(
            data, reconstructions, commitment_loss, optimizer_idx, "train"
        )
        self.log(
            loss_names[optimizer_idx], loss, prog_bar=True,
        )
        self.log_dict(log, logger=True, on_step=True, on_epoch=True)
        return loss

    def validation_step(self, batch: Tuple[Tensor, Tensor], batch_idx: int) -> None:
        """Validation step: log losses for both optimizer objectives."""
        data, _ = batch
        reconstructions, commitment_loss = self(data)
        loss, log = self._compute_loss(
            data, reconstructions, commitment_loss, 0, "val"
        )
        self.log(
            "val/loss", loss, prog_bar=True,
        )
        self.log_dict(log)
        # Discriminator-side metrics; the scalar loss itself is not logged.
        _, log = self._compute_loss(
            data, reconstructions, commitment_loss, 1, "val"
        )
        self.log_dict(log)

    def test_step(self, batch: Tuple[Tensor, Tensor], batch_idx: int) -> None:
        """Test step: log metric dicts for both optimizer objectives."""
        data, _ = batch
        reconstructions, commitment_loss = self(data)
        _, log = self._compute_loss(
            data, reconstructions, commitment_loss, 0, "test"
        )
        self.log_dict(log)
        _, log = self._compute_loss(
            data, reconstructions, commitment_loss, 1, "test"
        )
        self.log_dict(log)
|