From 1d0977585f01c42e9f6280559a1a98037907a62e Mon Sep 17 00:00:00 2001 From: Gustaf Rydholm Date: Sun, 2 May 2021 13:51:15 +0200 Subject: Implemented training script with hydra --- poetry.lock | 52 ++- pyproject.toml | 3 +- text_recognizer/data/iam_synthetic_paragraphs.py | 2 +- text_recognizer/networks/__init__.py | 3 +- text_recognizer/networks/cnn_transformer.py | 364 +++++++-------- text_recognizer/networks/transformer/__init__.py | 3 +- .../networks/transformer/transformer.py | 520 ++++++++++----------- training/.gitignore | 1 + training/conf/callbacks/default.yaml | 14 + training/conf/callbacks/swa.yaml | 16 + training/conf/cnn_transformer.yaml | 90 ++++ training/conf/config.yaml | 6 + training/conf/dataset/iam_extended_paragraphs.yaml | 7 + training/conf/model/lit_vqvae.yaml | 24 + training/conf/network/vqvae.yaml | 14 + training/conf/trainer/default.yaml | 18 + training/configs/cnn_transformer.yaml | 90 ---- training/configs/vqvae.yaml | 89 ---- training/run_experiment.py | 136 ++---- 19 files changed, 725 insertions(+), 727 deletions(-) create mode 100644 training/conf/callbacks/default.yaml create mode 100644 training/conf/callbacks/swa.yaml create mode 100644 training/conf/cnn_transformer.yaml create mode 100644 training/conf/config.yaml create mode 100644 training/conf/dataset/iam_extended_paragraphs.yaml create mode 100644 training/conf/model/lit_vqvae.yaml create mode 100644 training/conf/network/vqvae.yaml create mode 100644 training/conf/trainer/default.yaml delete mode 100644 training/configs/cnn_transformer.yaml delete mode 100644 training/configs/vqvae.yaml diff --git a/poetry.lock b/poetry.lock index a8034e7..81ccaed 100644 --- a/poetry.lock +++ b/poetry.lock @@ -36,6 +36,14 @@ category = "main" optional = false python-versions = "*" +[[package]] +name = "antlr4-python3-runtime" +version = "4.8" +description = "ANTLR 4.8 runtime for Python 3.7" +category = "main" +optional = false +python-versions = "*" + [[package]] name = "appdirs" version = "1.4.4" @@ -621,6 +629,19 @@ numpy = [ {version = ">=1.19.3", markers = "python_version >= \"3.9\""}, ] +[[package]] +name = "hydra-core" +version = "1.0.6" +description = "A framework for elegantly configuring complex applications" +category = "main" +optional = false +python-versions = "*" + +[package.dependencies] +antlr4-python3-runtime = "4.8" +importlib-resources = {version = "*", markers = "python_version < \"3.9\""} +omegaconf = ">=2.0.5,<2.1" + [[package]] name = "idna" version = "2.10" @@ -637,6 +658,18 @@ category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +[[package]] +name = "importlib-resources" +version = "5.1.2" +description = "Read resources from Python packages" +category = "main" +optional = false +python-versions = ">=3.6" + +[package.extras] +docs = ["sphinx", "jaraco.packaging (>=8.2)", "rst.linker (>=1.9)"] +testing = ["pytest (>=4.6)", "pytest-checkdocs (>=1.2.3)", "pytest-flake8", "pytest-cov", "pytest-enabler", "pytest-black (>=0.3.7)", "pytest-mypy"] + [[package]] name = "ipykernel" version = "5.5.3" @@ -2086,7 +2119,7 @@ brotli = ["brotlipy (>=0.6.0)"] [[package]] name = "wandb" -version = "0.10.27" +version = "0.10.28" description = "A CLI and library for interacting with the Weights and Biases API." 
category = "main" optional = false @@ -2197,7 +2230,7 @@ multidict = ">=4.0" [metadata] lock-version = "1.1" python-versions = "^3.8" -content-hash = "07d5f14d7c55a961ce1841ecd125c0f7c83d5649cc118fcae2ed3b58347ca8c2" +content-hash = "bcc456879df9e9ec937f1aa8c339fe96ca18ce95b12b12fc10c23494829296e8" [metadata.files] absl-py = [ @@ -2247,6 +2280,9 @@ alabaster = [ {file = "alabaster-0.7.12-py2.py3-none-any.whl", hash = "sha256:446438bdcca0e05bd45ea2de1668c1d9b032e1a9154c2c259092d77031ddd359"}, {file = "alabaster-0.7.12.tar.gz", hash = "sha256:a661d72d58e6ea8a57f7a86e37d86716863ee5e92788398526d58b26a4e4dc02"}, ] +antlr4-python3-runtime = [ + {file = "antlr4-python3-runtime-4.8.tar.gz", hash = "sha256:15793f5d0512a372b4e7d2284058ad32ce7dd27126b105fb0b2245130445db33"}, +] appdirs = [ {file = "appdirs-1.4.4-py2.py3-none-any.whl", hash = "sha256:a841dacd6b99318a741b166adb07e19ee71a274450e68237b4650ca1055ab128"}, {file = "appdirs-1.4.4.tar.gz", hash = "sha256:7d5d0167b2b1ba821647616af46a749d1c653740dd0d2415100fe26e27afdf41"}, @@ -2628,6 +2664,10 @@ h5py = [ {file = "h5py-3.2.1-cp39-cp39-win_amd64.whl", hash = "sha256:7c5b5f18c96fb63399280a724734fd91e1781c6b60e385e439ad8e654a294ba4"}, {file = "h5py-3.2.1.tar.gz", hash = "sha256:89474be911bfcdb34cbf0d98b8ec48b578c27a89fdb1ae4ee7513f1ef8d9249e"}, ] +hydra-core = [ + {file = "hydra-core-1.0.6.tar.gz", hash = "sha256:be5fe119eb41ada20c82835b153b956781de7e2506670ba942f7453b2e850950"}, + {file = "hydra_core-1.0.6-py3-none-any.whl", hash = "sha256:500d4346b7afcd654276c87c15820d7e6b76c2da95ad698cceb4120d7a877b32"}, +] idna = [ {file = "idna-2.10-py2.py3-none-any.whl", hash = "sha256:b97d804b1e9b523befed77c48dacec60e6dcb0b5391d57af6a65a312a90648c0"}, {file = "idna-2.10.tar.gz", hash = "sha256:b307872f855b18632ce0c21c5e45be78c0ea7ae4c15c828c20788b26921eb3f6"}, @@ -2636,6 +2676,10 @@ imagesize = [ {file = "imagesize-1.2.0-py2.py3-none-any.whl", hash = "sha256:6965f19a6a2039c7d48bca7dba2473069ff854c36ae6f19d2cde309d998228a1"}, {file = "imagesize-1.2.0.tar.gz", hash = "sha256:b1f6b5a4eab1f73479a50fb79fcf729514a900c341d8503d62a62dbc4127a2b1"}, ] +importlib-resources = [ + {file = "importlib_resources-5.1.2-py3-none-any.whl", hash = "sha256:ebab3efe74d83b04d6bf5cd9a17f0c5c93e60fb60f30c90f56265fce4682a469"}, + {file = "importlib_resources-5.1.2.tar.gz", hash = "sha256:642586fc4740bd1cad7690f836b3321309402b20b332529f25617ff18e8e1370"}, +] ipykernel = [ {file = "ipykernel-5.5.3-py3-none-any.whl", hash = "sha256:21abd584543759e49010975a4621603b3cf871b1039cb3879a14094717692614"}, {file = "ipykernel-5.5.3.tar.gz", hash = "sha256:a682e4f7affd86d9ce9b699d21bcab6d5ec9fbb2bfcb194f2706973b252bc509"}, @@ -3635,8 +3679,8 @@ urllib3 = [ {file = "urllib3-1.26.4.tar.gz", hash = "sha256:e7b021f7241115872f92f43c6508082facffbd1c048e3c6e2bb9c2a157e28937"}, ] wandb = [ - {file = "wandb-0.10.27-py2.py3-none-any.whl", hash = "sha256:f591bb9d5c402ec5c12bd823db913d49ac31e4068f2c35c50b6f13e4fcd717b4"}, - {file = "wandb-0.10.27.tar.gz", hash = "sha256:1b6d3bbfd644183bbd79a02ff9e57e65a8a4f7c9b770af0d3c4f961ae6d7fc73"}, + {file = "wandb-0.10.28-py2.py3-none-any.whl", hash = "sha256:609f2605ec82f846490a2bdd9d01fa947b1892a76133cbb80c41501e5dfda763"}, + {file = "wandb-0.10.28.tar.gz", hash = "sha256:b48aa55f147717e197d38b0dd9d9ef3662efe54fefc0be2909f93e8a5c0cc71b"}, ] wcwidth = [ {file = "wcwidth-0.2.5-py2.py3-none-any.whl", hash = "sha256:beb4802a9cebb9144e99086eff703a642a13d6a0052920003a230f3294bbe784"}, diff --git a/pyproject.toml b/pyproject.toml index f9bce95..6f050c9 100644 --- 
a/pyproject.toml +++ b/pyproject.toml @@ -32,7 +32,7 @@ nltk = "^3.5" torch-summary = "^1.4.2" defusedxml = "^0.6.0" omegaconf = "^2.0.2" -wandb = "^0.10.27" +wandb = "^0.10.28" einops = "^0.3.0" gtn = "^0.0.0" sentencepiece = "^0.1.95" @@ -41,6 +41,7 @@ Pillow = "^8.1.2" madgrad = "^1.0" editdistance = "^0.5.3" torchmetrics = "^0.2.0" +hydra-core = "^1.0.6" [tool.poetry.dev-dependencies] pytest = "^5.4.2" diff --git a/text_recognizer/data/iam_synthetic_paragraphs.py b/text_recognizer/data/iam_synthetic_paragraphs.py index 78e6c05..ad6fa25 100644 --- a/text_recognizer/data/iam_synthetic_paragraphs.py +++ b/text_recognizer/data/iam_synthetic_paragraphs.py @@ -76,7 +76,7 @@ class IAMSyntheticParagraphs(IAMParagraphs): def setup(self, stage: str = None) -> None: """Loading synthetic dataset.""" - logger.info(f"IAM Synthetic dataset steup for stage {stage}") + logger.info(f"IAM Synthetic dataset setup for stage {stage}...") if stage == "fit" or stage is None: line_crops, line_labels = load_line_crops_and_labels( diff --git a/text_recognizer/networks/__init__.py b/text_recognizer/networks/__init__.py index a9117f8..d1ebf1a 100644 --- a/text_recognizer/networks/__init__.py +++ b/text_recognizer/networks/__init__.py @@ -1,4 +1,5 @@ """Network modules""" from .encoders import EfficientNet from .vqvae import VQVAE -from .cnn_transformer import CNNTransformer + +# from .cnn_transformer import CNNTransformer diff --git a/text_recognizer/networks/cnn_transformer.py b/text_recognizer/networks/cnn_transformer.py index d42c29d..80798e1 100644 --- a/text_recognizer/networks/cnn_transformer.py +++ b/text_recognizer/networks/cnn_transformer.py @@ -1,182 +1,182 @@ -"""A Transformer with a cnn backbone. - -The network encodes a image with a convolutional backbone to a latent representation, -i.e. feature maps. A 2d positional encoding is applied to the feature maps for -spatial information. The resulting feature are then set to a transformer decoder -together with the target tokens. - -TODO: Local attention for lower layer in attention. 
- -""" -import importlib -import math -from typing import Dict, Optional, Union, Sequence, Type - -from einops import rearrange -from omegaconf import DictConfig, OmegaConf -import torch -from torch import nn -from torch import Tensor - -from text_recognizer.data.emnist import NUM_SPECIAL_TOKENS -from text_recognizer.networks.transformer import ( - Decoder, - DecoderLayer, - PositionalEncoding, - PositionalEncoding2D, - target_padding_mask, -) - -NUM_WORD_PIECES = 1000 - - -class CNNTransformer(nn.Module): - def __init__( - self, - input_dim: Sequence[int], - output_dims: Sequence[int], - encoder: Union[DictConfig, Dict], - vocab_size: Optional[int] = None, - num_decoder_layers: int = 4, - hidden_dim: int = 256, - num_heads: int = 4, - expansion_dim: int = 1024, - dropout_rate: float = 0.1, - transformer_activation: str = "glu", - *args, - **kwargs, - ) -> None: - super().__init__() - self.vocab_size = ( - NUM_WORD_PIECES + NUM_SPECIAL_TOKENS if vocab_size is None else vocab_size - ) - self.pad_index = 3 # TODO: fix me - self.hidden_dim = hidden_dim - self.max_output_length = output_dims[0] - - # Image backbone - self.encoder = self._configure_encoder(encoder) - self.encoder_proj = nn.Conv2d(256, hidden_dim, kernel_size=1) - self.feature_map_encoding = PositionalEncoding2D( - hidden_dim=hidden_dim, max_h=input_dim[1], max_w=input_dim[2] - ) - - # Target token embedding - self.trg_embedding = nn.Embedding(self.vocab_size, hidden_dim) - self.trg_position_encoding = PositionalEncoding( - hidden_dim, dropout_rate, max_len=output_dims[0] - ) - - # Transformer decoder - self.decoder = Decoder( - decoder_layer=DecoderLayer( - hidden_dim=hidden_dim, - num_heads=num_heads, - expansion_dim=expansion_dim, - dropout_rate=dropout_rate, - activation=transformer_activation, - ), - num_layers=num_decoder_layers, - norm=nn.LayerNorm(hidden_dim), - ) - - # Classification head - self.head = nn.Linear(hidden_dim, self.vocab_size) - - # Initialize weights - self._init_weights() - - def _init_weights(self) -> None: - """Initialize network weights.""" - self.trg_embedding.weight.data.uniform_(-0.1, 0.1) - self.head.bias.data.zero_() - self.head.weight.data.uniform_(-0.1, 0.1) - - nn.init.kaiming_normal_( - self.encoder_proj.weight.data, - a=0, - mode="fan_out", - nonlinearity="relu", - ) - if self.encoder_proj.bias is not None: - _, fan_out = nn.init._calculate_fan_in_and_fan_out( - self.encoder_proj.weight.data - ) - bound = 1 / math.sqrt(fan_out) - nn.init.normal_(self.encoder_proj.bias, -bound, bound) - - @staticmethod - def _configure_encoder(encoder: Union[DictConfig, Dict]) -> Type[nn.Module]: - encoder = OmegaConf.create(encoder) - args = encoder.args or {} - network_module = importlib.import_module("text_recognizer.networks") - encoder_class = getattr(network_module, encoder.type) - return encoder_class(**args) - - def encode(self, image: Tensor) -> Tensor: - """Extracts image features with backbone. - - Args: - image (Tensor): Image(s) of handwritten text. - - Retuns: - Tensor: Image features. - - Shapes: - - image: :math: `(B, C, H, W)` - - latent: :math: `(B, T, C)` - - """ - # Extract image features. - image_features = self.encoder(image) - image_features = self.encoder_proj(image_features) - - # Add 2d encoding to the feature maps. - image_features = self.feature_map_encoding(image_features) - - # Collapse features maps height and width. 
- image_features = rearrange(image_features, "b c h w -> b (h w) c") - return image_features - - def decode(self, memory: Tensor, trg: Tensor) -> Tensor: - """Decodes image features with transformer decoder.""" - trg_mask = target_padding_mask(trg=trg, pad_index=self.pad_index) - trg = self.trg_embedding(trg) * math.sqrt(self.hidden_dim) - trg = rearrange(trg, "b t d -> t b d") - trg = self.trg_position_encoding(trg) - trg = rearrange(trg, "t b d -> b t d") - out = self.decoder(trg=trg, memory=memory, trg_mask=trg_mask, memory_mask=None) - logits = self.head(out) - return logits - - def forward(self, image: Tensor, trg: Tensor) -> Tensor: - image_features = self.encode(image) - output = self.decode(image_features, trg) - output = rearrange(output, "b t c -> b c t") - return output - - def predict(self, image: Tensor) -> Tensor: - """Transcribes text in image(s).""" - bsz = image.shape[0] - image_features = self.encode(image) - - output_tokens = ( - (torch.ones((bsz, self.max_output_length)) * self.pad_index) - .type_as(image) - .long() - ) - output_tokens[:, 0] = self.start_index - for i in range(1, self.max_output_length): - trg = output_tokens[:, :i] - output = self.decode(image_features, trg) - output = torch.argmax(output, dim=-1) - output_tokens[:, i] = output[-1:] - - # Set all tokens after end token to be padding. - for i in range(1, self.max_output_length): - indices = output_tokens[:, i - 1] == self.end_index | ( - output_tokens[:, i - 1] == self.pad_index - ) - output_tokens[indices, i] = self.pad_index - - return output_tokens +# """A Transformer with a cnn backbone. +# +# The network encodes a image with a convolutional backbone to a latent representation, +# i.e. feature maps. A 2d positional encoding is applied to the feature maps for +# spatial information. The resulting feature are then set to a transformer decoder +# together with the target tokens. +# +# TODO: Local attention for lower layer in attention. 
+# +# """ +# import importlib +# import math +# from typing import Dict, Optional, Union, Sequence, Type +# +# from einops import rearrange +# from omegaconf import DictConfig, OmegaConf +# import torch +# from torch import nn +# from torch import Tensor +# +# from text_recognizer.data.emnist import NUM_SPECIAL_TOKENS +# from text_recognizer.networks.transformer import ( +# Decoder, +# DecoderLayer, +# PositionalEncoding, +# PositionalEncoding2D, +# target_padding_mask, +# ) +# +# NUM_WORD_PIECES = 1000 +# +# +# class CNNTransformer(nn.Module): +# def __init__( +# self, +# input_dim: Sequence[int], +# output_dims: Sequence[int], +# encoder: Union[DictConfig, Dict], +# vocab_size: Optional[int] = None, +# num_decoder_layers: int = 4, +# hidden_dim: int = 256, +# num_heads: int = 4, +# expansion_dim: int = 1024, +# dropout_rate: float = 0.1, +# transformer_activation: str = "glu", +# *args, +# **kwargs, +# ) -> None: +# super().__init__() +# self.vocab_size = ( +# NUM_WORD_PIECES + NUM_SPECIAL_TOKENS if vocab_size is None else vocab_size +# ) +# self.pad_index = 3 # TODO: fix me +# self.hidden_dim = hidden_dim +# self.max_output_length = output_dims[0] +# +# # Image backbone +# self.encoder = self._configure_encoder(encoder) +# self.encoder_proj = nn.Conv2d(256, hidden_dim, kernel_size=1) +# self.feature_map_encoding = PositionalEncoding2D( +# hidden_dim=hidden_dim, max_h=input_dim[1], max_w=input_dim[2] +# ) +# +# # Target token embedding +# self.trg_embedding = nn.Embedding(self.vocab_size, hidden_dim) +# self.trg_position_encoding = PositionalEncoding( +# hidden_dim, dropout_rate, max_len=output_dims[0] +# ) +# +# # Transformer decoder +# self.decoder = Decoder( +# decoder_layer=DecoderLayer( +# hidden_dim=hidden_dim, +# num_heads=num_heads, +# expansion_dim=expansion_dim, +# dropout_rate=dropout_rate, +# activation=transformer_activation, +# ), +# num_layers=num_decoder_layers, +# norm=nn.LayerNorm(hidden_dim), +# ) +# +# # Classification head +# self.head = nn.Linear(hidden_dim, self.vocab_size) +# +# # Initialize weights +# self._init_weights() +# +# def _init_weights(self) -> None: +# """Initialize network weights.""" +# self.trg_embedding.weight.data.uniform_(-0.1, 0.1) +# self.head.bias.data.zero_() +# self.head.weight.data.uniform_(-0.1, 0.1) +# +# nn.init.kaiming_normal_( +# self.encoder_proj.weight.data, +# a=0, +# mode="fan_out", +# nonlinearity="relu", +# ) +# if self.encoder_proj.bias is not None: +# _, fan_out = nn.init._calculate_fan_in_and_fan_out( +# self.encoder_proj.weight.data +# ) +# bound = 1 / math.sqrt(fan_out) +# nn.init.normal_(self.encoder_proj.bias, -bound, bound) +# +# @staticmethod +# def _configure_encoder(encoder: Union[DictConfig, Dict]) -> Type[nn.Module]: +# encoder = OmegaConf.create(encoder) +# args = encoder.args or {} +# network_module = importlib.import_module("text_recognizer.networks") +# encoder_class = getattr(network_module, encoder.type) +# return encoder_class(**args) +# +# def encode(self, image: Tensor) -> Tensor: +# """Extracts image features with backbone. +# +# Args: +# image (Tensor): Image(s) of handwritten text. +# +# Retuns: +# Tensor: Image features. +# +# Shapes: +# - image: :math: `(B, C, H, W)` +# - latent: :math: `(B, T, C)` +# +# """ +# # Extract image features. +# image_features = self.encoder(image) +# image_features = self.encoder_proj(image_features) +# +# # Add 2d encoding to the feature maps. +# image_features = self.feature_map_encoding(image_features) +# +# # Collapse features maps height and width. 
+# image_features = rearrange(image_features, "b c h w -> b (h w) c") +# return image_features +# +# def decode(self, memory: Tensor, trg: Tensor) -> Tensor: +# """Decodes image features with transformer decoder.""" +# trg_mask = target_padding_mask(trg=trg, pad_index=self.pad_index) +# trg = self.trg_embedding(trg) * math.sqrt(self.hidden_dim) +# trg = rearrange(trg, "b t d -> t b d") +# trg = self.trg_position_encoding(trg) +# trg = rearrange(trg, "t b d -> b t d") +# out = self.decoder(trg=trg, memory=memory, trg_mask=trg_mask, memory_mask=None) +# logits = self.head(out) +# return logits +# +# def forward(self, image: Tensor, trg: Tensor) -> Tensor: +# image_features = self.encode(image) +# output = self.decode(image_features, trg) +# output = rearrange(output, "b t c -> b c t") +# return output +# +# def predict(self, image: Tensor) -> Tensor: +# """Transcribes text in image(s).""" +# bsz = image.shape[0] +# image_features = self.encode(image) +# +# output_tokens = ( +# (torch.ones((bsz, self.max_output_length)) * self.pad_index) +# .type_as(image) +# .long() +# ) +# output_tokens[:, 0] = self.start_index +# for i in range(1, self.max_output_length): +# trg = output_tokens[:, :i] +# output = self.decode(image_features, trg) +# output = torch.argmax(output, dim=-1) +# output_tokens[:, i] = output[-1:] +# +# # Set all tokens after end token to be padding. +# for i in range(1, self.max_output_length): +# indices = output_tokens[:, i - 1] == self.end_index | ( +# output_tokens[:, i - 1] == self.pad_index +# ) +# output_tokens[indices, i] = self.pad_index +# +# return output_tokens diff --git a/text_recognizer/networks/transformer/__init__.py b/text_recognizer/networks/transformer/__init__.py index 627fa7b..4ff48f7 100644 --- a/text_recognizer/networks/transformer/__init__.py +++ b/text_recognizer/networks/transformer/__init__.py @@ -4,4 +4,5 @@ from .positional_encoding import ( PositionalEncoding2D, target_padding_mask, ) -from .transformer import Decoder, DecoderLayer, Encoder, EncoderLayer, Transformer + +# from .transformer import Decoder, DecoderLayer, Encoder, EncoderLayer, Transformer diff --git a/text_recognizer/networks/transformer/transformer.py b/text_recognizer/networks/transformer/transformer.py index 5ac2787..d49c85a 100644 --- a/text_recognizer/networks/transformer/transformer.py +++ b/text_recognizer/networks/transformer/transformer.py @@ -1,260 +1,260 @@ -"""Transfomer module.""" -import copy -from typing import Dict, Optional, Type, Union - -import numpy as np -import torch -from torch import nn -from torch import Tensor -import torch.nn.functional as F - -from text_recognizer.networks.transformer.attention import MultiHeadAttention -from text_recognizer.networks.util import activation_function - - -class GEGLU(nn.Module): - """GLU activation for improving feedforward activations.""" - - def __init__(self, dim_in: int, dim_out: int) -> None: - super().__init__() - self.proj = nn.Linear(dim_in, dim_out * 2) - - def forward(self, x: Tensor) -> Tensor: - """Forward propagation.""" - x, gate = self.proj(x).chunk(2, dim=-1) - return x * F.gelu(gate) - - -def _get_clones(module: Type[nn.Module], num_layers: int) -> nn.ModuleList: - return nn.ModuleList([copy.deepcopy(module) for _ in range(num_layers)]) - - -class _IntraLayerConnection(nn.Module): - """Preforms the residual connection inside the transfomer blocks and applies layernorm.""" - - def __init__(self, dropout_rate: float, hidden_dim: int) -> None: - super().__init__() - self.norm = 
nn.LayerNorm(normalized_shape=hidden_dim) - self.dropout = nn.Dropout(p=dropout_rate) - - def forward(self, src: Tensor, residual: Tensor) -> Tensor: - return self.norm(self.dropout(src) + residual) - - -class FeedForward(nn.Module): - def __init__( - self, - hidden_dim: int, - expansion_dim: int, - dropout_rate: float, - activation: str = "relu", - ) -> None: - super().__init__() - - in_projection = ( - nn.Sequential( - nn.Linear(hidden_dim, expansion_dim), activation_function(activation) - ) - if activation != "glu" - else GEGLU(hidden_dim, expansion_dim) - ) - - self.layer = nn.Sequential( - in_projection, - nn.Dropout(p=dropout_rate), - nn.Linear(in_features=expansion_dim, out_features=hidden_dim), - ) - - def forward(self, x: Tensor) -> Tensor: - return self.layer(x) - - -class EncoderLayer(nn.Module): - """Transfomer encoding layer.""" - - def __init__( - self, - hidden_dim: int, - num_heads: int, - expansion_dim: int, - dropout_rate: float, - activation: str = "relu", - ) -> None: - super().__init__() - self.self_attention = MultiHeadAttention(hidden_dim, num_heads, dropout_rate) - self.mlp = FeedForward(hidden_dim, expansion_dim, dropout_rate, activation) - self.block1 = _IntraLayerConnection(dropout_rate, hidden_dim) - self.block2 = _IntraLayerConnection(dropout_rate, hidden_dim) - - def forward(self, src: Tensor, mask: Optional[Tensor] = None) -> Tensor: - """Forward pass through the encoder.""" - # First block. - # Multi head attention. - out, _ = self.self_attention(src, src, src, mask) - - # Add & norm. - out = self.block1(out, src) - - # Second block. - # Apply 1D-convolution. - mlp_out = self.mlp(out) - - # Add & norm. - out = self.block2(mlp_out, out) - - return out - - -class Encoder(nn.Module): - """Transfomer encoder module.""" - - def __init__( - self, - num_layers: int, - encoder_layer: Type[nn.Module], - norm: Optional[Type[nn.Module]] = None, - ) -> None: - super().__init__() - self.layers = _get_clones(encoder_layer, num_layers) - self.norm = norm - - def forward(self, src: Tensor, src_mask: Optional[Tensor] = None) -> Tensor: - """Forward pass through all encoder layers.""" - for layer in self.layers: - src = layer(src, src_mask) - - if self.norm is not None: - src = self.norm(src) - - return src - - -class DecoderLayer(nn.Module): - """Transfomer decoder layer.""" - - def __init__( - self, - hidden_dim: int, - num_heads: int, - expansion_dim: int, - dropout_rate: float = 0.0, - activation: str = "relu", - ) -> None: - super().__init__() - self.hidden_dim = hidden_dim - self.self_attention = MultiHeadAttention(hidden_dim, num_heads, dropout_rate) - self.multihead_attention = MultiHeadAttention( - hidden_dim, num_heads, dropout_rate - ) - self.mlp = FeedForward(hidden_dim, expansion_dim, dropout_rate, activation) - self.block1 = _IntraLayerConnection(dropout_rate, hidden_dim) - self.block2 = _IntraLayerConnection(dropout_rate, hidden_dim) - self.block3 = _IntraLayerConnection(dropout_rate, hidden_dim) - - def forward( - self, - trg: Tensor, - memory: Tensor, - trg_mask: Optional[Tensor] = None, - memory_mask: Optional[Tensor] = None, - ) -> Tensor: - """Forward pass of the layer.""" - out, _ = self.self_attention(trg, trg, trg, trg_mask) - trg = self.block1(out, trg) - - out, _ = self.multihead_attention(trg, memory, memory, memory_mask) - trg = self.block2(out, trg) - - out = self.mlp(trg) - out = self.block3(out, trg) - - return out - - -class Decoder(nn.Module): - """Transfomer decoder module.""" - - def __init__( - self, - decoder_layer: Type[nn.Module], - 
num_layers: int, - norm: Optional[Type[nn.Module]] = None, - ) -> None: - super().__init__() - self.layers = _get_clones(decoder_layer, num_layers) - self.num_layers = num_layers - self.norm = norm - - def forward( - self, - trg: Tensor, - memory: Tensor, - trg_mask: Optional[Tensor] = None, - memory_mask: Optional[Tensor] = None, - ) -> Tensor: - """Forward pass through the decoder.""" - for layer in self.layers: - trg = layer(trg, memory, trg_mask, memory_mask) - - if self.norm is not None: - trg = self.norm(trg) - - return trg - - -class Transformer(nn.Module): - """Transformer network.""" - - def __init__( - self, - num_encoder_layers: int, - num_decoder_layers: int, - hidden_dim: int, - num_heads: int, - expansion_dim: int, - dropout_rate: float, - activation: str = "relu", - ) -> None: - super().__init__() - - # Configure encoder. - encoder_norm = nn.LayerNorm(hidden_dim) - encoder_layer = EncoderLayer( - hidden_dim, num_heads, expansion_dim, dropout_rate, activation - ) - self.encoder = Encoder(num_encoder_layers, encoder_layer, encoder_norm) - - # Configure decoder. - decoder_norm = nn.LayerNorm(hidden_dim) - decoder_layer = DecoderLayer( - hidden_dim, num_heads, expansion_dim, dropout_rate, activation - ) - self.decoder = Decoder(decoder_layer, num_decoder_layers, decoder_norm) - - self._reset_parameters() - - def _reset_parameters(self) -> None: - for p in self.parameters(): - if p.dim() > 1: - nn.init.xavier_uniform_(p) - - def forward( - self, - src: Tensor, - trg: Tensor, - src_mask: Optional[Tensor] = None, - trg_mask: Optional[Tensor] = None, - memory_mask: Optional[Tensor] = None, - ) -> Tensor: - """Forward pass through the transformer.""" - if src.shape[0] != trg.shape[0]: - print(trg.shape) - raise RuntimeError("The batch size of the src and trg must be the same.") - if src.shape[2] != trg.shape[2]: - raise RuntimeError( - "The number of features for the src and trg must be the same." 
- ) - - memory = self.encoder(src, src_mask) - output = self.decoder(trg, memory, trg_mask, memory_mask) - return output +# """Transfomer module.""" +# import copy +# from typing import Dict, Optional, Type, Union +# +# import numpy as np +# import torch +# from torch import nn +# from torch import Tensor +# import torch.nn.functional as F +# +# from text_recognizer.networks.transformer.attention import MultiHeadAttention +# from text_recognizer.networks.util import activation_function +# +# +# class GEGLU(nn.Module): +# """GLU activation for improving feedforward activations.""" +# +# def __init__(self, dim_in: int, dim_out: int) -> None: +# super().__init__() +# self.proj = nn.Linear(dim_in, dim_out * 2) +# +# def forward(self, x: Tensor) -> Tensor: +# """Forward propagation.""" +# x, gate = self.proj(x).chunk(2, dim=-1) +# return x * F.gelu(gate) +# +# +# def _get_clones(module: Type[nn.Module], num_layers: int) -> nn.ModuleList: +# return nn.ModuleList([copy.deepcopy(module) for _ in range(num_layers)]) +# +# +# class _IntraLayerConnection(nn.Module): +# """Preforms the residual connection inside the transfomer blocks and applies layernorm.""" +# +# def __init__(self, dropout_rate: float, hidden_dim: int) -> None: +# super().__init__() +# self.norm = nn.LayerNorm(normalized_shape=hidden_dim) +# self.dropout = nn.Dropout(p=dropout_rate) +# +# def forward(self, src: Tensor, residual: Tensor) -> Tensor: +# return self.norm(self.dropout(src) + residual) +# +# +# class FeedForward(nn.Module): +# def __init__( +# self, +# hidden_dim: int, +# expansion_dim: int, +# dropout_rate: float, +# activation: str = "relu", +# ) -> None: +# super().__init__() +# +# in_projection = ( +# nn.Sequential( +# nn.Linear(hidden_dim, expansion_dim), activation_function(activation) +# ) +# if activation != "glu" +# else GEGLU(hidden_dim, expansion_dim) +# ) +# +# self.layer = nn.Sequential( +# in_projection, +# nn.Dropout(p=dropout_rate), +# nn.Linear(in_features=expansion_dim, out_features=hidden_dim), +# ) +# +# def forward(self, x: Tensor) -> Tensor: +# return self.layer(x) +# +# +# class EncoderLayer(nn.Module): +# """Transfomer encoding layer.""" +# +# def __init__( +# self, +# hidden_dim: int, +# num_heads: int, +# expansion_dim: int, +# dropout_rate: float, +# activation: str = "relu", +# ) -> None: +# super().__init__() +# self.self_attention = MultiHeadAttention(hidden_dim, num_heads, dropout_rate) +# self.mlp = FeedForward(hidden_dim, expansion_dim, dropout_rate, activation) +# self.block1 = _IntraLayerConnection(dropout_rate, hidden_dim) +# self.block2 = _IntraLayerConnection(dropout_rate, hidden_dim) +# +# def forward(self, src: Tensor, mask: Optional[Tensor] = None) -> Tensor: +# """Forward pass through the encoder.""" +# # First block. +# # Multi head attention. +# out, _ = self.self_attention(src, src, src, mask) +# +# # Add & norm. +# out = self.block1(out, src) +# +# # Second block. +# # Apply 1D-convolution. +# mlp_out = self.mlp(out) +# +# # Add & norm. 
+# out = self.block2(mlp_out, out) +# +# return out +# +# +# class Encoder(nn.Module): +# """Transfomer encoder module.""" +# +# def __init__( +# self, +# num_layers: int, +# encoder_layer: Type[nn.Module], +# norm: Optional[Type[nn.Module]] = None, +# ) -> None: +# super().__init__() +# self.layers = _get_clones(encoder_layer, num_layers) +# self.norm = norm +# +# def forward(self, src: Tensor, src_mask: Optional[Tensor] = None) -> Tensor: +# """Forward pass through all encoder layers.""" +# for layer in self.layers: +# src = layer(src, src_mask) +# +# if self.norm is not None: +# src = self.norm(src) +# +# return src +# +# +# class DecoderLayer(nn.Module): +# """Transfomer decoder layer.""" +# +# def __init__( +# self, +# hidden_dim: int, +# num_heads: int, +# expansion_dim: int, +# dropout_rate: float = 0.0, +# activation: str = "relu", +# ) -> None: +# super().__init__() +# self.hidden_dim = hidden_dim +# self.self_attention = MultiHeadAttention(hidden_dim, num_heads, dropout_rate) +# self.multihead_attention = MultiHeadAttention( +# hidden_dim, num_heads, dropout_rate +# ) +# self.mlp = FeedForward(hidden_dim, expansion_dim, dropout_rate, activation) +# self.block1 = _IntraLayerConnection(dropout_rate, hidden_dim) +# self.block2 = _IntraLayerConnection(dropout_rate, hidden_dim) +# self.block3 = _IntraLayerConnection(dropout_rate, hidden_dim) +# +# def forward( +# self, +# trg: Tensor, +# memory: Tensor, +# trg_mask: Optional[Tensor] = None, +# memory_mask: Optional[Tensor] = None, +# ) -> Tensor: +# """Forward pass of the layer.""" +# out, _ = self.self_attention(trg, trg, trg, trg_mask) +# trg = self.block1(out, trg) +# +# out, _ = self.multihead_attention(trg, memory, memory, memory_mask) +# trg = self.block2(out, trg) +# +# out = self.mlp(trg) +# out = self.block3(out, trg) +# +# return out +# +# +# class Decoder(nn.Module): +# """Transfomer decoder module.""" +# +# def __init__( +# self, +# decoder_layer: Type[nn.Module], +# num_layers: int, +# norm: Optional[Type[nn.Module]] = None, +# ) -> None: +# super().__init__() +# self.layers = _get_clones(decoder_layer, num_layers) +# self.num_layers = num_layers +# self.norm = norm +# +# def forward( +# self, +# trg: Tensor, +# memory: Tensor, +# trg_mask: Optional[Tensor] = None, +# memory_mask: Optional[Tensor] = None, +# ) -> Tensor: +# """Forward pass through the decoder.""" +# for layer in self.layers: +# trg = layer(trg, memory, trg_mask, memory_mask) +# +# if self.norm is not None: +# trg = self.norm(trg) +# +# return trg +# +# +# class Transformer(nn.Module): +# """Transformer network.""" +# +# def __init__( +# self, +# num_encoder_layers: int, +# num_decoder_layers: int, +# hidden_dim: int, +# num_heads: int, +# expansion_dim: int, +# dropout_rate: float, +# activation: str = "relu", +# ) -> None: +# super().__init__() +# +# # Configure encoder. +# encoder_norm = nn.LayerNorm(hidden_dim) +# encoder_layer = EncoderLayer( +# hidden_dim, num_heads, expansion_dim, dropout_rate, activation +# ) +# self.encoder = Encoder(num_encoder_layers, encoder_layer, encoder_norm) +# +# # Configure decoder. 
+# decoder_norm = nn.LayerNorm(hidden_dim) +# decoder_layer = DecoderLayer( +# hidden_dim, num_heads, expansion_dim, dropout_rate, activation +# ) +# self.decoder = Decoder(decoder_layer, num_decoder_layers, decoder_norm) +# +# self._reset_parameters() +# +# def _reset_parameters(self) -> None: +# for p in self.parameters(): +# if p.dim() > 1: +# nn.init.xavier_uniform_(p) +# +# def forward( +# self, +# src: Tensor, +# trg: Tensor, +# src_mask: Optional[Tensor] = None, +# trg_mask: Optional[Tensor] = None, +# memory_mask: Optional[Tensor] = None, +# ) -> Tensor: +# """Forward pass through the transformer.""" +# if src.shape[0] != trg.shape[0]: +# print(trg.shape) +# raise RuntimeError("The batch size of the src and trg must be the same.") +# if src.shape[2] != trg.shape[2]: +# raise RuntimeError( +# "The number of features for the src and trg must be the same." +# ) +# +# memory = self.encoder(src, src_mask) +# output = self.decoder(trg, memory, trg_mask, memory_mask) +# return output diff --git a/training/.gitignore b/training/.gitignore index 333c1e9..7d268ea 100644 --- a/training/.gitignore +++ b/training/.gitignore @@ -1 +1,2 @@ logs/ +outputs/ diff --git a/training/conf/callbacks/default.yaml b/training/conf/callbacks/default.yaml new file mode 100644 index 0000000..74dc30c --- /dev/null +++ b/training/conf/callbacks/default.yaml @@ -0,0 +1,14 @@ +# @package _group_ +- type: ModelCheckpoint + args: + monitor: val_loss + mode: min + save_last: true +- type: LearningRateMonitor + args: + logging_interval: step +# - type: EarlyStopping +# args: +# monitor: val_loss +# mode: min +# patience: 10 diff --git a/training/conf/callbacks/swa.yaml b/training/conf/callbacks/swa.yaml new file mode 100644 index 0000000..144ad6e --- /dev/null +++ b/training/conf/callbacks/swa.yaml @@ -0,0 +1,16 @@ +# @package _group_ +- type: ModelCheckpoint + args: + monitor: val_loss + mode: min + save_last: true +- type: StochasticWeightAveraging + args: + swa_epoch_start: 0.8 + swa_lrs: 0.05 + annealing_epochs: 10 + annealing_strategy: cos + device: null +- type: LearningRateMonitor + args: + logging_interval: step diff --git a/training/conf/cnn_transformer.yaml b/training/conf/cnn_transformer.yaml new file mode 100644 index 0000000..a4f16df --- /dev/null +++ b/training/conf/cnn_transformer.yaml @@ -0,0 +1,90 @@ +seed: 4711 + +network: + desc: Configuration of the PyTorch neural network. + type: CNNTransformer + args: + encoder: + type: EfficientNet + args: null + num_decoder_layers: 4 + vocab_size: 84 + hidden_dim: 256 + num_heads: 4 + expansion_dim: 1024 + dropout_rate: 0.1 + transformer_activation: glu + +model: + desc: Configuration of the PyTorch Lightning model. + type: LitTransformerModel + args: + optimizer: + type: MADGRAD + args: + lr: 1.0e-3 + momentum: 0.9 + weight_decay: 0 + eps: 1.0e-6 + lr_scheduler: + type: OneCycleLR + args: + interval: &interval step + max_lr: 1.0e-3 + three_phase: true + epochs: 512 + steps_per_epoch: 1246 # num_samples / batch_size + criterion: + type: CrossEntropyLoss + args: + weight: null + ignore_index: -100 + reduction: mean + monitor: val_loss + mapping: sentence_piece + +data: + desc: Configuration of the training/test data. 
+ type: IAMExtendedParagraphs + args: + batch_size: 8 + num_workers: 12 + train_fraction: 0.8 + augment: true + +callbacks: + - type: ModelCheckpoint + args: + monitor: val_loss + mode: min + save_last: true + # - type: StochasticWeightAveraging + # args: + # swa_epoch_start: 0.8 + # swa_lrs: 0.05 + # annealing_epochs: 10 + # annealing_strategy: cos + # device: null + - type: LearningRateMonitor + args: + logging_interval: *interval + # - type: EarlyStopping + # args: + # monitor: val_loss + # mode: min + # patience: 10 + +trainer: + desc: Configuration of the PyTorch Lightning Trainer. + args: + stochastic_weight_avg: false + auto_scale_batch_size: binsearch + gradient_clip_val: 0 + fast_dev_run: true + gpus: 1 + precision: 16 + max_epochs: 512 + terminate_on_nan: true + weights_summary: top + +load_checkpoint: null diff --git a/training/conf/config.yaml b/training/conf/config.yaml new file mode 100644 index 0000000..11adeb7 --- /dev/null +++ b/training/conf/config.yaml @@ -0,0 +1,6 @@ +defaults: + - network: vqvae + - model: lit_vqvae + - dataset: iam_extended_paragraphs + - trainer: default + - callbacks: default diff --git a/training/conf/dataset/iam_extended_paragraphs.yaml b/training/conf/dataset/iam_extended_paragraphs.yaml new file mode 100644 index 0000000..6bd7fc9 --- /dev/null +++ b/training/conf/dataset/iam_extended_paragraphs.yaml @@ -0,0 +1,7 @@ +# @package _group_ +type: IAMExtendedParagraphs +args: + batch_size: 32 + num_workers: 12 + train_fraction: 0.8 + augment: true diff --git a/training/conf/model/lit_vqvae.yaml b/training/conf/model/lit_vqvae.yaml new file mode 100644 index 0000000..90780b7 --- /dev/null +++ b/training/conf/model/lit_vqvae.yaml @@ -0,0 +1,24 @@ +# @package _group_ +type: LitVQVAEModel +args: + optimizer: + type: MADGRAD + args: + lr: 1.0e-3 + momentum: 0.9 + weight_decay: 0 + eps: 1.0e-6 + lr_scheduler: + type: OneCycleLR + args: + interval: step + max_lr: 1.0e-3 + three_phase: true + epochs: 64 + steps_per_epoch: 633 # num_samples / batch_size + criterion: + type: MSELoss + args: + reduction: mean + monitor: val_loss + mapping: sentence_piece diff --git a/training/conf/network/vqvae.yaml b/training/conf/network/vqvae.yaml new file mode 100644 index 0000000..8c30bbd --- /dev/null +++ b/training/conf/network/vqvae.yaml @@ -0,0 +1,14 @@ +# @package _group_ +type: VQVAE +args: + in_channels: 1 + channels: [32, 64, 64] + kernel_sizes: [4, 4, 4] + strides: [2, 2, 2] + num_residual_layers: 2 + embedding_dim: 64 + num_embeddings: 256 + upsampling: null + beta: 0.25 + activation: leaky_relu + dropout_rate: 0.2 diff --git a/training/conf/trainer/default.yaml b/training/conf/trainer/default.yaml new file mode 100644 index 0000000..82afd93 --- /dev/null +++ b/training/conf/trainer/default.yaml @@ -0,0 +1,18 @@ +# @package _group_ +seed: 4711 +load_checkpoint: null +wandb: false +tune: false +train: true +test: true +logging: INFO +args: + stochastic_weight_avg: false + auto_scale_batch_size: binsearch + gradient_clip_val: 0 + fast_dev_run: false + gpus: 1 + precision: 16 + max_epochs: 64 + terminate_on_nan: true + weights_summary: top diff --git a/training/configs/cnn_transformer.yaml b/training/configs/cnn_transformer.yaml deleted file mode 100644 index a4f16df..0000000 --- a/training/configs/cnn_transformer.yaml +++ /dev/null @@ -1,90 +0,0 @@ -seed: 4711 - -network: - desc: Configuration of the PyTorch neural network. 
- type: CNNTransformer - args: - encoder: - type: EfficientNet - args: null - num_decoder_layers: 4 - vocab_size: 84 - hidden_dim: 256 - num_heads: 4 - expansion_dim: 1024 - dropout_rate: 0.1 - transformer_activation: glu - -model: - desc: Configuration of the PyTorch Lightning model. - type: LitTransformerModel - args: - optimizer: - type: MADGRAD - args: - lr: 1.0e-3 - momentum: 0.9 - weight_decay: 0 - eps: 1.0e-6 - lr_scheduler: - type: OneCycleLR - args: - interval: &interval step - max_lr: 1.0e-3 - three_phase: true - epochs: 512 - steps_per_epoch: 1246 # num_samples / batch_size - criterion: - type: CrossEntropyLoss - args: - weight: null - ignore_index: -100 - reduction: mean - monitor: val_loss - mapping: sentence_piece - -data: - desc: Configuration of the training/test data. - type: IAMExtendedParagraphs - args: - batch_size: 8 - num_workers: 12 - train_fraction: 0.8 - augment: true - -callbacks: - - type: ModelCheckpoint - args: - monitor: val_loss - mode: min - save_last: true - # - type: StochasticWeightAveraging - # args: - # swa_epoch_start: 0.8 - # swa_lrs: 0.05 - # annealing_epochs: 10 - # annealing_strategy: cos - # device: null - - type: LearningRateMonitor - args: - logging_interval: *interval - # - type: EarlyStopping - # args: - # monitor: val_loss - # mode: min - # patience: 10 - -trainer: - desc: Configuration of the PyTorch Lightning Trainer. - args: - stochastic_weight_avg: false - auto_scale_batch_size: binsearch - gradient_clip_val: 0 - fast_dev_run: true - gpus: 1 - precision: 16 - max_epochs: 512 - terminate_on_nan: true - weights_summary: top - -load_checkpoint: null diff --git a/training/configs/vqvae.yaml b/training/configs/vqvae.yaml deleted file mode 100644 index 13d7c97..0000000 --- a/training/configs/vqvae.yaml +++ /dev/null @@ -1,89 +0,0 @@ -seed: 4711 - -network: - desc: Configuration of the PyTorch neural network. - type: VQVAE - args: - in_channels: 1 - channels: [32, 64, 64, 96, 96] - kernel_sizes: [4, 4, 4, 4, 4] - strides: [2, 2, 2, 2, 2] - num_residual_layers: 2 - embedding_dim: 512 - num_embeddings: 1024 - upsampling: null - beta: 0.25 - activation: leaky_relu - dropout_rate: 0.2 - -model: - desc: Configuration of the PyTorch Lightning model. - type: LitVQVAEModel - args: - optimizer: - type: MADGRAD - args: - lr: 1.0e-3 - momentum: 0.9 - weight_decay: 0 - eps: 1.0e-6 - lr_scheduler: - type: OneCycleLR - args: - interval: &interval step - max_lr: 1.0e-3 - three_phase: true - epochs: 64 - steps_per_epoch: 633 # num_samples / batch_size - criterion: - type: MSELoss - args: - reduction: mean - monitor: val_loss - mapping: sentence_piece - -data: - desc: Configuration of the training/test data. - type: IAMExtendedParagraphs - args: - batch_size: 32 - num_workers: 12 - train_fraction: 0.8 - augment: true - -callbacks: - - type: ModelCheckpoint - args: - monitor: val_loss - mode: min - save_last: true - - type: StochasticWeightAveraging - args: - swa_epoch_start: 0.8 - swa_lrs: 0.05 - annealing_epochs: 10 - annealing_strategy: cos - device: null - - type: LearningRateMonitor - args: - logging_interval: *interval - # - type: EarlyStopping - # args: - # monitor: val_loss - # mode: min - # patience: 10 - -trainer: - desc: Configuration of the PyTorch Lightning Trainer. 
- args: - stochastic_weight_avg: true - auto_scale_batch_size: binsearch - gradient_clip_val: 0 - fast_dev_run: false - gpus: 1 - precision: 16 - max_epochs: 64 - terminate_on_nan: true - weights_summary: top - -load_checkpoint: null diff --git a/training/run_experiment.py b/training/run_experiment.py index bdefbf0..2b3ecab 100644 --- a/training/run_experiment.py +++ b/training/run_experiment.py @@ -4,17 +4,15 @@ import importlib from pathlib import Path from typing import Dict, List, Optional, Type -import click +import hydra from loguru import logger -from omegaconf import DictConfig, OmegaConf +from omegaconf import DictConfig, OmegaConf import pytorch_lightning as pl from torch import nn from tqdm import tqdm import wandb -SEED = 4711 -CONFIGS_DIRNAME = Path(__file__).parent.resolve() / "configs" LOGS_DIRNAME = Path(__file__).parent.resolve() / "logs" @@ -29,21 +27,10 @@ def _create_experiment_dir(config: DictConfig) -> Path: return log_dir -def _configure_logging(log_dir: Optional[Path], verbose: int = 0) -> None: +def _configure_logging(log_dir: Optional[Path], level: str) -> None: """Configure the loguru logger for output to terminal and disk.""" - - def _get_level(verbose: int) -> str: - """Sets the logger level.""" - levels = {0: "WARNING", 1: "INFO", 2: "DEBUG"} - verbose = min(verbose, 2) - return levels[verbose] - # Remove default logger to get tqdm to work properly. logger.remove() - - # Fetch verbosity level. - level = _get_level(verbose) - logger.add(lambda msg: tqdm.write(msg, end=""), colorize=True, level=level) if log_dir is not None: logger.add( @@ -52,14 +39,6 @@ def _configure_logging(log_dir: Optional[Path], verbose: int = 0) -> None: ) -def _load_config(file_path: Path) -> DictConfig: - """Return experiment config.""" - logger.info(f"Loading config from: {file_path}") - if not file_path.exists(): - raise FileNotFoundError(f"Experiment config not found at: {file_path}") - return OmegaConf.load(file_path) - - def _import_class(module_and_class_name: str) -> type: """Import class from module.""" module_name, class_name = module_and_class_name.rsplit(".", 1) @@ -78,14 +57,16 @@ def _configure_callbacks( def _configure_logger( - network: Type[nn.Module], args: Dict, log_dir: Path, use_wandb: bool + network: Type[nn.Module], config: DictConfig, log_dir: Path ) -> Type[pl.loggers.LightningLoggerBase]: """Configures lightning logger.""" - if use_wandb: + if config.trainer.wandb: + logger.info("Logging model with W&B") pl_logger = pl.loggers.WandbLogger(save_dir=str(log_dir)) pl_logger.watch(network) - pl_logger.log_hyperparams(vars(args)) + pl_logger.log_hyperparams(OmegaConf.to_container(config, resolve=True)) return pl_logger + logger.info("Logging model with TensorBoard") return pl.loggers.TensorBoardLogger(save_dir=str(log_dir)) @@ -110,50 +91,36 @@ def _load_lit_model( lit_model_class: type, network: Type[nn.Module], config: DictConfig ) -> Type[pl.LightningModule]: """Load lightning model.""" - if config.load_checkpoint is not None: + if config.trainer.load_checkpoint is not None: logger.info( - f"Loading network weights from checkpoint: {config.trainer.load_checkpoint}" ) return lit_model_class.load_from_checkpoint( - config.load_checkpoint, network=network, **config.model.args + config.trainer.load_checkpoint, network=network, **config.model.args ) return lit_model_class(network=network, **config.model.args) -def run( - filename: str, - fast_dev_run: bool, - train: bool, - test: bool, - tune: bool, - use_wandb: bool, - verbose: int = 0, )
-> None: +def run(config: DictConfig) -> None: """Runs experiment.""" - # Load config. - file_path = CONFIGS_DIRNAME / filename - config = _load_config(file_path) - log_dir = _create_experiment_dir(config) - _configure_logging(log_dir, verbose=verbose) + _configure_logging(log_dir, level=config.trainer.logging) logger.info("Starting experiment...") - # Seed everything in the experiment. - logger.info(f"Seeding everthing with seed={SEED}") - pl.utilities.seed.seed_everything(SEED) + pl.utilities.seed.seed_everything(config.trainer.seed) # Load classes. - data_module_class = _import_class(f"text_recognizer.data.{config.data.type}") + data_module_class = _import_class(f"text_recognizer.data.{config.dataset.type}") network_class = _import_class(f"text_recognizer.networks.{config.network.type}") lit_model_class = _import_class(f"text_recognizer.models.{config.model.type}") # Initialize data object and network. - data_module = data_module_class(**config.data.args) + data_module = data_module_class(**config.dataset.args) network = network_class(**data_module.config(), **config.network.args) # Load callback and logger. callbacks = _configure_callbacks(config.callbacks) - pl_logger = _configure_logger(network, config, log_dir, use_wandb) + pl_logger = _configure_logger(network, config, log_dir) # Load ligtning model. lit_model = _load_lit_model(lit_model_class, network, config) @@ -164,55 +131,28 @@ def run( logger=pl_logger, weights_save_path=str(log_dir), ) - if fast_dev_run: - logger.info("Fast dev run...") + + if config.trainer.tune and not config.trainer.args.fast_dev_run: + logger.info("Tuning learning rate and batch size...") + trainer.tune(lit_model, datamodule=data_module) + + if config.trainer.train: + logger.info("Training network...") trainer.fit(lit_model, datamodule=data_module) - else: - if tune: - logger.info("Tuning learning rate and batch size...") - trainer.tune(lit_model, datamodule=data_module) - - if train: - logger.info("Training network...") - trainer.fit(lit_model, datamodule=data_module) - - if test: - logger.info("Testing network...") - trainer.test(lit_model, datamodule=data_module) - - _save_best_weights(callbacks, use_wandb) - - -@click.command() -@click.option("-f", "--experiment_config", type=str, help="Path to experiment config.") -@click.option("--use_wandb", is_flag=True, help="If true, do use wandb for logging.") -@click.option("--dev", is_flag=True, help="If true, run a fast dev run.") -@click.option( - "--tune", is_flag=True, help="If true, tune hyperparameters for training." -) -@click.option("-t", "--train", is_flag=True, help="If true, train the model.") -@click.option("-e", "--test", is_flag=True, help="If true, test the model.") -@click.option("-v", "--verbose", count=True) -def cli( - experiment_config: str, - use_wandb: bool, - dev: bool, - tune: bool, - train: bool, - test: bool, - verbose: int, -) -> None: - """Run experiment.""" - run( - filename=experiment_config, - fast_dev_run=dev, - train=train, - test=test, - tune=tune, - use_wandb=use_wandb, - verbose=verbose, - ) + + if config.trainer.test and not config.trainer.args.fast_dev_run: + logger.info("Testing network...") + trainer.test(lit_model, datamodule=data_module) + + if not config.trainer.args.fast_dev_run: + _save_best_weights(callbacks, config.trainer.wandb) + + +@hydra.main(config_path="conf", config_name="config") +def main(cfg: DictConfig) -> None: + """Loads config with hydra.""" + run(cfg) if __name__ == "__main__": - cli() + main() -- cgit v1.2.3-70-g09d2
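
Usage note: with this patch the experiment is launched through Hydra, which composes training/conf/config.yaml from the config groups introduced above (network, model, dataset, trainer, callbacks) and runs each job in its own working directory (hence the new outputs/ entry in training/.gitignore). Below is a minimal sketch, not part of the patch, of how the composed config can be inspected without starting a run, using the compose API that ships with hydra-core 1.0. The group names and defaults come from the files above; the override values are only illustrative, and the snippet assumes it is executed from the training/ directory so that the relative config_path resolves.

    from hydra.experimental import compose, initialize
    from omegaconf import OmegaConf

    # Point Hydra at the new config tree; config_path is relative to this file.
    initialize(config_path="conf", job_name="inspect_config")

    # Compose the defaults from conf/config.yaml, with two illustrative
    # overrides: swap the default callback list for the SWA variant and
    # shrink the dataset batch size.
    cfg = compose(
        config_name="config",
        overrides=["callbacks=swa", "dataset.args.batch_size=16"],
    )
    print(OmegaConf.to_yaml(cfg))

The same override syntax works on the command line against the new entry point, e.g. python run_experiment.py callbacks=swa trainer.args.fast_dev_run=true.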