From 1d0977585f01c42e9f6280559a1a98037907a62e Mon Sep 17 00:00:00 2001 From: Gustaf Rydholm Date: Sun, 2 May 2021 13:51:15 +0200 Subject: Implemented training script with hydra --- training/.gitignore | 1 + training/conf/callbacks/default.yaml | 14 +++ training/conf/callbacks/swa.yaml | 16 +++ training/conf/cnn_transformer.yaml | 90 ++++++++++++++ training/conf/config.yaml | 6 + training/conf/dataset/iam_extended_paragraphs.yaml | 7 ++ training/conf/model/lit_vqvae.yaml | 24 ++++ training/conf/network/vqvae.yaml | 14 +++ training/conf/trainer/default.yaml | 18 +++ training/configs/cnn_transformer.yaml | 90 -------------- training/configs/vqvae.yaml | 89 -------------- training/run_experiment.py | 136 ++++++--------------- 12 files changed, 228 insertions(+), 277 deletions(-) create mode 100644 training/conf/callbacks/default.yaml create mode 100644 training/conf/callbacks/swa.yaml create mode 100644 training/conf/cnn_transformer.yaml create mode 100644 training/conf/config.yaml create mode 100644 training/conf/dataset/iam_extended_paragraphs.yaml create mode 100644 training/conf/model/lit_vqvae.yaml create mode 100644 training/conf/network/vqvae.yaml create mode 100644 training/conf/trainer/default.yaml delete mode 100644 training/configs/cnn_transformer.yaml delete mode 100644 training/configs/vqvae.yaml (limited to 'training') diff --git a/training/.gitignore b/training/.gitignore index 333c1e9..7d268ea 100644 --- a/training/.gitignore +++ b/training/.gitignore @@ -1 +1,2 @@ logs/ +outputs/ diff --git a/training/conf/callbacks/default.yaml b/training/conf/callbacks/default.yaml new file mode 100644 index 0000000..74dc30c --- /dev/null +++ b/training/conf/callbacks/default.yaml @@ -0,0 +1,14 @@ +# @package _group_ +- type: ModelCheckpoint + args: + monitor: val_loss + mode: min + save_last: true +- type: LearningRateMonitor + args: + logging_interval: step +# - type: EarlyStopping +# args: +# monitor: val_loss +# mode: min +# patience: 10 diff --git a/training/conf/callbacks/swa.yaml b/training/conf/callbacks/swa.yaml new file mode 100644 index 0000000..144ad6e --- /dev/null +++ b/training/conf/callbacks/swa.yaml @@ -0,0 +1,16 @@ +# @package _group_ +- type: ModelCheckpoint + args: + monitor: val_loss + mode: min + save_last: true +- type: StochasticWeightAveraging + args: + swa_epoch_start: 0.8 + swa_lrs: 0.05 + annealing_epochs: 10 + annealing_strategy: cos + device: null +- type: LearningRateMonitor + args: + logging_interval: step diff --git a/training/conf/cnn_transformer.yaml b/training/conf/cnn_transformer.yaml new file mode 100644 index 0000000..a4f16df --- /dev/null +++ b/training/conf/cnn_transformer.yaml @@ -0,0 +1,90 @@ +seed: 4711 + +network: + desc: Configuration of the PyTorch neural network. + type: CNNTransformer + args: + encoder: + type: EfficientNet + args: null + num_decoder_layers: 4 + vocab_size: 84 + hidden_dim: 256 + num_heads: 4 + expansion_dim: 1024 + dropout_rate: 0.1 + transformer_activation: glu + +model: + desc: Configuration of the PyTorch Lightning model. + type: LitTransformerModel + args: + optimizer: + type: MADGRAD + args: + lr: 1.0e-3 + momentum: 0.9 + weight_decay: 0 + eps: 1.0e-6 + lr_scheduler: + type: OneCycleLR + args: + interval: &interval step + max_lr: 1.0e-3 + three_phase: true + epochs: 512 + steps_per_epoch: 1246 # num_samples / batch_size + criterion: + type: CrossEntropyLoss + args: + weight: null + ignore_index: -100 + reduction: mean + monitor: val_loss + mapping: sentence_piece + +data: + desc: Configuration of the training/test data. 
+ type: IAMExtendedParagraphs + args: + batch_size: 8 + num_workers: 12 + train_fraction: 0.8 + augment: true + +callbacks: + - type: ModelCheckpoint + args: + monitor: val_loss + mode: min + save_last: true + # - type: StochasticWeightAveraging + # args: + # swa_epoch_start: 0.8 + # swa_lrs: 0.05 + # annealing_epochs: 10 + # annealing_strategy: cos + # device: null + - type: LearningRateMonitor + args: + logging_interval: *interval + # - type: EarlyStopping + # args: + # monitor: val_loss + # mode: min + # patience: 10 + +trainer: + desc: Configuration of the PyTorch Lightning Trainer. + args: + stochastic_weight_avg: false + auto_scale_batch_size: binsearch + gradient_clip_val: 0 + fast_dev_run: true + gpus: 1 + precision: 16 + max_epochs: 512 + terminate_on_nan: true + weights_summary: top + +load_checkpoint: null diff --git a/training/conf/config.yaml b/training/conf/config.yaml new file mode 100644 index 0000000..11adeb7 --- /dev/null +++ b/training/conf/config.yaml @@ -0,0 +1,6 @@ +defaults: + - network: vqvae + - model: lit_vqvae + - dataset: iam_extended_paragraphs + - trainer: default + - callbacks: default diff --git a/training/conf/dataset/iam_extended_paragraphs.yaml b/training/conf/dataset/iam_extended_paragraphs.yaml new file mode 100644 index 0000000..6bd7fc9 --- /dev/null +++ b/training/conf/dataset/iam_extended_paragraphs.yaml @@ -0,0 +1,7 @@ +# @package _group_ +type: IAMExtendedParagraphs +args: + batch_size: 32 + num_workers: 12 + train_fraction: 0.8 + augment: true diff --git a/training/conf/model/lit_vqvae.yaml b/training/conf/model/lit_vqvae.yaml new file mode 100644 index 0000000..90780b7 --- /dev/null +++ b/training/conf/model/lit_vqvae.yaml @@ -0,0 +1,24 @@ +# @package _group_ +type: LitVQVAEModel +args: + optimizer: + type: MADGRAD + args: + lr: 1.0e-3 + momentum: 0.9 + weight_decay: 0 + eps: 1.0e-6 + lr_scheduler: + type: OneCycleLR + args: + interval: step + max_lr: 1.0e-3 + three_phase: true + epochs: 64 + steps_per_epoch: 633 # num_samples / batch_size + criterion: + type: MSELoss + args: + reduction: mean + monitor: val_loss + mapping: sentence_piece diff --git a/training/conf/network/vqvae.yaml b/training/conf/network/vqvae.yaml new file mode 100644 index 0000000..8c30bbd --- /dev/null +++ b/training/conf/network/vqvae.yaml @@ -0,0 +1,14 @@ +# @package _group_ +type: VQVAE +args: + in_channels: 1 + channels: [32, 64, 64] + kernel_sizes: [4, 4, 4] + strides: [2, 2, 2] + num_residual_layers: 2 + embedding_dim: 64 + num_embeddings: 256 + upsampling: null + beta: 0.25 + activation: leaky_relu + dropout_rate: 0.2 diff --git a/training/conf/trainer/default.yaml b/training/conf/trainer/default.yaml new file mode 100644 index 0000000..82afd93 --- /dev/null +++ b/training/conf/trainer/default.yaml @@ -0,0 +1,18 @@ +# @package _group_ +seed: 4711 +load_checkpoint: null +wandb: false +tune: false +train: true +test: true +logging: INFO +args: + stochastic_weight_avg: false + auto_scale_batch_size: binsearch + gradient_clip_val: 0 + fast_dev_run: false + gpus: 1 + precision: 16 + max_epochs: 64 + terminate_on_nan: true + weights_summary: top diff --git a/training/configs/cnn_transformer.yaml b/training/configs/cnn_transformer.yaml deleted file mode 100644 index a4f16df..0000000 --- a/training/configs/cnn_transformer.yaml +++ /dev/null @@ -1,90 +0,0 @@ -seed: 4711 - -network: - desc: Configuration of the PyTorch neural network. 
- type: CNNTransformer - args: - encoder: - type: EfficientNet - args: null - num_decoder_layers: 4 - vocab_size: 84 - hidden_dim: 256 - num_heads: 4 - expansion_dim: 1024 - dropout_rate: 0.1 - transformer_activation: glu - -model: - desc: Configuration of the PyTorch Lightning model. - type: LitTransformerModel - args: - optimizer: - type: MADGRAD - args: - lr: 1.0e-3 - momentum: 0.9 - weight_decay: 0 - eps: 1.0e-6 - lr_scheduler: - type: OneCycleLR - args: - interval: &interval step - max_lr: 1.0e-3 - three_phase: true - epochs: 512 - steps_per_epoch: 1246 # num_samples / batch_size - criterion: - type: CrossEntropyLoss - args: - weight: null - ignore_index: -100 - reduction: mean - monitor: val_loss - mapping: sentence_piece - -data: - desc: Configuration of the training/test data. - type: IAMExtendedParagraphs - args: - batch_size: 8 - num_workers: 12 - train_fraction: 0.8 - augment: true - -callbacks: - - type: ModelCheckpoint - args: - monitor: val_loss - mode: min - save_last: true - # - type: StochasticWeightAveraging - # args: - # swa_epoch_start: 0.8 - # swa_lrs: 0.05 - # annealing_epochs: 10 - # annealing_strategy: cos - # device: null - - type: LearningRateMonitor - args: - logging_interval: *interval - # - type: EarlyStopping - # args: - # monitor: val_loss - # mode: min - # patience: 10 - -trainer: - desc: Configuration of the PyTorch Lightning Trainer. - args: - stochastic_weight_avg: false - auto_scale_batch_size: binsearch - gradient_clip_val: 0 - fast_dev_run: true - gpus: 1 - precision: 16 - max_epochs: 512 - terminate_on_nan: true - weights_summary: top - -load_checkpoint: null diff --git a/training/configs/vqvae.yaml b/training/configs/vqvae.yaml deleted file mode 100644 index 13d7c97..0000000 --- a/training/configs/vqvae.yaml +++ /dev/null @@ -1,89 +0,0 @@ -seed: 4711 - -network: - desc: Configuration of the PyTorch neural network. - type: VQVAE - args: - in_channels: 1 - channels: [32, 64, 64, 96, 96] - kernel_sizes: [4, 4, 4, 4, 4] - strides: [2, 2, 2, 2, 2] - num_residual_layers: 2 - embedding_dim: 512 - num_embeddings: 1024 - upsampling: null - beta: 0.25 - activation: leaky_relu - dropout_rate: 0.2 - -model: - desc: Configuration of the PyTorch Lightning model. - type: LitVQVAEModel - args: - optimizer: - type: MADGRAD - args: - lr: 1.0e-3 - momentum: 0.9 - weight_decay: 0 - eps: 1.0e-6 - lr_scheduler: - type: OneCycleLR - args: - interval: &interval step - max_lr: 1.0e-3 - three_phase: true - epochs: 64 - steps_per_epoch: 633 # num_samples / batch_size - criterion: - type: MSELoss - args: - reduction: mean - monitor: val_loss - mapping: sentence_piece - -data: - desc: Configuration of the training/test data. - type: IAMExtendedParagraphs - args: - batch_size: 32 - num_workers: 12 - train_fraction: 0.8 - augment: true - -callbacks: - - type: ModelCheckpoint - args: - monitor: val_loss - mode: min - save_last: true - - type: StochasticWeightAveraging - args: - swa_epoch_start: 0.8 - swa_lrs: 0.05 - annealing_epochs: 10 - annealing_strategy: cos - device: null - - type: LearningRateMonitor - args: - logging_interval: *interval - # - type: EarlyStopping - # args: - # monitor: val_loss - # mode: min - # patience: 10 - -trainer: - desc: Configuration of the PyTorch Lightning Trainer. 
- args: - stochastic_weight_avg: true - auto_scale_batch_size: binsearch - gradient_clip_val: 0 - fast_dev_run: false - gpus: 1 - precision: 16 - max_epochs: 64 - terminate_on_nan: true - weights_summary: top - -load_checkpoint: null diff --git a/training/run_experiment.py b/training/run_experiment.py index bdefbf0..2b3ecab 100644 --- a/training/run_experiment.py +++ b/training/run_experiment.py @@ -4,17 +4,15 @@ import importlib from pathlib import Path from typing import Dict, List, Optional, Type -import click +import hydra from loguru import logger -from omegaconf import DictConfig, OmegaConf +from omegaconf import DictConfig import pytorch_lightning as pl from torch import nn from tqdm import tqdm import wandb -SEED = 4711 -CONFIGS_DIRNAME = Path(__file__).parent.resolve() / "configs" LOGS_DIRNAME = Path(__file__).parent.resolve() / "logs" @@ -29,21 +27,10 @@ def _create_experiment_dir(config: DictConfig) -> Path: return log_dir -def _configure_logging(log_dir: Optional[Path], verbose: int = 0) -> None: +def _configure_logging(log_dir: Optional[Path], level: str) -> None: """Configure the loguru logger for output to terminal and disk.""" - - def _get_level(verbose: int) -> str: - """Sets the logger level.""" - levels = {0: "WARNING", 1: "INFO", 2: "DEBUG"} - verbose = min(verbose, 2) - return levels[verbose] - # Remove default logger to get tqdm to work properly. logger.remove() - - # Fetch verbosity level. - level = _get_level(verbose) - logger.add(lambda msg: tqdm.write(msg, end=""), colorize=True, level=level) if log_dir is not None: logger.add( @@ -52,14 +39,6 @@ def _configure_logging(log_dir: Optional[Path], verbose: int = 0) -> None: ) -def _load_config(file_path: Path) -> DictConfig: - """Return experiment config.""" - logger.info(f"Loading config from: {file_path}") - if not file_path.exists(): - raise FileNotFoundError(f"Experiment config not found at: {file_path}") - return OmegaConf.load(file_path) - - def _import_class(module_and_class_name: str) -> type: """Import class from module.""" module_name, class_name = module_and_class_name.rsplit(".", 1) @@ -78,14 +57,16 @@ def _configure_callbacks( def _configure_logger( - network: Type[nn.Module], args: Dict, log_dir: Path, use_wandb: bool + network: Type[nn.Module], config: DictConfig, log_dir: Path ) -> Type[pl.loggers.LightningLoggerBase]: """Configures lightning logger.""" - if use_wandb: + if config.trainer.wandb: + logger.info("Logging model with W&B") pl_logger = pl.loggers.WandbLogger(save_dir=str(log_dir)) pl_logger.watch(network) - pl_logger.log_hyperparams(vars(args)) + pl_logger.log_hyperparams(vars(config)) return pl_logger + logger.info("Logging model with Tensorboard") return pl.loggers.TensorBoardLogger(save_dir=str(log_dir)) @@ -110,50 +91,36 @@ def _load_lit_model( lit_model_class: type, network: Type[nn.Module], config: DictConfig ) -> Type[pl.LightningModule]: """Load lightning model.""" - if config.load_checkpoint is not None: + if config.trainer.load_checkpoint is not None: logger.info( - f"Loading network weights from checkpoint: {config.load_checkpoint}" + f"Loading network weights from checkpoint: {config.trainer.load_checkpoint}" ) return lit_model_class.load_from_checkpoint( - config.load_checkpoint, network=network, **config.model.args + config.trainer.load_checkpoint, network=network, **config.model.args ) return lit_model_class(network=network, **config.model.args) -def run( - filename: str, - fast_dev_run: bool, - train: bool, - test: bool, - tune: bool, - use_wandb: bool, - verbose: int = 0, -) 
-> None: +def run(config: DictConfig) -> None: """Runs experiment.""" - # Load config. - file_path = CONFIGS_DIRNAME / filename - config = _load_config(file_path) - log_dir = _create_experiment_dir(config) - _configure_logging(log_dir, verbose=verbose) + _configure_logging(log_dir, level=config.trainer.logging) logger.info("Starting experiment...") - # Seed everything in the experiment. - logger.info(f"Seeding everthing with seed={SEED}") - pl.utilities.seed.seed_everything(SEED) + pl.utilities.seed.seed_everything(config.trainer.seed) # Load classes. - data_module_class = _import_class(f"text_recognizer.data.{config.data.type}") + data_module_class = _import_class(f"text_recognizer.data.{config.dataset.type}") network_class = _import_class(f"text_recognizer.networks.{config.network.type}") lit_model_class = _import_class(f"text_recognizer.models.{config.model.type}") # Initialize data object and network. - data_module = data_module_class(**config.data.args) + data_module = data_module_class(**config.dataset.args) network = network_class(**data_module.config(), **config.network.args) # Load callback and logger. callbacks = _configure_callbacks(config.callbacks) - pl_logger = _configure_logger(network, config, log_dir, use_wandb) + pl_logger = _configure_logger(network, config, log_dir) # Load ligtning model. lit_model = _load_lit_model(lit_model_class, network, config) @@ -164,55 +131,28 @@ def run( logger=pl_logger, weights_save_path=str(log_dir), ) - if fast_dev_run: - logger.info("Fast dev run...") + + if config.trainer.tune and not config.trainer.args.fast_dev_run: + logger.info("Tuning learning rate and batch size...") + trainer.tune(lit_model, datamodule=data_module) + + if config.trainer.train: + logger.info("Training network...") trainer.fit(lit_model, datamodule=data_module) - else: - if tune: - logger.info("Tuning learning rate and batch size...") - trainer.tune(lit_model, datamodule=data_module) - - if train: - logger.info("Training network...") - trainer.fit(lit_model, datamodule=data_module) - - if test: - logger.info("Testing network...") - trainer.test(lit_model, datamodule=data_module) - - _save_best_weights(callbacks, use_wandb) - - -@click.command() -@click.option("-f", "--experiment_config", type=str, help="Path to experiment config.") -@click.option("--use_wandb", is_flag=True, help="If true, do use wandb for logging.") -@click.option("--dev", is_flag=True, help="If true, run a fast dev run.") -@click.option( - "--tune", is_flag=True, help="If true, tune hyperparameters for training." -) -@click.option("-t", "--train", is_flag=True, help="If true, train the model.") -@click.option("-e", "--test", is_flag=True, help="If true, test the model.") -@click.option("-v", "--verbose", count=True) -def cli( - experiment_config: str, - use_wandb: bool, - dev: bool, - tune: bool, - train: bool, - test: bool, - verbose: int, -) -> None: - """Run experiment.""" - run( - filename=experiment_config, - fast_dev_run=dev, - train=train, - test=test, - tune=tune, - use_wandb=use_wandb, - verbose=verbose, - ) + + if config.trainer.test and not config.trainer.args.fast_dev_run: + logger.info("Testing network...") + trainer.test(lit_model, datamodule=data_module) + + if not config.trainer.args.fast_dev_run: + _save_best_weights(callbacks, config.trainer.wandb) + + +@hydra.main(config_path="conf", config_name="config") +def main(cfg: DictConfig) -> None: + """Loads config with hydra.""" + run(cfg) if __name__ == "__main__": - cli() + main() -- cgit v1.2.3-70-g09d2
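
Note (not part of the commit): a minimal sketch of how the new conf/ layout is consumed, assuming the Hydra entry point shown in run_experiment.py above. Hydra composes the defaults list in conf/config.yaml (network, model, dataset, trainer, callbacks) into a single DictConfig before run() is called; the group names and keys below come from the files added in this patch, while the script itself is purely illustrative.

    import hydra
    from omegaconf import DictConfig, OmegaConf


    # Illustrative sketch only: same decorator arguments as the new
    # run_experiment.py (config_path="conf", config_name="config").
    @hydra.main(config_path="conf", config_name="config")
    def main(cfg: DictConfig) -> None:
        # Each group selected in conf/config.yaml's defaults list
        # (network=vqvae, model=lit_vqvae, dataset=iam_extended_paragraphs,
        # trainer=default, callbacks=default) lands under the matching key,
        # so the composed config mirrors the old monolithic configs/vqvae.yaml.
        print(OmegaConf.to_yaml(cfg))


    if __name__ == "__main__":
        main()

With this layout, the click flags removed by the patch are replaced by Hydra command-line overrides: whole groups can be swapped (for example callbacks=swa to pick conf/callbacks/swa.yaml) and individual values changed, e.g. python run_experiment.py callbacks=swa trainer.args.fast_dev_run=true dataset.args.batch_size=16.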