Diffstat (limited to 'src/training')
-rw-r--r--  src/training/prepare_experiments.py                                          |  2
-rw-r--r--  src/training/run_experiment.py                                               | 28
-rw-r--r--  src/training/trainer/callbacks/lr_schedulers.py                              |  5
-rw-r--r--  src/training/trainer/callbacks/wandb_callbacks.py                            |  2
-rw-r--r--  src/training/trainer/population_based_training/__init__.py                  |  1
-rw-r--r--  src/training/trainer/population_based_training/population_based_training.py |  1
-rw-r--r--  src/training/trainer/train.py                                                |  9
7 files changed, 31 insertions, 17 deletions
diff --git a/src/training/prepare_experiments.py b/src/training/prepare_experiments.py
index e00540c..6e20bcd 100644
--- a/src/training/prepare_experiments.py
+++ b/src/training/prepare_experiments.py
@@ -1,9 +1,7 @@
"""Run a experiment from a config file."""
import json
-from subprocess import run
import click
-from loguru import logger
import yaml
diff --git a/src/training/run_experiment.py b/src/training/run_experiment.py
index c0f969d..0510d5c 100644
--- a/src/training/run_experiment.py
+++ b/src/training/run_experiment.py
@@ -73,7 +73,7 @@ def _create_experiment_dir(
return experiment_dir, log_dir, model_dir
-def _load_modules_and_arguments(experiment_config: Dict) -> Tuple[Callable, Dict]:
+def _load_modules_and_arguments(experiment_config: Dict,) -> Tuple[Callable, Dict]:
"""Loads all modules and arguments."""
# Load the dataset module.
dataset_args = experiment_config.get("dataset", {})
@@ -104,7 +104,7 @@ def _load_modules_and_arguments(experiment_config: Dict) -> Tuple[Callable, Dict
criterion_ = getattr(custom_loss_module, experiment_config["criterion"]["type"])
else:
criterion_ = getattr(torch.nn, experiment_config["criterion"]["type"])
- criterion_args = experiment_config["criterion"].get("args", {})
+ criterion_args = experiment_config["criterion"].get("args", {}) or {}
# Optimizers
if experiment_config["optimizer"]["type"] == "AdaBelief":
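Note on the `or {}` guard above: when the experiment config comes from YAML and the `args:` key is present but empty, PyYAML parses the value as `None`, so `.get("args", {})` returns `None` instead of the default. A minimal sketch of that behaviour, with a hypothetical config snippet:

```python
# Sketch only: illustrates why `.get("args", {}) or {}` is needed for
# YAML configs where a key exists but has no value.
import yaml

config = yaml.safe_load("""
criterion:
  type: CrossEntropyLoss
  args:
""")

# The "args" key exists, so .get() returns its parsed value, which is None.
assert config["criterion"].get("args", {}) is None

# The `or {}` fallback restores an empty dict that can be unpacked safely.
criterion_args = config["criterion"].get("args", {}) or {}
assert criterion_args == {}
```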
@@ -187,18 +187,20 @@ def _save_config(experiment_dir: Path, experiment_config: Dict) -> None:
def _load_from_checkpoint(
- model: Type[Model], log_dir: Path, model_dir: Path, pretrained_weights: str = None
+ model: Type[Model], model_dir: Path, pretrained_weights: str = None,
) -> None:
"""If checkpoint exists, load model weights and optimizers from checkpoint."""
# Get checkpoint path.
if pretrained_weights is not None:
logger.info(f"Loading weights from {pretrained_weights}.")
- checkpoint_path = Path(pretrained_weights) / "model" / "best.pt"
+ checkpoint_path = (
+ EXPERIMENTS_DIRNAME / Path(pretrained_weights) / "model" / "best.pt"
+ )
else:
logger.info(f"Loading weights from {model_dir}.")
checkpoint_path = model_dir / "last.pt"
if checkpoint_path.exists():
- logger.info("Loading and resuming training from last checkpoint.")
+ logger.info("Loading and resuming training from checkpoint.")
model.load_from_checkpoint(checkpoint_path)
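The hunk above drops the unused `log_dir` parameter and resolves pretrained weights relative to the experiments directory. A standalone sketch of the same path lookup, with `EXPERIMENTS_DIRNAME` stubbed to an assumed value and `resolve_checkpoint` as a hypothetical helper:

```python
# Sketch of the checkpoint lookup; the real EXPERIMENTS_DIRNAME constant
# lives in run_experiment.py and may differ from the value assumed here.
from pathlib import Path
from typing import Optional

EXPERIMENTS_DIRNAME = Path("training") / "experiments"  # assumed value


def resolve_checkpoint(model_dir: Path, pretrained_weights: Optional[str] = None) -> Path:
    """Pretrained weights are resolved relative to the experiments directory;
    otherwise training resumes from the last checkpoint in model_dir."""
    if pretrained_weights is not None:
        return EXPERIMENTS_DIRNAME / Path(pretrained_weights) / "model" / "best.pt"
    return model_dir / "last.pt"
```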
@@ -230,9 +232,9 @@ def run_experiment(
experiment_config: Dict,
save_weights: bool,
device: str,
- use_wandb: bool = False,
- train: bool = True,
- test: bool = False,
+ use_wandb: bool,
+ train: bool,
+ test: bool,
verbose: int = 0,
checkpoint: Optional[str] = None,
pretrained_weights: Optional[str] = None,
@@ -264,7 +266,7 @@ def run_experiment(
resume = False
if checkpoint is not None or pretrained_weights is not None:
resume = True
- _load_from_checkpoint(model, log_dir, model_dir, pretrained_weights)
+ _load_from_checkpoint(model, model_dir, pretrained_weights)
logger.info(f"The class mapping is {model.mapping}")
@@ -297,6 +299,7 @@ def run_experiment(
max_epochs=experiment_config["train_args"]["max_epochs"],
callbacks=callbacks,
transformer_model=experiment_config["train_args"]["transformer_model"],
+ max_norm=experiment_config["train_args"]["max_norm"],
)
# Train the model.
@@ -309,7 +312,7 @@ def run_experiment(
model.load_from_checkpoint(model_dir / "best.pt")
logger.info("Running inference on test set.")
- if experiment_config["criterion"]["type"] in custom_loss_module.__all__:
+ if experiment_config["criterion"]["type"] == "EmbeddingLoss":
logger.info("Evaluating embedding.")
score = evaluate_embedding(model)
else:
@@ -341,13 +344,15 @@ def run_experiment(
@click.option(
"--nowandb", is_flag=False, help="If true, do not use wandb for this run."
)
-@click.option("--notrain", is_flag=False, help="Do not train the model.")
@click.option("--test", is_flag=True, help="If true, test the model.")
@click.option("-v", "--verbose", count=True)
@click.option("--checkpoint", type=str, help="Path to the experiment.")
@click.option(
"--pretrained_weights", type=str, help="Path to pretrained model weights."
)
+@click.option(
+ "--notrain", is_flag=False, is_eager=True, help="Do not train the model.",
+)
def run_cli(
experiment_config: str,
gpu: int,
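For reference, a boolean on/off switch in click is usually declared with `is_flag=True`; with `is_flag=False` the option expects an explicit value on the command line. A minimal sketch (not the project's CLI, option names are illustrative only):

```python
# Minimal click sketch showing the usual declaration of boolean switches.
import click


@click.command()
@click.option("--notrain", is_flag=True, help="Do not train the model.")
@click.option("--test", is_flag=True, help="If true, test the model.")
def cli(notrain: bool, test: bool) -> None:
    click.echo(f"train={not notrain} test={test}")


if __name__ == "__main__":
    cli()
```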
@@ -367,6 +372,7 @@ def run_cli(
experiment_config = json.loads(experiment_config)
os.environ["CUDA_VISIBLE_DEVICES"] = f"{gpu}"
+
run_experiment(
experiment_config,
save,
diff --git a/src/training/trainer/callbacks/lr_schedulers.py b/src/training/trainer/callbacks/lr_schedulers.py
index 907e292..630c434 100644
--- a/src/training/trainer/callbacks/lr_schedulers.py
+++ b/src/training/trainer/callbacks/lr_schedulers.py
@@ -22,7 +22,10 @@ class LRScheduler(Callback):
def on_epoch_end(self, epoch: int, logs: Optional[Dict] = None) -> None:
"""Takes a step at the end of every epoch."""
if self.interval == "epoch":
- self.lr_scheduler.step()
+ if "ReduceLROnPlateau" in self.lr_scheduler.__class__.__name__:
+ self.lr_scheduler.step(logs["val_loss"])
+ else:
+ self.lr_scheduler.step()
def on_train_batch_end(self, batch: int, logs: Optional[Dict] = None) -> None:
"""Takes a step at the end of every training batch."""
diff --git a/src/training/trainer/callbacks/wandb_callbacks.py b/src/training/trainer/callbacks/wandb_callbacks.py
index f24e5cc..1627f17 100644
--- a/src/training/trainer/callbacks/wandb_callbacks.py
+++ b/src/training/trainer/callbacks/wandb_callbacks.py
@@ -111,7 +111,7 @@ class WandbImageLogger(Callback):
]
).rstrip("_")
else:
- ground_truth = self.targets[i]
+ ground_truth = self.model.mapper(int(self.targets[i]))
caption = f"Prediction: {pred} Confidence: {conf:.3f} Ground Truth: {ground_truth}"
images.append(wandb.Image(image, caption=caption))
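The change above assumes `model.mapper` maps an integer class index back to its label, so the wandb caption shows the readable ground truth instead of the raw index. A small illustration with a stand-in mapper (the repository's mapper may differ):

```python
# Illustration only: a stand-in index-to-label mapper mirroring how the
# caption is built before being passed to wandb.Image.
mapping = ["a", "b", "c"]


def mapper(index: int) -> str:
    return mapping[index]


target = 1  # e.g. a tensor element cast with int(...)
pred, conf = "b", 0.873
caption = f"Prediction: {pred} Confidence: {conf:.3f} Ground Truth: {mapper(int(target))}"
```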
diff --git a/src/training/trainer/population_based_training/__init__.py b/src/training/trainer/population_based_training/__init__.py
deleted file mode 100644
index 868d739..0000000
--- a/src/training/trainer/population_based_training/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-"""TBC."""
diff --git a/src/training/trainer/population_based_training/population_based_training.py b/src/training/trainer/population_based_training/population_based_training.py
deleted file mode 100644
index 868d739..0000000
--- a/src/training/trainer/population_based_training/population_based_training.py
+++ /dev/null
@@ -1 +0,0 @@
-"""TBC."""
diff --git a/src/training/trainer/train.py b/src/training/trainer/train.py
index fb49103..223d9c6 100644
--- a/src/training/trainer/train.py
+++ b/src/training/trainer/train.py
@@ -33,6 +33,7 @@ class Trainer:
max_epochs: int,
callbacks: List[Type[Callback]],
transformer_model: bool = False,
+ max_norm: float = 0.0,
) -> None:
"""Initialization of the Trainer.
@@ -40,6 +41,7 @@ class Trainer:
max_epochs (int): The maximum number of epochs in the training loop.
callbacks (CallbackList): List of callbacks to be called.
transformer_model (bool): Transformer model flag, modifies the input to the model. Default is False.
+ max_norm (float): Max norm for gradient clipping. Defaults to 0.0.
"""
# Training arguments.
@@ -52,6 +54,8 @@ class Trainer:
self.transformer_model = transformer_model
+ self.max_norm = max_norm
+
# Model placeholders
self.model = None
@@ -124,6 +128,11 @@ class Trainer:
# Compute the gradients.
loss.backward()
+ if self.max_norm > 0:
+ torch.nn.utils.clip_grad_norm_(
+ self.model.network.parameters(), self.max_norm
+ )
+
# Perform updates using calculated gradients.
self.model.optimizer.step()
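The new `max_norm` argument enables gradient clipping with `torch.nn.utils.clip_grad_norm_` between the backward pass and the optimizer step, and the 0.0 default leaves it disabled. A minimal training-step sketch of where the clipping sits; the network, data, and `max_norm` value are placeholders:

```python
# Sketch of a single training step with optional gradient clipping.
import torch

network = torch.nn.Linear(16, 4)
optimizer = torch.optim.SGD(network.parameters(), lr=0.1)
criterion = torch.nn.CrossEntropyLoss()
max_norm = 1.0  # 0.0 would disable clipping, matching the Trainer default

data = torch.randn(8, 16)
targets = torch.randint(0, 4, (8,))

optimizer.zero_grad()
loss = criterion(network(data), targets)
loss.backward()

if max_norm > 0:
    # Rescale gradients in place so their global norm is at most max_norm.
    torch.nn.utils.clip_grad_norm_(network.parameters(), max_norm)

optimizer.step()
```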