Diffstat (limited to 'src/training')
-rw-r--r-- | src/training/prepare_experiments.py | 2
-rw-r--r-- | src/training/run_experiment.py | 28
-rw-r--r-- | src/training/trainer/callbacks/lr_schedulers.py | 5
-rw-r--r-- | src/training/trainer/callbacks/wandb_callbacks.py | 2
-rw-r--r-- | src/training/trainer/population_based_training/__init__.py | 1
-rw-r--r-- | src/training/trainer/population_based_training/population_based_training.py | 1
-rw-r--r-- | src/training/trainer/train.py | 9
7 files changed, 31 insertions, 17 deletions
diff --git a/src/training/prepare_experiments.py b/src/training/prepare_experiments.py
index e00540c..6e20bcd 100644
--- a/src/training/prepare_experiments.py
+++ b/src/training/prepare_experiments.py
@@ -1,9 +1,7 @@
 """Run a experiment from a config file."""
 import json
-from subprocess import run
 
 import click
-from loguru import logger
 import yaml
diff --git a/src/training/run_experiment.py b/src/training/run_experiment.py
index c0f969d..0510d5c 100644
--- a/src/training/run_experiment.py
+++ b/src/training/run_experiment.py
@@ -73,7 +73,7 @@ def _create_experiment_dir(
     return experiment_dir, log_dir, model_dir
 
 
-def _load_modules_and_arguments(experiment_config: Dict) -> Tuple[Callable, Dict]:
+def _load_modules_and_arguments(experiment_config: Dict,) -> Tuple[Callable, Dict]:
     """Loads all modules and arguments."""
     # Load the dataset module.
     dataset_args = experiment_config.get("dataset", {})
@@ -104,7 +104,7 @@ def _load_modules_and_arguments(experiment_config: Dict) -> Tuple[Callable, Dict
         criterion_ = getattr(custom_loss_module, experiment_config["criterion"]["type"])
     else:
         criterion_ = getattr(torch.nn, experiment_config["criterion"]["type"])
-    criterion_args = experiment_config["criterion"].get("args", {})
+    criterion_args = experiment_config["criterion"].get("args", {}) or {}
 
     # Optimizers
     if experiment_config["optimizer"]["type"] == "AdaBelief":
@@ -187,18 +187,20 @@ def _save_config(experiment_dir: Path, experiment_config: Dict) -> None:
 def _load_from_checkpoint(
-    model: Type[Model], log_dir: Path, model_dir: Path, pretrained_weights: str = None
+    model: Type[Model], model_dir: Path, pretrained_weights: str = None,
 ) -> None:
     """If checkpoint exists, load model weights and optimizers from checkpoint."""
     # Get checkpoint path.
     if pretrained_weights is not None:
         logger.info(f"Loading weights from {pretrained_weights}.")
-        checkpoint_path = Path(pretrained_weights) / "model" / "best.pt"
+        checkpoint_path = (
+            EXPERIMENTS_DIRNAME / Path(pretrained_weights) / "model" / "best.pt"
+        )
     else:
         logger.info(f"Loading weights from {model_dir}.")
         checkpoint_path = model_dir / "last.pt"
 
     if checkpoint_path.exists():
-        logger.info("Loading and resuming training from last checkpoint.")
+        logger.info("Loading and resuming training from checkpoint.")
         model.load_from_checkpoint(checkpoint_path)
@@ -230,9 +232,9 @@ def run_experiment(
     experiment_config: Dict,
     save_weights: bool,
     device: str,
-    use_wandb: bool = False,
-    train: bool = True,
-    test: bool = False,
+    use_wandb: bool,
+    train: bool,
+    test: bool,
     verbose: int = 0,
     checkpoint: Optional[str] = None,
     pretrained_weights: Optional[str] = None,
@@ -264,7 +266,7 @@ def run_experiment(
     resume = False
     if checkpoint is not None or pretrained_weights is not None:
         resume = True
-        _load_from_checkpoint(model, log_dir, model_dir, pretrained_weights)
+        _load_from_checkpoint(model, model_dir, pretrained_weights)
 
     logger.info(f"The class mapping is {model.mapping}")
@@ -297,6 +299,7 @@ def run_experiment(
         max_epochs=experiment_config["train_args"]["max_epochs"],
         callbacks=callbacks,
         transformer_model=experiment_config["train_args"]["transformer_model"],
+        max_norm=experiment_config["train_args"]["max_norm"],
     )
 
     # Train the model.
@@ -309,7 +312,7 @@ def run_experiment(
         model.load_from_checkpoint(model_dir / "best.pt")
 
         logger.info("Running inference on test set.")
-        if experiment_config["criterion"]["type"] in custom_loss_module.__all__:
+        if experiment_config["criterion"]["type"] == "EmbeddingLoss":
             logger.info("Evaluating embedding.")
             score = evaluate_embedding(model)
         else:
@@ -341,13 +344,15 @@ def run_experiment(
 @click.option(
     "--nowandb", is_flag=False, help="If true, do not use wandb for this run."
 )
-@click.option("--notrain", is_flag=False, help="Do not train the model.")
 @click.option("--test", is_flag=True, help="If true, test the model.")
 @click.option("-v", "--verbose", count=True)
 @click.option("--checkpoint", type=str, help="Path to the experiment.")
 @click.option(
     "--pretrained_weights", type=str, help="Path to pretrained model weights."
 )
+@click.option(
+    "--notrain", is_flag=False, is_eager=True, help="Do not train the model.",
+)
 def run_cli(
     experiment_config: str,
     gpu: int,
@@ -367,6 +372,7 @@ def run_cli(
     experiment_config = json.loads(experiment_config)
 
     os.environ["CUDA_VISIBLE_DEVICES"] = f"{gpu}"
+
     run_experiment(
         experiment_config,
         save,
diff --git a/src/training/trainer/callbacks/lr_schedulers.py b/src/training/trainer/callbacks/lr_schedulers.py
index 907e292..630c434 100644
--- a/src/training/trainer/callbacks/lr_schedulers.py
+++ b/src/training/trainer/callbacks/lr_schedulers.py
@@ -22,7 +22,10 @@ class LRScheduler(Callback):
     def on_epoch_end(self, epoch: int, logs: Optional[Dict] = None) -> None:
         """Takes a step at the end of every epoch."""
         if self.interval == "epoch":
-            self.lr_scheduler.step()
+            if "ReduceLROnPlateau" in self.lr_scheduler.__class__.__name__:
+                self.lr_scheduler.step(logs["val_loss"])
+            else:
+                self.lr_scheduler.step()
 
     def on_train_batch_end(self, batch: int, logs: Optional[Dict] = None) -> None:
         """Takes a step at the end of every training batch."""
diff --git a/src/training/trainer/callbacks/wandb_callbacks.py b/src/training/trainer/callbacks/wandb_callbacks.py
index f24e5cc..1627f17 100644
--- a/src/training/trainer/callbacks/wandb_callbacks.py
+++ b/src/training/trainer/callbacks/wandb_callbacks.py
@@ -111,7 +111,7 @@ class WandbImageLogger(Callback):
                     ]
                 ).rstrip("_")
             else:
-                ground_truth = self.targets[i]
+                ground_truth = self.model.mapper(int(self.targets[i]))
             caption = f"Prediction: {pred} Confidence: {conf:.3f} Ground Truth: {ground_truth}"
             images.append(wandb.Image(image, caption=caption))
diff --git a/src/training/trainer/population_based_training/__init__.py b/src/training/trainer/population_based_training/__init__.py
deleted file mode 100644
index 868d739..0000000
--- a/src/training/trainer/population_based_training/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-"""TBC."""
diff --git a/src/training/trainer/population_based_training/population_based_training.py b/src/training/trainer/population_based_training/population_based_training.py
deleted file mode 100644
index 868d739..0000000
--- a/src/training/trainer/population_based_training/population_based_training.py
+++ /dev/null
@@ -1 +0,0 @@
-"""TBC."""
diff --git a/src/training/trainer/train.py b/src/training/trainer/train.py
index fb49103..223d9c6 100644
--- a/src/training/trainer/train.py
+++ b/src/training/trainer/train.py
@@ -33,6 +33,7 @@ class Trainer:
         max_epochs: int,
         callbacks: List[Type[Callback]],
         transformer_model: bool = False,
+        max_norm: float = 0.0,
     ) -> None:
         """Initialization of the Trainer.
 
@@ -40,6 +41,7 @@
             max_epochs (int): The maximum number of epochs in the training loop.
             callbacks (CallbackList): List of callbacks to be called.
             transformer_model (bool): Transformer model flag, modifies the input to the model. Default is False.
+            max_norm (float): Max norm for gradient clipping. Defaults to 0.0.
 
         """
         # Training arguments.
@@ -52,6 +54,8 @@ class Trainer:
 
         self.transformer_model = transformer_model
 
+        self.max_norm = max_norm
+
         # Model placeholders
         self.model = None
@@ -124,6 +128,11 @@ class Trainer:
             # Compute the gradients.
             loss.backward()
 
+            if self.max_norm > 0:
+                torch.nn.utils.clip_grad_norm_(
+                    self.model.network.parameters(), self.max_norm
+                )
+
             # Perform updates using calculated gradients.
             self.model.optimizer.step()
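
Note on the criterion_args change in run_experiment.py: dict.get only falls back to its default when the key is absent, so a config that declares the key with a null value still yields None. A small illustration with a made-up config dict (not taken from the repository):

config = {"criterion": {"type": "CrossEntropyLoss", "args": None}}

# The key exists, so .get() returns None and the default {} is never used.
assert config["criterion"].get("args", {}) is None

# Appending "or {}" normalizes None (e.g. an empty "args:" entry in YAML)
# to an empty dict, so it can later be unpacked safely with **criterion_args.
assert (config["criterion"].get("args", {}) or {}) == {}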
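Note on the lr_schedulers.py change: torch.optim.lr_scheduler.ReduceLROnPlateau is the one built-in scheduler whose step() expects the monitored metric, which is why the callback now branches on the class name and passes logs["val_loss"]. A minimal sketch of the difference, using placeholder names (network, val_loss) rather than objects from this repository:

import torch

network = torch.nn.Linear(10, 2)
optimizer = torch.optim.SGD(network.parameters(), lr=0.1)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode="min", factor=0.1, patience=3
)

for epoch in range(5):
    val_loss = 1.0  # placeholder for the epoch's validation loss
    # ReduceLROnPlateau decides whether to decay the learning rate from the
    # metric it is given, so it must be stepped with that value; schedulers
    # such as CosineAnnealingLR are stepped with no arguments instead.
    scheduler.step(val_loss)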
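Note on the new max_norm argument in train.py: it enables gradient clipping via torch.nn.utils.clip_grad_norm_ between the backward pass and the optimizer step. Below is a minimal, self-contained sketch of the same pattern; the model, data, and optimizer here are placeholders, not the Trainer's actual objects:

import torch

model = torch.nn.Linear(10, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
max_norm = 1.0  # would come from experiment_config["train_args"]["max_norm"]

x = torch.randn(8, 10)
y = torch.randint(0, 2, (8,))

loss = torch.nn.functional.cross_entropy(model(x), y)
loss.backward()

# Rescale gradients so their total norm does not exceed max_norm; a value
# of 0.0 (the Trainer's default) skips clipping entirely.
if max_norm > 0:
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)

optimizer.step()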