summaryrefslogtreecommitdiff
path: root/src/training
diff options
context:
space:
mode:
Diffstat (limited to 'src/training')
-rw-r--r--src/training/callbacks/__init__.py20
-rw-r--r--src/training/callbacks/base.py231
-rw-r--r--src/training/callbacks/early_stopping.py106
-rw-r--r--src/training/callbacks/lr_schedulers.py97
-rw-r--r--src/training/callbacks/wandb_callbacks.py93
-rw-r--r--src/training/experiments/CharacterModel_Emnist_LeNet/0721_231455/config.yml48
-rw-r--r--src/training/experiments/CharacterModel_Emnist_LeNet/0721_231455/model/best.ptbin14483400 -> 0 bytes
-rw-r--r--src/training/experiments/CharacterModel_Emnist_LeNet/0721_231455/model/last.ptbin14483400 -> 0 bytes
-rw-r--r--src/training/experiments/CharacterModel_Emnist_LeNet/0722_190746/config.yml48
-rw-r--r--src/training/experiments/CharacterModel_Emnist_LeNet/0722_190746/model/best.ptbin14483400 -> 0 bytes
-rw-r--r--src/training/experiments/CharacterModel_Emnist_LeNet/0722_190746/model/last.ptbin14483400 -> 0 bytes
-rw-r--r--src/training/experiments/CharacterModel_Emnist_MLP/0721_124928/config.yml43
-rw-r--r--src/training/experiments/CharacterModel_Emnist_MLP/0721_141139/config.yml47
-rw-r--r--src/training/experiments/CharacterModel_Emnist_MLP/0721_141213/config.yml47
-rw-r--r--src/training/experiments/CharacterModel_Emnist_MLP/0721_141213/model/best.ptbin1901268 -> 0 bytes
-rw-r--r--src/training/experiments/CharacterModel_Emnist_MLP/0721_141213/model/last.ptbin1901268 -> 0 bytes
-rw-r--r--src/training/experiments/CharacterModel_Emnist_MLP/0721_141433/config.yml47
-rw-r--r--src/training/experiments/CharacterModel_Emnist_MLP/0721_141433/model/best.ptbin1901268 -> 0 bytes
-rw-r--r--src/training/experiments/CharacterModel_Emnist_MLP/0721_141433/model/last.ptbin1901268 -> 0 bytes
-rw-r--r--src/training/experiments/CharacterModel_Emnist_MLP/0721_141702/config.yml47
-rw-r--r--src/training/experiments/CharacterModel_Emnist_MLP/0721_141702/model/best.ptbin1901268 -> 0 bytes
-rw-r--r--src/training/experiments/CharacterModel_Emnist_MLP/0721_141702/model/last.ptbin1901268 -> 0 bytes
-rw-r--r--src/training/experiments/CharacterModel_Emnist_MLP/0721_145028/config.yml47
-rw-r--r--src/training/experiments/CharacterModel_Emnist_MLP/0721_150212/config.yml47
-rw-r--r--src/training/experiments/CharacterModel_Emnist_MLP/0721_150301/config.yml47
-rw-r--r--src/training/experiments/CharacterModel_Emnist_MLP/0721_150317/config.yml47
-rw-r--r--src/training/experiments/CharacterModel_Emnist_MLP/0721_151135/config.yml47
-rw-r--r--src/training/experiments/CharacterModel_Emnist_MLP/0721_151135/model/best.ptbin1901268 -> 0 bytes
-rw-r--r--src/training/experiments/CharacterModel_Emnist_MLP/0721_151135/model/last.ptbin1901268 -> 0 bytes
-rw-r--r--src/training/experiments/CharacterModel_Emnist_MLP/0721_151408/config.yml47
-rw-r--r--src/training/experiments/CharacterModel_Emnist_MLP/0721_153144/config.yml47
-rw-r--r--src/training/experiments/CharacterModel_Emnist_MLP/0721_153207/config.yml47
-rw-r--r--src/training/experiments/CharacterModel_Emnist_MLP/0721_153310/config.yml47
-rw-r--r--src/training/experiments/CharacterModel_Emnist_MLP/0721_153310/model/best.ptbin1702142 -> 0 bytes
-rw-r--r--src/training/experiments/CharacterModel_Emnist_MLP/0721_153310/model/last.ptbin1702142 -> 0 bytes
-rw-r--r--src/training/experiments/CharacterModel_Emnist_MLP/0721_175150/config.yml47
-rw-r--r--src/training/experiments/CharacterModel_Emnist_MLP/0721_175150/model/best.ptbin1702142 -> 0 bytes
-rw-r--r--src/training/experiments/CharacterModel_Emnist_MLP/0721_175150/model/last.ptbin1702142 -> 0 bytes
-rw-r--r--src/training/experiments/CharacterModel_Emnist_MLP/0721_180741/config.yml47
-rw-r--r--src/training/experiments/CharacterModel_Emnist_MLP/0721_180741/model/best.ptbin1702142 -> 0 bytes
-rw-r--r--src/training/experiments/CharacterModel_Emnist_MLP/0721_180741/model/last.ptbin1702142 -> 0 bytes
-rw-r--r--src/training/experiments/CharacterModel_Emnist_MLP/0721_181933/config.yml46
-rw-r--r--src/training/experiments/CharacterModel_Emnist_MLP/0721_181933/model/best.ptbin1702114 -> 0 bytes
-rw-r--r--src/training/experiments/CharacterModel_Emnist_MLP/0721_181933/model/last.ptbin1702114 -> 0 bytes
-rw-r--r--src/training/experiments/CharacterModel_Emnist_MLP/0721_183347/config.yml46
-rw-r--r--src/training/experiments/CharacterModel_Emnist_MLP/0721_183347/model/best.ptbin1702114 -> 0 bytes
-rw-r--r--src/training/experiments/CharacterModel_Emnist_MLP/0721_183347/model/last.ptbin1702114 -> 0 bytes
-rw-r--r--src/training/experiments/CharacterModel_Emnist_MLP/0721_190044/config.yml46
-rw-r--r--src/training/experiments/CharacterModel_Emnist_MLP/0721_190044/model/best.ptbin1702135 -> 0 bytes
-rw-r--r--src/training/experiments/CharacterModel_Emnist_MLP/0721_190044/model/last.ptbin1702135 -> 0 bytes
-rw-r--r--src/training/experiments/CharacterModel_Emnist_MLP/0721_190633/config.yml46
-rw-r--r--src/training/experiments/CharacterModel_Emnist_MLP/0721_190633/model/best.ptbin1702135 -> 0 bytes
-rw-r--r--src/training/experiments/CharacterModel_Emnist_MLP/0721_190633/model/last.ptbin1702135 -> 0 bytes
-rw-r--r--src/training/experiments/CharacterModel_Emnist_MLP/0721_190738/config.yml46
-rw-r--r--src/training/experiments/CharacterModel_Emnist_MLP/0721_190738/model/best.ptbin1702135 -> 0 bytes
-rw-r--r--src/training/experiments/CharacterModel_Emnist_MLP/0721_190738/model/last.ptbin1702135 -> 0 bytes
-rw-r--r--src/training/experiments/CharacterModel_Emnist_MLP/0721_191111/config.yml46
-rw-r--r--src/training/experiments/CharacterModel_Emnist_MLP/0721_191310/config.yml46
-rw-r--r--src/training/experiments/CharacterModel_Emnist_MLP/0721_191310/model/best.ptbin1702135 -> 0 bytes
-rw-r--r--src/training/experiments/CharacterModel_Emnist_MLP/0721_191310/model/last.ptbin1702135 -> 0 bytes
-rw-r--r--src/training/experiments/CharacterModel_Emnist_MLP/0721_191412/config.yml42
-rw-r--r--src/training/experiments/CharacterModel_Emnist_MLP/0721_191412/model/best.ptbin1135058 -> 0 bytes
-rw-r--r--src/training/experiments/CharacterModel_Emnist_MLP/0721_191412/model/last.ptbin1135058 -> 0 bytes
-rw-r--r--src/training/experiments/CharacterModel_Emnist_MLP/0721_191504/config.yml42
-rw-r--r--src/training/experiments/CharacterModel_Emnist_MLP/0721_191504/model/best.ptbin1135058 -> 0 bytes
-rw-r--r--src/training/experiments/CharacterModel_Emnist_MLP/0721_191504/model/last.ptbin1135058 -> 0 bytes
-rw-r--r--src/training/experiments/CharacterModel_Emnist_MLP/0721_191826/config.yml47
-rw-r--r--src/training/experiments/CharacterModel_Emnist_MLP/0721_191826/model/best.ptbin1702135 -> 0 bytes
-rw-r--r--src/training/experiments/CharacterModel_Emnist_MLP/0721_191826/model/last.ptbin1702135 -> 0 bytes
-rw-r--r--src/training/experiments/CharacterModel_Emnist_MLP/0722_191559/config.yml49
-rw-r--r--src/training/experiments/CharacterModel_Emnist_MLP/0722_191559/model/best.ptbin1702233 -> 0 bytes
-rw-r--r--src/training/experiments/CharacterModel_Emnist_MLP/0722_191559/model/last.ptbin1702249 -> 0 bytes
-rw-r--r--src/training/experiments/CharacterModel_Emnist_MLP/0722_213125/config.yml49
-rw-r--r--src/training/experiments/CharacterModel_Emnist_MLP/0722_213413/config.yml49
-rw-r--r--src/training/experiments/CharacterModel_Emnist_MLP/0722_213413/model/best.ptbin1702233 -> 0 bytes
-rw-r--r--src/training/experiments/CharacterModel_Emnist_MLP/0722_213413/model/last.ptbin1702233 -> 0 bytes
-rw-r--r--src/training/experiments/CharacterModel_Emnist_MLP/0722_213549/config.yml49
-rw-r--r--src/training/experiments/CharacterModel_Emnist_MLP/0722_213549/model/best.ptbin1702233 -> 0 bytes
-rw-r--r--src/training/experiments/CharacterModel_Emnist_MLP/0722_213549/model/last.ptbin1702233 -> 0 bytes
-rw-r--r--src/training/experiments/sample.yml43
-rw-r--r--src/training/experiments/sample_experiment.yml34
-rw-r--r--src/training/prepare_experiments.py18
-rw-r--r--src/training/run_experiment.py45
-rw-r--r--src/training/train.py237
84 files changed, 705 insertions, 1712 deletions
diff --git a/src/training/callbacks/__init__.py b/src/training/callbacks/__init__.py
index 868d739..fbcc285 100644
--- a/src/training/callbacks/__init__.py
+++ b/src/training/callbacks/__init__.py
@@ -1 +1,19 @@
-"""TBC."""
+"""The callback modules used in the training script."""
+from .base import Callback, CallbackList, Checkpoint
+from .early_stopping import EarlyStopping
+from .lr_schedulers import CyclicLR, MultiStepLR, OneCycleLR, ReduceLROnPlateau, StepLR
+from .wandb_callbacks import WandbCallback, WandbImageLogger
+
+__all__ = [
+ "Callback",
+ "CallbackList",
+ "Checkpoint",
+ "EarlyStopping",
+ "WandbCallback",
+ "WandbImageLogger",
+ "CyclicLR",
+ "MultiStepLR",
+ "OneCycleLR",
+ "ReduceLROnPlateau",
+ "StepLR",
+]
diff --git a/src/training/callbacks/base.py b/src/training/callbacks/base.py
index d80a1e5..e0d91e6 100644
--- a/src/training/callbacks/base.py
+++ b/src/training/callbacks/base.py
@@ -1,12 +1,33 @@
"""Metaclass for callback functions."""
-from abc import ABC
-from typing import Callable, List, Type
+from enum import Enum
+from typing import Callable, Dict, List, Type, Union
+from loguru import logger
+import numpy as np
+import torch
-class Callback(ABC):
+from text_recognizer.models import Model
+
+
+class ModeKeys:
+ """Mode keys for CallbackList."""
+
+ TRAIN = "train"
+ VALIDATION = "validation"
+
+
+class Callback:
"""Metaclass for callbacks used in training."""
+ def __init__(self) -> None:
+ """Initializes the Callback instance."""
+ self.model = None
+
+ def set_model(self, model: Type[Model]) -> None:
+ """Set the model."""
+ self.model = model
+
def on_fit_begin(self) -> None:
"""Called when fit begins."""
pass
@@ -15,35 +36,27 @@ class Callback(ABC):
"""Called when fit ends."""
pass
- def on_train_epoch_begin(self) -> None:
- """Called at the beginning of an epoch."""
- pass
-
- def on_train_epoch_end(self) -> None:
- """Called at the end of an epoch."""
+ def on_epoch_begin(self, epoch: int, logs: Dict = {}) -> None:
+ """Called at the beginning of an epoch. Only used in training mode."""
pass
- def on_val_epoch_begin(self) -> None:
- """Called at the beginning of an epoch."""
+ def on_epoch_end(self, epoch: int, logs: Dict = {}) -> None:
+ """Called at the end of an epoch. Only used in training mode."""
pass
- def on_val_epoch_end(self) -> None:
- """Called at the end of an epoch."""
- pass
-
- def on_train_batch_begin(self) -> None:
+ def on_train_batch_begin(self, batch: int, logs: Dict = {}) -> None:
"""Called at the beginning of an epoch."""
pass
- def on_train_batch_end(self) -> None:
+ def on_train_batch_end(self, batch: int, logs: Dict = {}) -> None:
"""Called at the end of an epoch."""
pass
- def on_val_batch_begin(self) -> None:
+ def on_validation_batch_begin(self, batch: int, logs: Dict = {}) -> None:
"""Called at the beginning of an epoch."""
pass
- def on_val_batch_end(self) -> None:
+ def on_validation_batch_end(self, batch: int, logs: Dict = {}) -> None:
"""Called at the end of an epoch."""
pass
@@ -51,9 +64,29 @@ class Callback(ABC):
class CallbackList:
"""Container for abstracting away callback calls."""
- def __init__(self, callbacks: List[Callable] = None) -> None:
- """TBC."""
- self._callbacks = callbacks if callbacks is not None else []
+ mode_keys = ModeKeys()
+
+ def __init__(self, model: Type[Model], callbacks: List[Callback] = None) -> None:
+ """Container for `Callback` instances.
+
+ This object wraps a list of `Callback` instances and allows them all to be
+ called via a single end point.
+
+ Args:
+ model (Type[Model]): A `Model` instance.
+ callbacks (List[Callback]): List of `Callback` instances. Defaults to None.
+
+ """
+
+ self._callbacks = callbacks or []
+ if model:
+ self.set_model(model)
+
+ def set_model(self, model: Type[Model]) -> None:
+ """Set the model for all callbacks."""
+ self.model = model
+ for callback in self._callbacks:
+ callback.set_model(model=self.model)
def append(self, callback: Type[Callback]) -> None:
"""Append new callback to callback list."""
@@ -61,41 +94,147 @@ class CallbackList:
def on_fit_begin(self) -> None:
"""Called when fit begins."""
- for _ in self._callbacks:
- pass
+ for callback in self._callbacks:
+ callback.on_fit_begin()
def on_fit_end(self) -> None:
"""Called when fit ends."""
- pass
+ for callback in self._callbacks:
+ callback.on_fit_end()
- def on_train_epoch_begin(self) -> None:
+ def on_epoch_begin(self, epoch: int, logs: Dict = {}) -> None:
"""Called at the beginning of an epoch."""
- pass
+ for callback in self._callbacks:
+ callback.on_epoch_begin(epoch, logs)
- def on_train_epoch_end(self) -> None:
+ def on_epoch_end(self, epoch: int, logs: Dict = {}) -> None:
"""Called at the end of an epoch."""
- pass
-
- def on_val_epoch_begin(self) -> None:
+ for callback in self._callbacks:
+ callback.on_epoch_end(epoch, logs)
+
+ def _call_batch_hook(
+ self, mode: str, hook: str, batch: int, logs: Dict = {}
+ ) -> None:
+ """Helper function for all batch_{begin | end} methods."""
+ if hook == "begin":
+ self._call_batch_begin_hook(mode, batch, logs)
+ elif hook == "end":
+ self._call_batch_end_hook(mode, batch, logs)
+ else:
+ raise ValueError(f"Unrecognized hook {hook}.")
+
+ def _call_batch_begin_hook(self, mode: str, batch: int, logs: Dict = {}) -> None:
+ """Helper function for all `on_*_batch_begin` methods."""
+ hook_name = f"on_{mode}_batch_begin"
+ self._call_batch_hook_helper(hook_name, batch, logs)
+
+ def _call_batch_end_hook(self, mode: str, batch: int, logs: Dict = {}) -> None:
+ """Helper function for all `on_*_batch_end` methods."""
+ hook_name = f"on_{mode}_batch_end"
+ self._call_batch_hook_helper(hook_name, batch, logs)
+
+ def _call_batch_hook_helper(
+ self, hook_name: str, batch: int, logs: Dict = {}
+ ) -> None:
+ """Helper function for `on_*_batch_begin` methods."""
+ for callback in self._callbacks:
+ hook = getattr(callback, hook_name)
+ hook(batch, logs)
+
+ def on_train_batch_begin(self, batch: int, logs: Dict = {}) -> None:
"""Called at the beginning of an epoch."""
- pass
+ self._call_batch_hook(self.mode_keys.TRAIN, "begin", batch)
- def on_val_epoch_end(self) -> None:
+ def on_train_batch_end(self, batch: int, logs: Dict = {}) -> None:
"""Called at the end of an epoch."""
- pass
+ self._call_batch_hook(self.mode_keys.TRAIN, "end", batch)
- def on_train_batch_begin(self) -> None:
+ def on_validation_batch_begin(self, batch: int, logs: Dict = {}) -> None:
"""Called at the beginning of an epoch."""
- pass
-
- def on_train_batch_end(self) -> None:
- """Called at the end of an epoch."""
- pass
+ self._call_batch_hook(self.mode_keys.VALIDATION, "begin", batch)
- def on_val_batch_begin(self) -> None:
- """Called at the beginning of an epoch."""
- pass
-
- def on_val_batch_end(self) -> None:
+ def on_validation_batch_end(self, batch: int, logs: Dict = {}) -> None:
"""Called at the end of an epoch."""
- pass
+ self._call_batch_hook(self.mode_keys.VALIDATION, "end", batch)
+
+ def __iter__(self) -> iter:
+ """Iter function for callback list."""
+ return iter(self._callbacks)
+
+
+class Checkpoint(Callback):
+ """Saving model parameters at the end of each epoch."""
+
+ mode_dict = {
+ "min": torch.lt,
+ "max": torch.gt,
+ }
+
+ def __init__(
+ self, monitor: str = "accuracy", mode: str = "auto", min_delta: float = 0.0
+ ) -> None:
+ """Monitors a quantity that will allow us to determine the best model weights.
+
+ Args:
+ monitor (str): Name of the quantity to monitor. Defaults to "accuracy".
+ mode (str): Description of parameter `mode`. Defaults to "auto".
+ min_delta (float): Description of parameter `min_delta`. Defaults to 0.0.
+
+ """
+ super().__init__()
+ self.monitor = monitor
+ self.mode = mode
+ self.min_delta = torch.tensor(min_delta)
+
+ if mode not in ["auto", "min", "max"]:
+ logger.warning(f"Checkpoint mode {mode} is unkown, fallback to auto mode.")
+
+ self.mode = "auto"
+
+ if self.mode == "auto":
+ if "accuracy" in self.monitor:
+ self.mode = "max"
+ else:
+ self.mode = "min"
+ logger.debug(
+ f"Checkpoint mode set to {self.mode} for monitoring {self.monitor}."
+ )
+
+ torch_inf = torch.tensor(np.inf)
+ self.min_delta *= 1 if self.monitor_op == torch.gt else -1
+ self.best_score = torch_inf if self.monitor_op == torch.lt else -torch_inf
+
+ @property
+ def monitor_op(self) -> float:
+ """Returns the comparison method."""
+ return self.mode_dict[self.mode]
+
+ def on_epoch_end(self, epoch: int, logs: Dict) -> None:
+ """Saves a checkpoint for the network parameters.
+
+ Args:
+ epoch (int): The current epoch.
+ logs (Dict): The log containing the monitored metrics.
+
+ """
+ current = self.get_monitor_value(logs)
+ if current is None:
+ return
+ if self.monitor_op(current - self.min_delta, self.best_score):
+ self.best_score = current
+ is_best = True
+ else:
+ is_best = False
+
+ self.model.save_checkpoint(is_best, epoch, self.monitor)
+
+ def get_monitor_value(self, logs: Dict) -> Union[float, None]:
+ """Extracts the monitored value."""
+ monitor_value = logs.get(self.monitor)
+ if monitor_value is None:
+ logger.warning(
+ f"Checkpoint is conditioned on metric {self.monitor} which is not available. Available"
+ + f"metrics are: {','.join(list(logs.keys()))}"
+ )
+ return None
+ return monitor_value
diff --git a/src/training/callbacks/early_stopping.py b/src/training/callbacks/early_stopping.py
index 4da0e85..c9b7907 100644
--- a/src/training/callbacks/early_stopping.py
+++ b/src/training/callbacks/early_stopping.py
@@ -1 +1,107 @@
"""Implements Early stopping for PyTorch model."""
+from typing import Dict, Union
+
+from loguru import logger
+import numpy as np
+import torch
+from training.callbacks import Callback
+
+
+class EarlyStopping(Callback):
+ """Stops training when a monitored metric stops improving."""
+
+ mode_dict = {
+ "min": torch.lt,
+ "max": torch.gt,
+ }
+
+ def __init__(
+ self,
+ monitor: str = "val_loss",
+ min_delta: float = 0.0,
+ patience: int = 3,
+ mode: str = "auto",
+ ) -> None:
+ """Initializes the EarlyStopping callback.
+
+ Args:
+ monitor (str): Description of parameter `monitor`. Defaults to "val_loss".
+ min_delta (float): Description of parameter `min_delta`. Defaults to 0.0.
+ patience (int): Description of parameter `patience`. Defaults to 3.
+ mode (str): Description of parameter `mode`. Defaults to "auto".
+
+ """
+ super().__init__()
+ self.monitor = monitor
+ self.patience = patience
+ self.min_delta = torch.tensor(min_delta)
+ self.mode = mode
+ self.wait_count = 0
+ self.stopped_epoch = 0
+
+ if mode not in ["auto", "min", "max"]:
+ logger.warning(
+ f"EarlyStopping mode {mode} is unkown, fallback to auto mode."
+ )
+
+ self.mode = "auto"
+
+ if self.mode == "auto":
+ if "accuracy" in self.monitor:
+ self.mode = "max"
+ else:
+ self.mode = "min"
+ logger.debug(
+ f"EarlyStopping mode set to {self.mode} for monitoring {self.monitor}."
+ )
+
+ self.torch_inf = torch.tensor(np.inf)
+ self.min_delta *= 1 if self.monitor_op == torch.gt else -1
+ self.best_score = (
+ self.torch_inf if self.monitor_op == torch.lt else -self.torch_inf
+ )
+
+ @property
+ def monitor_op(self) -> float:
+ """Returns the comparison method."""
+ return self.mode_dict[self.mode]
+
+ def on_fit_begin(self) -> Union[torch.lt, torch.gt]:
+ """Reset the early stopping variables for reuse."""
+ self.wait_count = 0
+ self.stopped_epoch = 0
+ self.best_score = (
+ self.torch_inf if self.monitor_op == torch.lt else -self.torch_inf
+ )
+
+ def on_epoch_end(self, epoch: int, logs: Dict) -> None:
+ """Computes the early stop criterion."""
+ current = self.get_monitor_value(logs)
+ if current is None:
+ return
+ if self.monitor_op(current - self.min_delta, self.best_score):
+ self.best_score = current
+ self.wait_count = 0
+ else:
+ self.wait_count += 1
+ if self.wait_count >= self.patience:
+ self.stopped_epoch = epoch
+ self.model.stop_training = True
+
+ def on_fit_end(self) -> None:
+ """Logs if early stopping was used."""
+ if self.stopped_epoch > 0:
+ logger.info(
+ f"Stopped training at epoch {self.stopped_epoch + 1} with early stopping."
+ )
+
+ def get_monitor_value(self, logs: Dict) -> Union[torch.Tensor, None]:
+ """Extracts the monitor value."""
+ monitor_value = logs.get(self.monitor)
+ if monitor_value is None:
+ logger.warning(
+ f"Early stopping is conditioned on metric {self.monitor} which is not available. Available"
+ + f"metrics are: {','.join(list(logs.keys()))}"
+ )
+ return None
+ return torch.tensor(monitor_value)
diff --git a/src/training/callbacks/lr_schedulers.py b/src/training/callbacks/lr_schedulers.py
new file mode 100644
index 0000000..00c7e9b
--- /dev/null
+++ b/src/training/callbacks/lr_schedulers.py
@@ -0,0 +1,97 @@
+"""Callbacks for learning rate schedulers."""
+from typing import Callable, Dict, List, Optional, Type
+
+from training.callbacks import Callback
+
+from text_recognizer.models import Model
+
+
+class StepLR(Callback):
+ """Callback for StepLR."""
+
+ def __init__(self) -> None:
+ """Initializes the callback."""
+ super().__init__()
+ self.lr_scheduler = None
+
+ def set_model(self, model: Type[Model]) -> None:
+ """Sets the model and lr scheduler."""
+ self.model = model
+ self.lr_scheduler = self.model.lr_scheduler
+
+ def on_epoch_end(self, epoch: int, logs: Dict = {}) -> None:
+ """Takes a step at the end of every epoch."""
+ self.lr_scheduler.step()
+
+
+class MultiStepLR(Callback):
+ """Callback for MultiStepLR."""
+
+ def __init__(self) -> None:
+ """Initializes the callback."""
+ super().__init__()
+ self.lr_scheduler = None
+
+ def set_model(self, model: Type[Model]) -> None:
+ """Sets the model and lr scheduler."""
+ self.model = model
+ self.lr_scheduler = self.model.lr_scheduler
+
+ def on_epoch_end(self, epoch: int, logs: Dict = {}) -> None:
+ """Takes a step at the end of every epoch."""
+ self.lr_scheduler.step()
+
+
+class ReduceLROnPlateau(Callback):
+ """Callback for ReduceLROnPlateau."""
+
+ def __init__(self) -> None:
+ """Initializes the callback."""
+ super().__init__()
+ self.lr_scheduler = None
+
+ def set_model(self, model: Type[Model]) -> None:
+ """Sets the model and lr scheduler."""
+ self.model = model
+ self.lr_scheduler = self.model.lr_scheduler
+
+ def on_epoch_end(self, epoch: int, logs: Dict = {}) -> None:
+ """Takes a step at the end of every epoch."""
+ val_loss = logs["val_loss"]
+ self.lr_scheduler.step(val_loss)
+
+
+class CyclicLR(Callback):
+ """Callback for CyclicLR."""
+
+ def __init__(self) -> None:
+ """Initializes the callback."""
+ super().__init__()
+ self.lr_scheduler = None
+
+ def set_model(self, model: Type[Model]) -> None:
+ """Sets the model and lr scheduler."""
+ self.model = model
+ self.lr_scheduler = self.model.lr_scheduler
+
+ def on_train_batch_end(self, batch: int, logs: Dict = {}) -> None:
+ """Takes a step at the end of every training batch."""
+ self.lr_scheduler.step()
+
+
+class OneCycleLR(Callback):
+ """Callback for OneCycleLR."""
+
+ def __init__(self) -> None:
+ """Initializes the callback."""
+ super().__init__()
+ self.lr_scheduler = None
+
+ def set_model(self, model: Type[Model]) -> None:
+ """Sets the model and lr scheduler."""
+ self.model = model
+ self.lr_scheduler = self.model.lr_scheduler
+
+ def on_train_batch_end(self, batch: int, logs: Dict = {}) -> None:
+ """Takes a step at the end of every training batch."""
+ self.lr_scheduler.step()
diff --git a/src/training/callbacks/wandb_callbacks.py b/src/training/callbacks/wandb_callbacks.py
new file mode 100644
index 0000000..f64cbe1
--- /dev/null
+++ b/src/training/callbacks/wandb_callbacks.py
@@ -0,0 +1,93 @@
+"""Callbacks using wandb."""
+from typing import Callable, Dict, List, Optional, Type
+
+import numpy as np
+from torchvision.transforms import Compose, ToTensor
+from training.callbacks import Callback
+import wandb
+
+from text_recognizer.datasets import Transpose
+from text_recognizer.models.base import Model
+
+
+class WandbCallback(Callback):
+ """A custom W&B metric logger for the trainer."""
+
+ def __init__(self, log_batch_frequency: int = None) -> None:
+ """Short summary.
+
+ Args:
+ log_batch_frequency (int): If None, metrics will be logged every epoch.
+ If set to an integer, callback will log every metrics every log_batch_frequency.
+
+ """
+ super().__init__()
+ self.log_batch_frequency = log_batch_frequency
+
+ def _on_batch_end(self, batch: int, logs: Dict) -> None:
+ if self.log_batch_frequency and batch % self.log_batch_frequency == 0:
+ wandb.log(logs, commit=True)
+
+ def on_train_batch_end(self, batch: int, logs: Dict = {}) -> None:
+ """Logs training metrics."""
+ if logs is not None:
+ self._on_batch_end(batch, logs)
+
+ def on_validation_batch_end(self, batch: int, logs: Dict = {}) -> None:
+ """Logs validation metrics."""
+ if logs is not None:
+ self._on_batch_end(batch, logs)
+
+ def on_epoch_end(self, epoch: int, logs: Dict) -> None:
+ """Logs at epoch end."""
+ wandb.log(logs, commit=True)
+
+
+class WandbImageLogger(Callback):
+ """Custom W&B callback for image logging."""
+
+ def __init__(
+ self,
+ example_indices: Optional[List] = None,
+ num_examples: int = 4,
+ transfroms: Optional[Callable] = None,
+ ) -> None:
+ """Initializes the WandbImageLogger with the model to train.
+
+ Args:
+ example_indices (Optional[List]): Indices for validation images. Defaults to None.
+ num_examples (int): Number of random samples to take if example_indices are not specified. Defaults to 4.
+ transfroms (Optional[Callable]): Transforms to use on the validation images, e.g. transpose. Defaults to
+ None.
+
+ """
+
+ super().__init__()
+ self.example_indices = example_indices
+ self.num_examples = num_examples
+ self.transfroms = transfroms
+ if self.transfroms is None:
+ self.transforms = Compose([Transpose()])
+
+ def set_model(self, model: Type[Model]) -> None:
+ """Sets the model and extracts validation images from the dataset."""
+ self.model = model
+ data_loader = self.model.data_loaders("val")
+ if self.example_indices is None:
+ self.example_indices = np.random.randint(
+ 0, len(data_loader.dataset.data), self.num_examples
+ )
+ self.val_images = data_loader.dataset.data[self.example_indices]
+ self.val_targets = data_loader.dataset.targets[self.example_indices].numpy()
+
+ def on_epoch_end(self, epoch: int, logs: Dict) -> None:
+ """Get network predictions on validation images."""
+ images = []
+ for i, image in enumerate(self.val_images):
+ image = self.transforms(image)
+ pred, conf = self.model.predict_on_image(image)
+ ground_truth = self.model._mapping[self.val_targets[i]]
+ caption = f"Prediction: {pred} Confidence: {conf:.3f} Ground Truth: {ground_truth}"
+ images.append(wandb.Image(image, caption=caption))
+
+ wandb.log({"examples": images}, commit=False)
diff --git a/src/training/experiments/CharacterModel_Emnist_LeNet/0721_231455/config.yml b/src/training/experiments/CharacterModel_Emnist_LeNet/0721_231455/config.yml
deleted file mode 100644
index 2595325..0000000
--- a/src/training/experiments/CharacterModel_Emnist_LeNet/0721_231455/config.yml
+++ /dev/null
@@ -1,48 +0,0 @@
-criterion: CrossEntropyLoss
-criterion_args:
- ignore_index: -100
- reduction: mean
- weight: null
-data_loader_args:
- batch_size: 256
- cuda: true
- num_workers: 8
- sample_to_balance: true
- seed: 4711
- shuffle: true
- splits:
- - train
- - val
- subsample_fraction: null
- target_transform: null
- transform: null
-dataloader: EmnistDataLoader
-device: cuda:0
-experiment_group: Sample Experiments
-lr_scheduler: OneCycleLR
-lr_scheduler_args:
- epochs: 16
- max_lr: 0.001
- steps_per_epoch: 1314
-metrics:
-- accuracy
-model: CharacterModel
-network: LeNet
-network_args:
- input_size:
- - 28
- - 28
- output_size: 62
-optimizer: AdamW
-optimizer_args:
- amsgrad: false
- betas:
- - 0.9
- - 0.999
- eps: 1.0e-08
- lr: 0.01
- weight_decay: 0
-train_args:
- batch_size: 256
- epochs: 16
- val_metric: accuracy
diff --git a/src/training/experiments/CharacterModel_Emnist_LeNet/0721_231455/model/best.pt b/src/training/experiments/CharacterModel_Emnist_LeNet/0721_231455/model/best.pt
deleted file mode 100644
index 6d78bad..0000000
--- a/src/training/experiments/CharacterModel_Emnist_LeNet/0721_231455/model/best.pt
+++ /dev/null
Binary files differ
diff --git a/src/training/experiments/CharacterModel_Emnist_LeNet/0721_231455/model/last.pt b/src/training/experiments/CharacterModel_Emnist_LeNet/0721_231455/model/last.pt
deleted file mode 100644
index 6d78bad..0000000
--- a/src/training/experiments/CharacterModel_Emnist_LeNet/0721_231455/model/last.pt
+++ /dev/null
Binary files differ
diff --git a/src/training/experiments/CharacterModel_Emnist_LeNet/0722_190746/config.yml b/src/training/experiments/CharacterModel_Emnist_LeNet/0722_190746/config.yml
deleted file mode 100644
index 2595325..0000000
--- a/src/training/experiments/CharacterModel_Emnist_LeNet/0722_190746/config.yml
+++ /dev/null
@@ -1,48 +0,0 @@
-criterion: CrossEntropyLoss
-criterion_args:
- ignore_index: -100
- reduction: mean
- weight: null
-data_loader_args:
- batch_size: 256
- cuda: true
- num_workers: 8
- sample_to_balance: true
- seed: 4711
- shuffle: true
- splits:
- - train
- - val
- subsample_fraction: null
- target_transform: null
- transform: null
-dataloader: EmnistDataLoader
-device: cuda:0
-experiment_group: Sample Experiments
-lr_scheduler: OneCycleLR
-lr_scheduler_args:
- epochs: 16
- max_lr: 0.001
- steps_per_epoch: 1314
-metrics:
-- accuracy
-model: CharacterModel
-network: LeNet
-network_args:
- input_size:
- - 28
- - 28
- output_size: 62
-optimizer: AdamW
-optimizer_args:
- amsgrad: false
- betas:
- - 0.9
- - 0.999
- eps: 1.0e-08
- lr: 0.01
- weight_decay: 0
-train_args:
- batch_size: 256
- epochs: 16
- val_metric: accuracy
diff --git a/src/training/experiments/CharacterModel_Emnist_LeNet/0722_190746/model/best.pt b/src/training/experiments/CharacterModel_Emnist_LeNet/0722_190746/model/best.pt
deleted file mode 100644
index 43a3891..0000000
--- a/src/training/experiments/CharacterModel_Emnist_LeNet/0722_190746/model/best.pt
+++ /dev/null
Binary files differ
diff --git a/src/training/experiments/CharacterModel_Emnist_LeNet/0722_190746/model/last.pt b/src/training/experiments/CharacterModel_Emnist_LeNet/0722_190746/model/last.pt
deleted file mode 100644
index 61c03f0..0000000
--- a/src/training/experiments/CharacterModel_Emnist_LeNet/0722_190746/model/last.pt
+++ /dev/null
Binary files differ
diff --git a/src/training/experiments/CharacterModel_Emnist_MLP/0721_124928/config.yml b/src/training/experiments/CharacterModel_Emnist_MLP/0721_124928/config.yml
deleted file mode 100644
index 2aa52cd..0000000
--- a/src/training/experiments/CharacterModel_Emnist_MLP/0721_124928/config.yml
+++ /dev/null
@@ -1,43 +0,0 @@
-criterion: CrossEntropyLoss
-criterion_args:
- ignore_index: -100
- reduction: mean
- weight: null
-data_loader_args:
- batch_size: 256
- cuda: true
- num_workers: 0
- sample_to_balance: true
- seed: 4711
- shuffle: true
- splits:
- - train
- - val
- subsample_fraction: null
- target_transform: null
- transform: null
-dataloader: EmnistDataLoader
-device: cuda:0
-experiment_group: Sample Experiments
-lr_scheduler: null
-metrics:
-- accuracy
-model: CharacterModel
-network: MLP
-network_args:
- input_size: 784
- num_layers: 3
- output_size: 62
-optimizer: AdamW
-optimizer_args:
- amsgrad: false
- betas:
- - 0.9
- - 0.999
- eps: 1.0e-08
- lr: 0.001
- weight_decay: 0
-train_args:
- batch_size: 256
- epochs: 16
- val_metric: accuracy
diff --git a/src/training/experiments/CharacterModel_Emnist_MLP/0721_141139/config.yml b/src/training/experiments/CharacterModel_Emnist_MLP/0721_141139/config.yml
deleted file mode 100644
index 829297d..0000000
--- a/src/training/experiments/CharacterModel_Emnist_MLP/0721_141139/config.yml
+++ /dev/null
@@ -1,47 +0,0 @@
-criterion: CrossEntropyLoss
-criterion_args:
- ignore_index: -100
- reduction: mean
- weight: null
-data_loader_args:
- batch_size: 256
- cuda: true
- num_workers: 0
- sample_to_balance: true
- seed: 4711
- shuffle: true
- splits:
- - train
- - val
- subsample_fraction: null
- target_transform: null
- transform: null
-dataloader: EmnistDataLoader
-device: cuda:0
-experiment_group: Sample Experiments
-lr_scheduler: OneCycleLR
-lr_scheduler_args:
- epochs: 16
- max_lr: 0.0003
- steps_per_epoch: 1314
-metrics:
-- accuracy
-model: CharacterModel
-network: MLP
-network_args:
- input_size: 784
- num_layers: 3
- output_size: 62
-optimizer: AdamW
-optimizer_args:
- amsgrad: false
- betas:
- - 0.9
- - 0.999
- eps: 1.0e-08
- lr: 0.0006
- weight_decay: 5.0e-05
-train_args:
- batch_size: 256
- epochs: 16
- val_metric: accuracy
diff --git a/src/training/experiments/CharacterModel_Emnist_MLP/0721_141213/config.yml b/src/training/experiments/CharacterModel_Emnist_MLP/0721_141213/config.yml
deleted file mode 100644
index 829297d..0000000
--- a/src/training/experiments/CharacterModel_Emnist_MLP/0721_141213/config.yml
+++ /dev/null
@@ -1,47 +0,0 @@
-criterion: CrossEntropyLoss
-criterion_args:
- ignore_index: -100
- reduction: mean
- weight: null
-data_loader_args:
- batch_size: 256
- cuda: true
- num_workers: 0
- sample_to_balance: true
- seed: 4711
- shuffle: true
- splits:
- - train
- - val
- subsample_fraction: null
- target_transform: null
- transform: null
-dataloader: EmnistDataLoader
-device: cuda:0
-experiment_group: Sample Experiments
-lr_scheduler: OneCycleLR
-lr_scheduler_args:
- epochs: 16
- max_lr: 0.0003
- steps_per_epoch: 1314
-metrics:
-- accuracy
-model: CharacterModel
-network: MLP
-network_args:
- input_size: 784
- num_layers: 3
- output_size: 62
-optimizer: AdamW
-optimizer_args:
- amsgrad: false
- betas:
- - 0.9
- - 0.999
- eps: 1.0e-08
- lr: 0.0006
- weight_decay: 5.0e-05
-train_args:
- batch_size: 256
- epochs: 16
- val_metric: accuracy
diff --git a/src/training/experiments/CharacterModel_Emnist_MLP/0721_141213/model/best.pt b/src/training/experiments/CharacterModel_Emnist_MLP/0721_141213/model/best.pt
deleted file mode 100644
index d0db78b..0000000
--- a/src/training/experiments/CharacterModel_Emnist_MLP/0721_141213/model/best.pt
+++ /dev/null
Binary files differ
diff --git a/src/training/experiments/CharacterModel_Emnist_MLP/0721_141213/model/last.pt b/src/training/experiments/CharacterModel_Emnist_MLP/0721_141213/model/last.pt
deleted file mode 100644
index d0db78b..0000000
--- a/src/training/experiments/CharacterModel_Emnist_MLP/0721_141213/model/last.pt
+++ /dev/null
Binary files differ
diff --git a/src/training/experiments/CharacterModel_Emnist_MLP/0721_141433/config.yml b/src/training/experiments/CharacterModel_Emnist_MLP/0721_141433/config.yml
deleted file mode 100644
index 3df32bb..0000000
--- a/src/training/experiments/CharacterModel_Emnist_MLP/0721_141433/config.yml
+++ /dev/null
@@ -1,47 +0,0 @@
-criterion: CrossEntropyLoss
-criterion_args:
- ignore_index: -100
- reduction: mean
- weight: null
-data_loader_args:
- batch_size: 256
- cuda: true
- num_workers: 0
- sample_to_balance: true
- seed: 4711
- shuffle: true
- splits:
- - train
- - val
- subsample_fraction: null
- target_transform: null
- transform: null
-dataloader: EmnistDataLoader
-device: cuda:0
-experiment_group: Sample Experiments
-lr_scheduler: OneCycleLR
-lr_scheduler_args:
- epochs: 16
- max_lr: 0.01
- steps_per_epoch: 1314
-metrics:
-- accuracy
-model: CharacterModel
-network: MLP
-network_args:
- input_size: 784
- num_layers: 3
- output_size: 62
-optimizer: AdamW
-optimizer_args:
- amsgrad: false
- betas:
- - 0.9
- - 0.999
- eps: 1.0e-08
- lr: 0.1
- weight_decay: 5.0e-05
-train_args:
- batch_size: 256
- epochs: 16
- val_metric: accuracy
diff --git a/src/training/experiments/CharacterModel_Emnist_MLP/0721_141433/model/best.pt b/src/training/experiments/CharacterModel_Emnist_MLP/0721_141433/model/best.pt
deleted file mode 100644
index 5914c8f..0000000
--- a/src/training/experiments/CharacterModel_Emnist_MLP/0721_141433/model/best.pt
+++ /dev/null
Binary files differ
diff --git a/src/training/experiments/CharacterModel_Emnist_MLP/0721_141433/model/last.pt b/src/training/experiments/CharacterModel_Emnist_MLP/0721_141433/model/last.pt
deleted file mode 100644
index 5ba44bb..0000000
--- a/src/training/experiments/CharacterModel_Emnist_MLP/0721_141433/model/last.pt
+++ /dev/null
Binary files differ
diff --git a/src/training/experiments/CharacterModel_Emnist_MLP/0721_141702/config.yml b/src/training/experiments/CharacterModel_Emnist_MLP/0721_141702/config.yml
deleted file mode 100644
index fb75736..0000000
--- a/src/training/experiments/CharacterModel_Emnist_MLP/0721_141702/config.yml
+++ /dev/null
@@ -1,47 +0,0 @@
-criterion: CrossEntropyLoss
-criterion_args:
- ignore_index: -100
- reduction: mean
- weight: null
-data_loader_args:
- batch_size: 256
- cuda: true
- num_workers: 0
- sample_to_balance: true
- seed: 4711
- shuffle: true
- splits:
- - train
- - val
- subsample_fraction: null
- target_transform: null
- transform: null
-dataloader: EmnistDataLoader
-device: cuda:0
-experiment_group: Sample Experiments
-lr_scheduler: OneCycleLR
-lr_scheduler_args:
- epochs: 16
- max_lr: 0.001
- steps_per_epoch: 1314
-metrics:
-- accuracy
-model: CharacterModel
-network: MLP
-network_args:
- input_size: 784
- num_layers: 3
- output_size: 62
-optimizer: AdamW
-optimizer_args:
- amsgrad: false
- betas:
- - 0.9
- - 0.999
- eps: 1.0e-08
- lr: 0.01
- weight_decay: 5.0e-05
-train_args:
- batch_size: 256
- epochs: 16
- val_metric: accuracy
diff --git a/src/training/experiments/CharacterModel_Emnist_MLP/0721_141702/model/best.pt b/src/training/experiments/CharacterModel_Emnist_MLP/0721_141702/model/best.pt
deleted file mode 100644
index 96c21c1..0000000
--- a/src/training/experiments/CharacterModel_Emnist_MLP/0721_141702/model/best.pt
+++ /dev/null
Binary files differ
diff --git a/src/training/experiments/CharacterModel_Emnist_MLP/0721_141702/model/last.pt b/src/training/experiments/CharacterModel_Emnist_MLP/0721_141702/model/last.pt
deleted file mode 100644
index f024c0d..0000000
--- a/src/training/experiments/CharacterModel_Emnist_MLP/0721_141702/model/last.pt
+++ /dev/null
Binary files differ
diff --git a/src/training/experiments/CharacterModel_Emnist_MLP/0721_145028/config.yml b/src/training/experiments/CharacterModel_Emnist_MLP/0721_145028/config.yml
deleted file mode 100644
index fb75736..0000000
--- a/src/training/experiments/CharacterModel_Emnist_MLP/0721_145028/config.yml
+++ /dev/null
@@ -1,47 +0,0 @@
-criterion: CrossEntropyLoss
-criterion_args:
- ignore_index: -100
- reduction: mean
- weight: null
-data_loader_args:
- batch_size: 256
- cuda: true
- num_workers: 0
- sample_to_balance: true
- seed: 4711
- shuffle: true
- splits:
- - train
- - val
- subsample_fraction: null
- target_transform: null
- transform: null
-dataloader: EmnistDataLoader
-device: cuda:0
-experiment_group: Sample Experiments
-lr_scheduler: OneCycleLR
-lr_scheduler_args:
- epochs: 16
- max_lr: 0.001
- steps_per_epoch: 1314
-metrics:
-- accuracy
-model: CharacterModel
-network: MLP
-network_args:
- input_size: 784
- num_layers: 3
- output_size: 62
-optimizer: AdamW
-optimizer_args:
- amsgrad: false
- betas:
- - 0.9
- - 0.999
- eps: 1.0e-08
- lr: 0.01
- weight_decay: 5.0e-05
-train_args:
- batch_size: 256
- epochs: 16
- val_metric: accuracy
diff --git a/src/training/experiments/CharacterModel_Emnist_MLP/0721_150212/config.yml b/src/training/experiments/CharacterModel_Emnist_MLP/0721_150212/config.yml
deleted file mode 100644
index fb75736..0000000
--- a/src/training/experiments/CharacterModel_Emnist_MLP/0721_150212/config.yml
+++ /dev/null
@@ -1,47 +0,0 @@
-criterion: CrossEntropyLoss
-criterion_args:
- ignore_index: -100
- reduction: mean
- weight: null
-data_loader_args:
- batch_size: 256
- cuda: true
- num_workers: 0
- sample_to_balance: true
- seed: 4711
- shuffle: true
- splits:
- - train
- - val
- subsample_fraction: null
- target_transform: null
- transform: null
-dataloader: EmnistDataLoader
-device: cuda:0
-experiment_group: Sample Experiments
-lr_scheduler: OneCycleLR
-lr_scheduler_args:
- epochs: 16
- max_lr: 0.001
- steps_per_epoch: 1314
-metrics:
-- accuracy
-model: CharacterModel
-network: MLP
-network_args:
- input_size: 784
- num_layers: 3
- output_size: 62
-optimizer: AdamW
-optimizer_args:
- amsgrad: false
- betas:
- - 0.9
- - 0.999
- eps: 1.0e-08
- lr: 0.01
- weight_decay: 5.0e-05
-train_args:
- batch_size: 256
- epochs: 16
- val_metric: accuracy
diff --git a/src/training/experiments/CharacterModel_Emnist_MLP/0721_150301/config.yml b/src/training/experiments/CharacterModel_Emnist_MLP/0721_150301/config.yml
deleted file mode 100644
index fb75736..0000000
--- a/src/training/experiments/CharacterModel_Emnist_MLP/0721_150301/config.yml
+++ /dev/null
@@ -1,47 +0,0 @@
-criterion: CrossEntropyLoss
-criterion_args:
- ignore_index: -100
- reduction: mean
- weight: null
-data_loader_args:
- batch_size: 256
- cuda: true
- num_workers: 0
- sample_to_balance: true
- seed: 4711
- shuffle: true
- splits:
- - train
- - val
- subsample_fraction: null
- target_transform: null
- transform: null
-dataloader: EmnistDataLoader
-device: cuda:0
-experiment_group: Sample Experiments
-lr_scheduler: OneCycleLR
-lr_scheduler_args:
- epochs: 16
- max_lr: 0.001
- steps_per_epoch: 1314
-metrics:
-- accuracy
-model: CharacterModel
-network: MLP
-network_args:
- input_size: 784
- num_layers: 3
- output_size: 62
-optimizer: AdamW
-optimizer_args:
- amsgrad: false
- betas:
- - 0.9
- - 0.999
- eps: 1.0e-08
- lr: 0.01
- weight_decay: 5.0e-05
-train_args:
- batch_size: 256
- epochs: 16
- val_metric: accuracy
diff --git a/src/training/experiments/CharacterModel_Emnist_MLP/0721_150317/config.yml b/src/training/experiments/CharacterModel_Emnist_MLP/0721_150317/config.yml
deleted file mode 100644
index fb75736..0000000
--- a/src/training/experiments/CharacterModel_Emnist_MLP/0721_150317/config.yml
+++ /dev/null
@@ -1,47 +0,0 @@
-criterion: CrossEntropyLoss
-criterion_args:
- ignore_index: -100
- reduction: mean
- weight: null
-data_loader_args:
- batch_size: 256
- cuda: true
- num_workers: 0
- sample_to_balance: true
- seed: 4711
- shuffle: true
- splits:
- - train
- - val
- subsample_fraction: null
- target_transform: null
- transform: null
-dataloader: EmnistDataLoader
-device: cuda:0
-experiment_group: Sample Experiments
-lr_scheduler: OneCycleLR
-lr_scheduler_args:
- epochs: 16
- max_lr: 0.001
- steps_per_epoch: 1314
-metrics:
-- accuracy
-model: CharacterModel
-network: MLP
-network_args:
- input_size: 784
- num_layers: 3
- output_size: 62
-optimizer: AdamW
-optimizer_args:
- amsgrad: false
- betas:
- - 0.9
- - 0.999
- eps: 1.0e-08
- lr: 0.01
- weight_decay: 5.0e-05
-train_args:
- batch_size: 256
- epochs: 16
- val_metric: accuracy
diff --git a/src/training/experiments/CharacterModel_Emnist_MLP/0721_151135/config.yml b/src/training/experiments/CharacterModel_Emnist_MLP/0721_151135/config.yml
deleted file mode 100644
index fb75736..0000000
--- a/src/training/experiments/CharacterModel_Emnist_MLP/0721_151135/config.yml
+++ /dev/null
@@ -1,47 +0,0 @@
-criterion: CrossEntropyLoss
-criterion_args:
- ignore_index: -100
- reduction: mean
- weight: null
-data_loader_args:
- batch_size: 256
- cuda: true
- num_workers: 0
- sample_to_balance: true
- seed: 4711
- shuffle: true
- splits:
- - train
- - val
- subsample_fraction: null
- target_transform: null
- transform: null
-dataloader: EmnistDataLoader
-device: cuda:0
-experiment_group: Sample Experiments
-lr_scheduler: OneCycleLR
-lr_scheduler_args:
- epochs: 16
- max_lr: 0.001
- steps_per_epoch: 1314
-metrics:
-- accuracy
-model: CharacterModel
-network: MLP
-network_args:
- input_size: 784
- num_layers: 3
- output_size: 62
-optimizer: AdamW
-optimizer_args:
- amsgrad: false
- betas:
- - 0.9
- - 0.999
- eps: 1.0e-08
- lr: 0.01
- weight_decay: 5.0e-05
-train_args:
- batch_size: 256
- epochs: 16
- val_metric: accuracy
diff --git a/src/training/experiments/CharacterModel_Emnist_MLP/0721_151135/model/best.pt b/src/training/experiments/CharacterModel_Emnist_MLP/0721_151135/model/best.pt
deleted file mode 100644
index f833a89..0000000
--- a/src/training/experiments/CharacterModel_Emnist_MLP/0721_151135/model/best.pt
+++ /dev/null
Binary files differ
diff --git a/src/training/experiments/CharacterModel_Emnist_MLP/0721_151135/model/last.pt b/src/training/experiments/CharacterModel_Emnist_MLP/0721_151135/model/last.pt
deleted file mode 100644
index f833a89..0000000
--- a/src/training/experiments/CharacterModel_Emnist_MLP/0721_151135/model/last.pt
+++ /dev/null
Binary files differ
diff --git a/src/training/experiments/CharacterModel_Emnist_MLP/0721_151408/config.yml b/src/training/experiments/CharacterModel_Emnist_MLP/0721_151408/config.yml
deleted file mode 100644
index fb75736..0000000
--- a/src/training/experiments/CharacterModel_Emnist_MLP/0721_151408/config.yml
+++ /dev/null
@@ -1,47 +0,0 @@
-criterion: CrossEntropyLoss
-criterion_args:
- ignore_index: -100
- reduction: mean
- weight: null
-data_loader_args:
- batch_size: 256
- cuda: true
- num_workers: 0
- sample_to_balance: true
- seed: 4711
- shuffle: true
- splits:
- - train
- - val
- subsample_fraction: null
- target_transform: null
- transform: null
-dataloader: EmnistDataLoader
-device: cuda:0
-experiment_group: Sample Experiments
-lr_scheduler: OneCycleLR
-lr_scheduler_args:
- epochs: 16
- max_lr: 0.001
- steps_per_epoch: 1314
-metrics:
-- accuracy
-model: CharacterModel
-network: MLP
-network_args:
- input_size: 784
- num_layers: 3
- output_size: 62
-optimizer: AdamW
-optimizer_args:
- amsgrad: false
- betas:
- - 0.9
- - 0.999
- eps: 1.0e-08
- lr: 0.01
- weight_decay: 5.0e-05
-train_args:
- batch_size: 256
- epochs: 16
- val_metric: accuracy
diff --git a/src/training/experiments/CharacterModel_Emnist_MLP/0721_153144/config.yml b/src/training/experiments/CharacterModel_Emnist_MLP/0721_153144/config.yml
deleted file mode 100644
index 829297d..0000000
--- a/src/training/experiments/CharacterModel_Emnist_MLP/0721_153144/config.yml
+++ /dev/null
@@ -1,47 +0,0 @@
-criterion: CrossEntropyLoss
-criterion_args:
- ignore_index: -100
- reduction: mean
- weight: null
-data_loader_args:
- batch_size: 256
- cuda: true
- num_workers: 0
- sample_to_balance: true
- seed: 4711
- shuffle: true
- splits:
- - train
- - val
- subsample_fraction: null
- target_transform: null
- transform: null
-dataloader: EmnistDataLoader
-device: cuda:0
-experiment_group: Sample Experiments
-lr_scheduler: OneCycleLR
-lr_scheduler_args:
- epochs: 16
- max_lr: 0.0003
- steps_per_epoch: 1314
-metrics:
-- accuracy
-model: CharacterModel
-network: MLP
-network_args:
- input_size: 784
- num_layers: 3
- output_size: 62
-optimizer: AdamW
-optimizer_args:
- amsgrad: false
- betas:
- - 0.9
- - 0.999
- eps: 1.0e-08
- lr: 0.0006
- weight_decay: 5.0e-05
-train_args:
- batch_size: 256
- epochs: 16
- val_metric: accuracy
diff --git a/src/training/experiments/CharacterModel_Emnist_MLP/0721_153207/config.yml b/src/training/experiments/CharacterModel_Emnist_MLP/0721_153207/config.yml
deleted file mode 100644
index fb75736..0000000
--- a/src/training/experiments/CharacterModel_Emnist_MLP/0721_153207/config.yml
+++ /dev/null
@@ -1,47 +0,0 @@
-criterion: CrossEntropyLoss
-criterion_args:
- ignore_index: -100
- reduction: mean
- weight: null
-data_loader_args:
- batch_size: 256
- cuda: true
- num_workers: 0
- sample_to_balance: true
- seed: 4711
- shuffle: true
- splits:
- - train
- - val
- subsample_fraction: null
- target_transform: null
- transform: null
-dataloader: EmnistDataLoader
-device: cuda:0
-experiment_group: Sample Experiments
-lr_scheduler: OneCycleLR
-lr_scheduler_args:
- epochs: 16
- max_lr: 0.001
- steps_per_epoch: 1314
-metrics:
-- accuracy
-model: CharacterModel
-network: MLP
-network_args:
- input_size: 784
- num_layers: 3
- output_size: 62
-optimizer: AdamW
-optimizer_args:
- amsgrad: false
- betas:
- - 0.9
- - 0.999
- eps: 1.0e-08
- lr: 0.01
- weight_decay: 5.0e-05
-train_args:
- batch_size: 256
- epochs: 16
- val_metric: accuracy
diff --git a/src/training/experiments/CharacterModel_Emnist_MLP/0721_153310/config.yml b/src/training/experiments/CharacterModel_Emnist_MLP/0721_153310/config.yml
deleted file mode 100644
index fb75736..0000000
--- a/src/training/experiments/CharacterModel_Emnist_MLP/0721_153310/config.yml
+++ /dev/null
@@ -1,47 +0,0 @@
-criterion: CrossEntropyLoss
-criterion_args:
- ignore_index: -100
- reduction: mean
- weight: null
-data_loader_args:
- batch_size: 256
- cuda: true
- num_workers: 0
- sample_to_balance: true
- seed: 4711
- shuffle: true
- splits:
- - train
- - val
- subsample_fraction: null
- target_transform: null
- transform: null
-dataloader: EmnistDataLoader
-device: cuda:0
-experiment_group: Sample Experiments
-lr_scheduler: OneCycleLR
-lr_scheduler_args:
- epochs: 16
- max_lr: 0.001
- steps_per_epoch: 1314
-metrics:
-- accuracy
-model: CharacterModel
-network: MLP
-network_args:
- input_size: 784
- num_layers: 3
- output_size: 62
-optimizer: AdamW
-optimizer_args:
- amsgrad: false
- betas:
- - 0.9
- - 0.999
- eps: 1.0e-08
- lr: 0.01
- weight_decay: 5.0e-05
-train_args:
- batch_size: 256
- epochs: 16
- val_metric: accuracy
diff --git a/src/training/experiments/CharacterModel_Emnist_MLP/0721_153310/model/best.pt b/src/training/experiments/CharacterModel_Emnist_MLP/0721_153310/model/best.pt
deleted file mode 100644
index cbbc5e1..0000000
--- a/src/training/experiments/CharacterModel_Emnist_MLP/0721_153310/model/best.pt
+++ /dev/null
Binary files differ
diff --git a/src/training/experiments/CharacterModel_Emnist_MLP/0721_153310/model/last.pt b/src/training/experiments/CharacterModel_Emnist_MLP/0721_153310/model/last.pt
deleted file mode 100644
index cbbc5e1..0000000
--- a/src/training/experiments/CharacterModel_Emnist_MLP/0721_153310/model/last.pt
+++ /dev/null
Binary files differ
diff --git a/src/training/experiments/CharacterModel_Emnist_MLP/0721_175150/config.yml b/src/training/experiments/CharacterModel_Emnist_MLP/0721_175150/config.yml
deleted file mode 100644
index fb75736..0000000
--- a/src/training/experiments/CharacterModel_Emnist_MLP/0721_175150/config.yml
+++ /dev/null
@@ -1,47 +0,0 @@
-criterion: CrossEntropyLoss
-criterion_args:
- ignore_index: -100
- reduction: mean
- weight: null
-data_loader_args:
- batch_size: 256
- cuda: true
- num_workers: 0
- sample_to_balance: true
- seed: 4711
- shuffle: true
- splits:
- - train
- - val
- subsample_fraction: null
- target_transform: null
- transform: null
-dataloader: EmnistDataLoader
-device: cuda:0
-experiment_group: Sample Experiments
-lr_scheduler: OneCycleLR
-lr_scheduler_args:
- epochs: 16
- max_lr: 0.001
- steps_per_epoch: 1314
-metrics:
-- accuracy
-model: CharacterModel
-network: MLP
-network_args:
- input_size: 784
- num_layers: 3
- output_size: 62
-optimizer: AdamW
-optimizer_args:
- amsgrad: false
- betas:
- - 0.9
- - 0.999
- eps: 1.0e-08
- lr: 0.01
- weight_decay: 5.0e-05
-train_args:
- batch_size: 256
- epochs: 16
- val_metric: accuracy
diff --git a/src/training/experiments/CharacterModel_Emnist_MLP/0721_175150/model/best.pt b/src/training/experiments/CharacterModel_Emnist_MLP/0721_175150/model/best.pt
deleted file mode 100644
index c93e3c6..0000000
--- a/src/training/experiments/CharacterModel_Emnist_MLP/0721_175150/model/best.pt
+++ /dev/null
Binary files differ
diff --git a/src/training/experiments/CharacterModel_Emnist_MLP/0721_175150/model/last.pt b/src/training/experiments/CharacterModel_Emnist_MLP/0721_175150/model/last.pt
deleted file mode 100644
index c93e3c6..0000000
--- a/src/training/experiments/CharacterModel_Emnist_MLP/0721_175150/model/last.pt
+++ /dev/null
Binary files differ
diff --git a/src/training/experiments/CharacterModel_Emnist_MLP/0721_180741/config.yml b/src/training/experiments/CharacterModel_Emnist_MLP/0721_180741/config.yml
deleted file mode 100644
index 1be5113..0000000
--- a/src/training/experiments/CharacterModel_Emnist_MLP/0721_180741/config.yml
+++ /dev/null
@@ -1,47 +0,0 @@
-criterion: CrossEntropyLoss
-criterion_args:
- ignore_index: -100
- reduction: mean
- weight: null
-data_loader_args:
- batch_size: 256
- cuda: true
- num_workers: 0
- sample_to_balance: true
- seed: 4711
- shuffle: true
- splits:
- - train
- - val
- subsample_fraction: null
- target_transform: null
- transform: null
-dataloader: EmnistDataLoader
-device: cuda:0
-experiment_group: Sample Experiments
-lr_scheduler: OneCycleLR
-lr_scheduler_args:
- epochs: 16
- max_lr: 0.001
- steps_per_epoch: 1314
-metrics:
-- accuracy
-model: CharacterModel
-network: MLP
-network_args:
- input_size: 784
- num_layers: 3
- output_size: 62
-optimizer: Adam
-optimizer_args:
- amsgrad: false
- betas:
- - 0.9
- - 0.999
- eps: 1.0e-08
- lr: 0.01
- weight_decay: 5.0e-05
-train_args:
- batch_size: 256
- epochs: 16
- val_metric: accuracy
diff --git a/src/training/experiments/CharacterModel_Emnist_MLP/0721_180741/model/best.pt b/src/training/experiments/CharacterModel_Emnist_MLP/0721_180741/model/best.pt
deleted file mode 100644
index 580bad2..0000000
--- a/src/training/experiments/CharacterModel_Emnist_MLP/0721_180741/model/best.pt
+++ /dev/null
Binary files differ
diff --git a/src/training/experiments/CharacterModel_Emnist_MLP/0721_180741/model/last.pt b/src/training/experiments/CharacterModel_Emnist_MLP/0721_180741/model/last.pt
deleted file mode 100644
index 97e245c..0000000
--- a/src/training/experiments/CharacterModel_Emnist_MLP/0721_180741/model/last.pt
+++ /dev/null
Binary files differ
diff --git a/src/training/experiments/CharacterModel_Emnist_MLP/0721_181933/config.yml b/src/training/experiments/CharacterModel_Emnist_MLP/0721_181933/config.yml
deleted file mode 100644
index d2f98a2..0000000
--- a/src/training/experiments/CharacterModel_Emnist_MLP/0721_181933/config.yml
+++ /dev/null
@@ -1,46 +0,0 @@
-criterion: CrossEntropyLoss
-criterion_args:
- ignore_index: -100
- reduction: mean
- weight: null
-data_loader_args:
- batch_size: 256
- cuda: true
- num_workers: 0
- sample_to_balance: true
- seed: 4711
- shuffle: true
- splits:
- - train
- - val
- subsample_fraction: null
- target_transform: null
- transform: null
-dataloader: EmnistDataLoader
-device: cuda:0
-experiment_group: Sample Experiments
-lr_scheduler: OneCycleLR
-lr_scheduler_args:
- epochs: 16
- max_lr: 0.001
- steps_per_epoch: 1314
-metrics:
-- accuracy
-model: CharacterModel
-network: MLP
-network_args:
- input_size: 784
- num_layers: 3
- output_size: 62
-optimizer: Adamax
-optimizer_args:
- betas:
- - 0.9
- - 0.999
- eps: 1.0e-08
- lr: 0.01
- weight_decay: 0
-train_args:
- batch_size: 256
- epochs: 16
- val_metric: accuracy
diff --git a/src/training/experiments/CharacterModel_Emnist_MLP/0721_181933/model/best.pt b/src/training/experiments/CharacterModel_Emnist_MLP/0721_181933/model/best.pt
deleted file mode 100644
index 5a3df56..0000000
--- a/src/training/experiments/CharacterModel_Emnist_MLP/0721_181933/model/best.pt
+++ /dev/null
Binary files differ
diff --git a/src/training/experiments/CharacterModel_Emnist_MLP/0721_181933/model/last.pt b/src/training/experiments/CharacterModel_Emnist_MLP/0721_181933/model/last.pt
deleted file mode 100644
index 7f28dc3..0000000
--- a/src/training/experiments/CharacterModel_Emnist_MLP/0721_181933/model/last.pt
+++ /dev/null
Binary files differ
diff --git a/src/training/experiments/CharacterModel_Emnist_MLP/0721_183347/config.yml b/src/training/experiments/CharacterModel_Emnist_MLP/0721_183347/config.yml
deleted file mode 100644
index d2f98a2..0000000
--- a/src/training/experiments/CharacterModel_Emnist_MLP/0721_183347/config.yml
+++ /dev/null
@@ -1,46 +0,0 @@
-criterion: CrossEntropyLoss
-criterion_args:
- ignore_index: -100
- reduction: mean
- weight: null
-data_loader_args:
- batch_size: 256
- cuda: true
- num_workers: 0
- sample_to_balance: true
- seed: 4711
- shuffle: true
- splits:
- - train
- - val
- subsample_fraction: null
- target_transform: null
- transform: null
-dataloader: EmnistDataLoader
-device: cuda:0
-experiment_group: Sample Experiments
-lr_scheduler: OneCycleLR
-lr_scheduler_args:
- epochs: 16
- max_lr: 0.001
- steps_per_epoch: 1314
-metrics:
-- accuracy
-model: CharacterModel
-network: MLP
-network_args:
- input_size: 784
- num_layers: 3
- output_size: 62
-optimizer: Adamax
-optimizer_args:
- betas:
- - 0.9
- - 0.999
- eps: 1.0e-08
- lr: 0.01
- weight_decay: 0
-train_args:
- batch_size: 256
- epochs: 16
- val_metric: accuracy
diff --git a/src/training/experiments/CharacterModel_Emnist_MLP/0721_183347/model/best.pt b/src/training/experiments/CharacterModel_Emnist_MLP/0721_183347/model/best.pt
deleted file mode 100644
index 6f09780..0000000
--- a/src/training/experiments/CharacterModel_Emnist_MLP/0721_183347/model/best.pt
+++ /dev/null
Binary files differ
diff --git a/src/training/experiments/CharacterModel_Emnist_MLP/0721_183347/model/last.pt b/src/training/experiments/CharacterModel_Emnist_MLP/0721_183347/model/last.pt
deleted file mode 100644
index 3bb103e..0000000
--- a/src/training/experiments/CharacterModel_Emnist_MLP/0721_183347/model/last.pt
+++ /dev/null
Binary files differ
diff --git a/src/training/experiments/CharacterModel_Emnist_MLP/0721_190044/config.yml b/src/training/experiments/CharacterModel_Emnist_MLP/0721_190044/config.yml
deleted file mode 100644
index a7c66c5..0000000
--- a/src/training/experiments/CharacterModel_Emnist_MLP/0721_190044/config.yml
+++ /dev/null
@@ -1,46 +0,0 @@
-criterion: CrossEntropyLoss
-criterion_args:
- ignore_index: -100
- reduction: mean
- weight: null
-data_loader_args:
- batch_size: 256
- cuda: true
- num_workers: 0
- sample_to_balance: true
- seed: 4711
- shuffle: true
- splits:
- - train
- - val
- subsample_fraction: null
- target_transform: null
- transform: null
-dataloader: EmnistDataLoader
-device: cuda:0
-experiment_group: Sample Experiments
-lr_scheduler: OneCycleLR
-lr_scheduler_args:
- epochs: 16
- max_lr: 0.001
- steps_per_epoch: 1314
-metrics:
-- accuracy
-model: CharacterModel
-network: MLP
-network_args:
- input_size: 784
- num_layers: 3
- output_size: 62
-optimizer: AdamW
-optimizer_args:
- betas:
- - 0.9
- - 0.999
- eps: 1.0e-08
- lr: 0.01
- weight_decay: 0
-train_args:
- batch_size: 256
- epochs: 16
- val_metric: accuracy
diff --git a/src/training/experiments/CharacterModel_Emnist_MLP/0721_190044/model/best.pt b/src/training/experiments/CharacterModel_Emnist_MLP/0721_190044/model/best.pt
deleted file mode 100644
index c3e3618..0000000
--- a/src/training/experiments/CharacterModel_Emnist_MLP/0721_190044/model/best.pt
+++ /dev/null
Binary files differ
diff --git a/src/training/experiments/CharacterModel_Emnist_MLP/0721_190044/model/last.pt b/src/training/experiments/CharacterModel_Emnist_MLP/0721_190044/model/last.pt
deleted file mode 100644
index c3e3618..0000000
--- a/src/training/experiments/CharacterModel_Emnist_MLP/0721_190044/model/last.pt
+++ /dev/null
Binary files differ
diff --git a/src/training/experiments/CharacterModel_Emnist_MLP/0721_190633/config.yml b/src/training/experiments/CharacterModel_Emnist_MLP/0721_190633/config.yml
deleted file mode 100644
index a7c66c5..0000000
--- a/src/training/experiments/CharacterModel_Emnist_MLP/0721_190633/config.yml
+++ /dev/null
@@ -1,46 +0,0 @@
-criterion: CrossEntropyLoss
-criterion_args:
- ignore_index: -100
- reduction: mean
- weight: null
-data_loader_args:
- batch_size: 256
- cuda: true
- num_workers: 0
- sample_to_balance: true
- seed: 4711
- shuffle: true
- splits:
- - train
- - val
- subsample_fraction: null
- target_transform: null
- transform: null
-dataloader: EmnistDataLoader
-device: cuda:0
-experiment_group: Sample Experiments
-lr_scheduler: OneCycleLR
-lr_scheduler_args:
- epochs: 16
- max_lr: 0.001
- steps_per_epoch: 1314
-metrics:
-- accuracy
-model: CharacterModel
-network: MLP
-network_args:
- input_size: 784
- num_layers: 3
- output_size: 62
-optimizer: AdamW
-optimizer_args:
- betas:
- - 0.9
- - 0.999
- eps: 1.0e-08
- lr: 0.01
- weight_decay: 0
-train_args:
- batch_size: 256
- epochs: 16
- val_metric: accuracy
diff --git a/src/training/experiments/CharacterModel_Emnist_MLP/0721_190633/model/best.pt b/src/training/experiments/CharacterModel_Emnist_MLP/0721_190633/model/best.pt
deleted file mode 100644
index 44d9b9b..0000000
--- a/src/training/experiments/CharacterModel_Emnist_MLP/0721_190633/model/best.pt
+++ /dev/null
Binary files differ
diff --git a/src/training/experiments/CharacterModel_Emnist_MLP/0721_190633/model/last.pt b/src/training/experiments/CharacterModel_Emnist_MLP/0721_190633/model/last.pt
deleted file mode 100644
index 44d9b9b..0000000
--- a/src/training/experiments/CharacterModel_Emnist_MLP/0721_190633/model/last.pt
+++ /dev/null
Binary files differ
diff --git a/src/training/experiments/CharacterModel_Emnist_MLP/0721_190738/config.yml b/src/training/experiments/CharacterModel_Emnist_MLP/0721_190738/config.yml
deleted file mode 100644
index a7c66c5..0000000
--- a/src/training/experiments/CharacterModel_Emnist_MLP/0721_190738/config.yml
+++ /dev/null
@@ -1,46 +0,0 @@
-criterion: CrossEntropyLoss
-criterion_args:
- ignore_index: -100
- reduction: mean
- weight: null
-data_loader_args:
- batch_size: 256
- cuda: true
- num_workers: 0
- sample_to_balance: true
- seed: 4711
- shuffle: true
- splits:
- - train
- - val
- subsample_fraction: null
- target_transform: null
- transform: null
-dataloader: EmnistDataLoader
-device: cuda:0
-experiment_group: Sample Experiments
-lr_scheduler: OneCycleLR
-lr_scheduler_args:
- epochs: 16
- max_lr: 0.001
- steps_per_epoch: 1314
-metrics:
-- accuracy
-model: CharacterModel
-network: MLP
-network_args:
- input_size: 784
- num_layers: 3
- output_size: 62
-optimizer: AdamW
-optimizer_args:
- betas:
- - 0.9
- - 0.999
- eps: 1.0e-08
- lr: 0.01
- weight_decay: 0
-train_args:
- batch_size: 256
- epochs: 16
- val_metric: accuracy
diff --git a/src/training/experiments/CharacterModel_Emnist_MLP/0721_190738/model/best.pt b/src/training/experiments/CharacterModel_Emnist_MLP/0721_190738/model/best.pt
deleted file mode 100644
index 4a0333c..0000000
--- a/src/training/experiments/CharacterModel_Emnist_MLP/0721_190738/model/best.pt
+++ /dev/null
Binary files differ
diff --git a/src/training/experiments/CharacterModel_Emnist_MLP/0721_190738/model/last.pt b/src/training/experiments/CharacterModel_Emnist_MLP/0721_190738/model/last.pt
deleted file mode 100644
index 4a0333c..0000000
--- a/src/training/experiments/CharacterModel_Emnist_MLP/0721_190738/model/last.pt
+++ /dev/null
Binary files differ
diff --git a/src/training/experiments/CharacterModel_Emnist_MLP/0721_191111/config.yml b/src/training/experiments/CharacterModel_Emnist_MLP/0721_191111/config.yml
deleted file mode 100644
index a7c66c5..0000000
--- a/src/training/experiments/CharacterModel_Emnist_MLP/0721_191111/config.yml
+++ /dev/null
@@ -1,46 +0,0 @@
-criterion: CrossEntropyLoss
-criterion_args:
- ignore_index: -100
- reduction: mean
- weight: null
-data_loader_args:
- batch_size: 256
- cuda: true
- num_workers: 0
- sample_to_balance: true
- seed: 4711
- shuffle: true
- splits:
- - train
- - val
- subsample_fraction: null
- target_transform: null
- transform: null
-dataloader: EmnistDataLoader
-device: cuda:0
-experiment_group: Sample Experiments
-lr_scheduler: OneCycleLR
-lr_scheduler_args:
- epochs: 16
- max_lr: 0.001
- steps_per_epoch: 1314
-metrics:
-- accuracy
-model: CharacterModel
-network: MLP
-network_args:
- input_size: 784
- num_layers: 3
- output_size: 62
-optimizer: AdamW
-optimizer_args:
- betas:
- - 0.9
- - 0.999
- eps: 1.0e-08
- lr: 0.01
- weight_decay: 0
-train_args:
- batch_size: 256
- epochs: 16
- val_metric: accuracy
diff --git a/src/training/experiments/CharacterModel_Emnist_MLP/0721_191310/config.yml b/src/training/experiments/CharacterModel_Emnist_MLP/0721_191310/config.yml
deleted file mode 100644
index 08c344c..0000000
--- a/src/training/experiments/CharacterModel_Emnist_MLP/0721_191310/config.yml
+++ /dev/null
@@ -1,46 +0,0 @@
-criterion: CrossEntropyLoss
-criterion_args:
- ignore_index: -100
- reduction: mean
- weight: null
-data_loader_args:
- batch_size: 256
- cuda: true
- num_workers: 1
- sample_to_balance: true
- seed: 4711
- shuffle: true
- splits:
- - train
- - val
- subsample_fraction: null
- target_transform: null
- transform: null
-dataloader: EmnistDataLoader
-device: cuda:0
-experiment_group: Sample Experiments
-lr_scheduler: OneCycleLR
-lr_scheduler_args:
- epochs: 16
- max_lr: 0.001
- steps_per_epoch: 1314
-metrics:
-- accuracy
-model: CharacterModel
-network: MLP
-network_args:
- input_size: 784
- num_layers: 3
- output_size: 62
-optimizer: AdamW
-optimizer_args:
- betas:
- - 0.9
- - 0.999
- eps: 1.0e-08
- lr: 0.01
- weight_decay: 0
-train_args:
- batch_size: 256
- epochs: 16
- val_metric: accuracy
diff --git a/src/training/experiments/CharacterModel_Emnist_MLP/0721_191310/model/best.pt b/src/training/experiments/CharacterModel_Emnist_MLP/0721_191310/model/best.pt
deleted file mode 100644
index 076aae1..0000000
--- a/src/training/experiments/CharacterModel_Emnist_MLP/0721_191310/model/best.pt
+++ /dev/null
Binary files differ
diff --git a/src/training/experiments/CharacterModel_Emnist_MLP/0721_191310/model/last.pt b/src/training/experiments/CharacterModel_Emnist_MLP/0721_191310/model/last.pt
deleted file mode 100644
index 076aae1..0000000
--- a/src/training/experiments/CharacterModel_Emnist_MLP/0721_191310/model/last.pt
+++ /dev/null
Binary files differ
diff --git a/src/training/experiments/CharacterModel_Emnist_MLP/0721_191412/config.yml b/src/training/experiments/CharacterModel_Emnist_MLP/0721_191412/config.yml
deleted file mode 100644
index 0b9b10e..0000000
--- a/src/training/experiments/CharacterModel_Emnist_MLP/0721_191412/config.yml
+++ /dev/null
@@ -1,42 +0,0 @@
-criterion: CrossEntropyLoss
-criterion_args:
- ignore_index: -100
- reduction: mean
- weight: null
-data_loader_args:
- batch_size: 256
- cuda: true
- num_workers: 1
- sample_to_balance: true
- seed: 4711
- shuffle: true
- splits:
- - train
- - val
- subsample_fraction: null
- target_transform: null
- transform: null
-dataloader: EmnistDataLoader
-device: cuda:0
-experiment_group: Sample Experiments
-lr_scheduler: null
-metrics:
-- accuracy
-model: CharacterModel
-network: MLP
-network_args:
- input_size: 784
- num_layers: 3
- output_size: 62
-optimizer: RMSprop
-optimizer_args:
- alpha: 0.9
- centered: false
- eps: 1.0e-07
- lr: 0.001
- momentum: 0
- weight_decay: 0
-train_args:
- batch_size: 256
- epochs: 16
- val_metric: accuracy
diff --git a/src/training/experiments/CharacterModel_Emnist_MLP/0721_191412/model/best.pt b/src/training/experiments/CharacterModel_Emnist_MLP/0721_191412/model/best.pt
deleted file mode 100644
index 2fb0195..0000000
--- a/src/training/experiments/CharacterModel_Emnist_MLP/0721_191412/model/best.pt
+++ /dev/null
Binary files differ
diff --git a/src/training/experiments/CharacterModel_Emnist_MLP/0721_191412/model/last.pt b/src/training/experiments/CharacterModel_Emnist_MLP/0721_191412/model/last.pt
deleted file mode 100644
index 2fb0195..0000000
--- a/src/training/experiments/CharacterModel_Emnist_MLP/0721_191412/model/last.pt
+++ /dev/null
Binary files differ
diff --git a/src/training/experiments/CharacterModel_Emnist_MLP/0721_191504/config.yml b/src/training/experiments/CharacterModel_Emnist_MLP/0721_191504/config.yml
deleted file mode 100644
index 93c2854..0000000
--- a/src/training/experiments/CharacterModel_Emnist_MLP/0721_191504/config.yml
+++ /dev/null
@@ -1,42 +0,0 @@
-criterion: CrossEntropyLoss
-criterion_args:
- ignore_index: -100
- reduction: mean
- weight: null
-data_loader_args:
- batch_size: 256
- cuda: true
- num_workers: 4
- sample_to_balance: true
- seed: 4711
- shuffle: true
- splits:
- - train
- - val
- subsample_fraction: null
- target_transform: null
- transform: null
-dataloader: EmnistDataLoader
-device: cuda:0
-experiment_group: Sample Experiments
-lr_scheduler: null
-metrics:
-- accuracy
-model: CharacterModel
-network: MLP
-network_args:
- input_size: 784
- num_layers: 3
- output_size: 62
-optimizer: RMSprop
-optimizer_args:
- alpha: 0.9
- centered: false
- eps: 1.0e-07
- lr: 0.001
- momentum: 0
- weight_decay: 0
-train_args:
- batch_size: 256
- epochs: 16
- val_metric: accuracy
diff --git a/src/training/experiments/CharacterModel_Emnist_MLP/0721_191504/model/best.pt b/src/training/experiments/CharacterModel_Emnist_MLP/0721_191504/model/best.pt
deleted file mode 100644
index 9acc5b1..0000000
--- a/src/training/experiments/CharacterModel_Emnist_MLP/0721_191504/model/best.pt
+++ /dev/null
Binary files differ
diff --git a/src/training/experiments/CharacterModel_Emnist_MLP/0721_191504/model/last.pt b/src/training/experiments/CharacterModel_Emnist_MLP/0721_191504/model/last.pt
deleted file mode 100644
index b8cc01c..0000000
--- a/src/training/experiments/CharacterModel_Emnist_MLP/0721_191504/model/last.pt
+++ /dev/null
Binary files differ
diff --git a/src/training/experiments/CharacterModel_Emnist_MLP/0721_191826/config.yml b/src/training/experiments/CharacterModel_Emnist_MLP/0721_191826/config.yml
deleted file mode 100644
index 7340941..0000000
--- a/src/training/experiments/CharacterModel_Emnist_MLP/0721_191826/config.yml
+++ /dev/null
@@ -1,47 +0,0 @@
-criterion: CrossEntropyLoss
-criterion_args:
- ignore_index: -100
- reduction: mean
- weight: null
-data_loader_args:
- batch_size: 256
- cuda: true
- num_workers: 8
- sample_to_balance: true
- seed: 4711
- shuffle: true
- splits:
- - train
- - val
- subsample_fraction: null
- target_transform: null
- transform: null
-dataloader: EmnistDataLoader
-device: cuda:0
-experiment_group: Sample Experiments
-lr_scheduler: OneCycleLR
-lr_scheduler_args:
- epochs: 16
- max_lr: 0.001
- steps_per_epoch: 1314
-metrics:
-- accuracy
-model: CharacterModel
-network: MLP
-network_args:
- input_size: 784
- num_layers: 3
- output_size: 62
-optimizer: AdamW
-optimizer_args:
- amsgrad: false
- betas:
- - 0.9
- - 0.999
- eps: 1.0e-08
- lr: 0.01
- weight_decay: 0
-train_args:
- batch_size: 256
- epochs: 16
- val_metric: accuracy
diff --git a/src/training/experiments/CharacterModel_Emnist_MLP/0721_191826/model/best.pt b/src/training/experiments/CharacterModel_Emnist_MLP/0721_191826/model/best.pt
deleted file mode 100644
index 26bfb07..0000000
--- a/src/training/experiments/CharacterModel_Emnist_MLP/0721_191826/model/best.pt
+++ /dev/null
Binary files differ
diff --git a/src/training/experiments/CharacterModel_Emnist_MLP/0721_191826/model/last.pt b/src/training/experiments/CharacterModel_Emnist_MLP/0721_191826/model/last.pt
deleted file mode 100644
index 26bfb07..0000000
--- a/src/training/experiments/CharacterModel_Emnist_MLP/0721_191826/model/last.pt
+++ /dev/null
Binary files differ
diff --git a/src/training/experiments/CharacterModel_Emnist_MLP/0722_191559/config.yml b/src/training/experiments/CharacterModel_Emnist_MLP/0722_191559/config.yml
deleted file mode 100644
index 90f0e13..0000000
--- a/src/training/experiments/CharacterModel_Emnist_MLP/0722_191559/config.yml
+++ /dev/null
@@ -1,49 +0,0 @@
-criterion: CrossEntropyLoss
-criterion_args:
- ignore_index: -100
- reduction: mean
- weight: null
-data_loader_args:
- batch_size: 256
- cuda: true
- num_workers: 8
- sample_to_balance: true
- seed: 4711
- shuffle: true
- splits:
- - train
- - val
- subsample_fraction: null
- target_transform: null
- transform: null
-dataloader: EmnistDataLoader
-device: cuda:0
-experiment_group: Sample Experiments
-lr_scheduler: OneCycleLR
-lr_scheduler_args:
- epochs: 33
- max_lr: 0.001
- steps_per_epoch: 1314
-metrics:
-- accuracy
-model: CharacterModel
-network: MLP
-network_args:
- input_size: 784
- num_layers: 3
- output_size: 62
-optimizer: AdamW
-optimizer_args:
- amsgrad: false
- betas:
- - 0.9
- - 0.999
- eps: 1.0e-08
- lr: 0.01
- weight_decay: 0
-resume_experiment: last
-train_args:
- batch_size: 256
- epochs: 33
- val_metric: accuracy
-verbosity: 1
diff --git a/src/training/experiments/CharacterModel_Emnist_MLP/0722_191559/model/best.pt b/src/training/experiments/CharacterModel_Emnist_MLP/0722_191559/model/best.pt
deleted file mode 100644
index f0f297b..0000000
--- a/src/training/experiments/CharacterModel_Emnist_MLP/0722_191559/model/best.pt
+++ /dev/null
Binary files differ
diff --git a/src/training/experiments/CharacterModel_Emnist_MLP/0722_191559/model/last.pt b/src/training/experiments/CharacterModel_Emnist_MLP/0722_191559/model/last.pt
deleted file mode 100644
index c1adda5..0000000
--- a/src/training/experiments/CharacterModel_Emnist_MLP/0722_191559/model/last.pt
+++ /dev/null
Binary files differ
diff --git a/src/training/experiments/CharacterModel_Emnist_MLP/0722_213125/config.yml b/src/training/experiments/CharacterModel_Emnist_MLP/0722_213125/config.yml
deleted file mode 100644
index 8d77de5..0000000
--- a/src/training/experiments/CharacterModel_Emnist_MLP/0722_213125/config.yml
+++ /dev/null
@@ -1,49 +0,0 @@
-criterion: CrossEntropyLoss
-criterion_args:
- ignore_index: -100
- reduction: mean
- weight: null
-data_loader_args:
- batch_size: 256
- cuda: true
- num_workers: 8
- sample_to_balance: true
- seed: 4711
- shuffle: true
- splits:
- - train
- - val
- subsample_fraction: null
- target_transform: null
- transform: null
-dataloader: EmnistDataLoader
-device: cuda:0
-experiment_group: Sample Experiments
-lr_scheduler: OneCycleLR
-lr_scheduler_args:
- epochs: 16
- max_lr: 0.001
- steps_per_epoch: 1314
-metrics:
-- accuracy
-model: CharacterModel
-network: MLP
-network_args:
- input_size: 784
- num_layers: 3
- output_size: 62
-optimizer: AdamW
-optimizer_args:
- amsgrad: false
- betas:
- - 0.9
- - 0.999
- eps: 1.0e-08
- lr: 0.01
- weight_decay: 0
-resume_experiment: null
-train_args:
- batch_size: 256
- epochs: 16
- val_metric: accuracy
-verbosity: 2
diff --git a/src/training/experiments/CharacterModel_Emnist_MLP/0722_213413/config.yml b/src/training/experiments/CharacterModel_Emnist_MLP/0722_213413/config.yml
deleted file mode 100644
index 8d77de5..0000000
--- a/src/training/experiments/CharacterModel_Emnist_MLP/0722_213413/config.yml
+++ /dev/null
@@ -1,49 +0,0 @@
-criterion: CrossEntropyLoss
-criterion_args:
- ignore_index: -100
- reduction: mean
- weight: null
-data_loader_args:
- batch_size: 256
- cuda: true
- num_workers: 8
- sample_to_balance: true
- seed: 4711
- shuffle: true
- splits:
- - train
- - val
- subsample_fraction: null
- target_transform: null
- transform: null
-dataloader: EmnistDataLoader
-device: cuda:0
-experiment_group: Sample Experiments
-lr_scheduler: OneCycleLR
-lr_scheduler_args:
- epochs: 16
- max_lr: 0.001
- steps_per_epoch: 1314
-metrics:
-- accuracy
-model: CharacterModel
-network: MLP
-network_args:
- input_size: 784
- num_layers: 3
- output_size: 62
-optimizer: AdamW
-optimizer_args:
- amsgrad: false
- betas:
- - 0.9
- - 0.999
- eps: 1.0e-08
- lr: 0.01
- weight_decay: 0
-resume_experiment: null
-train_args:
- batch_size: 256
- epochs: 16
- val_metric: accuracy
-verbosity: 2
diff --git a/src/training/experiments/CharacterModel_Emnist_MLP/0722_213413/model/best.pt b/src/training/experiments/CharacterModel_Emnist_MLP/0722_213413/model/best.pt
deleted file mode 100644
index e985997..0000000
--- a/src/training/experiments/CharacterModel_Emnist_MLP/0722_213413/model/best.pt
+++ /dev/null
Binary files differ
diff --git a/src/training/experiments/CharacterModel_Emnist_MLP/0722_213413/model/last.pt b/src/training/experiments/CharacterModel_Emnist_MLP/0722_213413/model/last.pt
deleted file mode 100644
index e985997..0000000
--- a/src/training/experiments/CharacterModel_Emnist_MLP/0722_213413/model/last.pt
+++ /dev/null
Binary files differ
diff --git a/src/training/experiments/CharacterModel_Emnist_MLP/0722_213549/config.yml b/src/training/experiments/CharacterModel_Emnist_MLP/0722_213549/config.yml
deleted file mode 100644
index 8d77de5..0000000
--- a/src/training/experiments/CharacterModel_Emnist_MLP/0722_213549/config.yml
+++ /dev/null
@@ -1,49 +0,0 @@
-criterion: CrossEntropyLoss
-criterion_args:
- ignore_index: -100
- reduction: mean
- weight: null
-data_loader_args:
- batch_size: 256
- cuda: true
- num_workers: 8
- sample_to_balance: true
- seed: 4711
- shuffle: true
- splits:
- - train
- - val
- subsample_fraction: null
- target_transform: null
- transform: null
-dataloader: EmnistDataLoader
-device: cuda:0
-experiment_group: Sample Experiments
-lr_scheduler: OneCycleLR
-lr_scheduler_args:
- epochs: 16
- max_lr: 0.001
- steps_per_epoch: 1314
-metrics:
-- accuracy
-model: CharacterModel
-network: MLP
-network_args:
- input_size: 784
- num_layers: 3
- output_size: 62
-optimizer: AdamW
-optimizer_args:
- amsgrad: false
- betas:
- - 0.9
- - 0.999
- eps: 1.0e-08
- lr: 0.01
- weight_decay: 0
-resume_experiment: null
-train_args:
- batch_size: 256
- epochs: 16
- val_metric: accuracy
-verbosity: 2
diff --git a/src/training/experiments/CharacterModel_Emnist_MLP/0722_213549/model/best.pt b/src/training/experiments/CharacterModel_Emnist_MLP/0722_213549/model/best.pt
deleted file mode 100644
index 0dde787..0000000
--- a/src/training/experiments/CharacterModel_Emnist_MLP/0722_213549/model/best.pt
+++ /dev/null
Binary files differ
diff --git a/src/training/experiments/CharacterModel_Emnist_MLP/0722_213549/model/last.pt b/src/training/experiments/CharacterModel_Emnist_MLP/0722_213549/model/last.pt
deleted file mode 100644
index e02738b..0000000
--- a/src/training/experiments/CharacterModel_Emnist_MLP/0722_213549/model/last.pt
+++ /dev/null
Binary files differ
diff --git a/src/training/experiments/sample.yml b/src/training/experiments/sample.yml
deleted file mode 100644
index 0ed560d..0000000
--- a/src/training/experiments/sample.yml
+++ /dev/null
@@ -1,43 +0,0 @@
-experiment_group: Sample Experiments
-experiments:
- - dataloader: EmnistDataLoader
- model: CharacterModel
- metrics: [accuracy]
- network: MLP
- network_args:
- input_shape: 784
- num_layers: 2
- train_args:
- batch_size: 256
- epochs: 16
- criterion: CrossEntropyLoss
- criterion_args:
- weight: null
- ignore_index: -100
- reduction: mean
- optimizer: AdamW
- optimizer_args:
- lr: 3.e-4
- betas: [0.9, 0.999]
- eps: 1.e-08
- weight_decay: 0
- amsgrad: false
- lr_scheduler: OneCycleLR
- lr_scheduler_args:
- max_lr: 3.e-5
- epochs: 16
- # - dataloader: EmnistDataLoader
- # model: CharacterModel
- # network: MLP
- # network_args:
- # input_shape: 784
- # num_layers: 4
- # train_args:
- # batch_size: 256
- # - dataloader: EmnistDataLoader
- # model: CharacterModel
- # network: LeNet
- # network_args:
- # input_shape: [28, 28]
- # train_args:
- # batch_size: 256
diff --git a/src/training/experiments/sample_experiment.yml b/src/training/experiments/sample_experiment.yml
index e8d5023..70edb63 100644
--- a/src/training/experiments/sample_experiment.yml
+++ b/src/training/experiments/sample_experiment.yml
@@ -1,6 +1,6 @@
experiment_group: Sample Experiments
experiments:
- - dataloader: EmnistDataLoader
+ - dataloader: EmnistDataLoaders
data_loader_args:
splits: [train, val]
sample_to_balance: true
@@ -14,19 +14,18 @@ experiments:
seed: 4711
model: CharacterModel
metrics: [accuracy]
- network: MLP
- network_args:
- input_size: 784
- output_size: 62
- num_layers: 3
- # network: LeNet
+ # network: MLP
# network_args:
- # input_size: [28, 28]
+ # input_size: 784
# output_size: 62
+ # num_layers: 3
+ network: LeNet
+ network_args:
+ input_size: [28, 28]
+ output_size: 62
train_args:
batch_size: 256
epochs: 16
- val_metric: accuracy
criterion: CrossEntropyLoss
criterion_args:
weight: null
@@ -52,5 +51,20 @@ experiments:
lr_scheduler_args:
max_lr: 1.e-3
epochs: 16
- verbosity: 2 # 0, 1, 2
+ callbacks: [Checkpoint, EarlyStopping, WandbCallback, WandbImageLogger, OneCycleLR]
+ callback_args:
+ Checkpoint:
+ monitor: val_accuracy
+ EarlyStopping:
+ monitor: val_loss
+ min_delta: 0.0
+ patience: 3
+ mode: min
+ WandbCallback:
+ log_batch_frequency: 10
+ WandbImageLogger:
+ num_examples: 4
+ OneCycleLR:
+ null
+ verbosity: 1 # 0, 1, 2
resume_experiment: null
diff --git a/src/training/prepare_experiments.py b/src/training/prepare_experiments.py
index eb872d7..5a665b3 100644
--- a/src/training/prepare_experiments.py
+++ b/src/training/prepare_experiments.py
@@ -1,12 +1,13 @@
"""Run an experiment from a config file."""
import json
-from subprocess import check_call
+from subprocess import run
import click
from loguru import logger
import yaml
+# flake8: noqa: S404,S607,S603
def run_experiments(experiments_filename: str) -> None:
"""Run experiment from file."""
with open(experiments_filename) as f:
@@ -15,10 +16,19 @@ def run_experiments(experiments_filename: str) -> None:
for index in range(num_experiments):
experiment_config = experiments_config["experiments"][index]
experiment_config["experiment_group"] = experiments_config["experiment_group"]
- # cmd = f"python training/run_experiment.py --gpu=-1 '{json.dumps(experiment_config)}'"
- cmd = f"poetry run run-experiment --gpu=-1 --save --experiment_config '{json.dumps(experiment_config)}'"
+ cmd = f"poetry run run-experiment --gpu=-1 --save --experiment_config={json.dumps(experiment_config)}"
print(cmd)
- check_call(cmd, shell=True)
+ run(
+ [
+ "poetry",
+ "run",
+ "run-experiment",
+ "--gpu=-1",
+ "--save",
+ f"--experiment_config={json.dumps(experiment_config)}",
+ ],
+ check=True,
+ )
@click.command()
diff --git a/src/training/run_experiment.py b/src/training/run_experiment.py
index 0b29ce9..c133ce5 100644
--- a/src/training/run_experiment.py
+++ b/src/training/run_experiment.py
@@ -12,8 +12,10 @@ import click
from loguru import logger
import torch
from tqdm import tqdm
+from training.callbacks import CallbackList
from training.gpu_manager import GPUManager
from training.train import Trainer
+import wandb
import yaml
@@ -48,9 +50,8 @@ def create_experiment_dir(model: Callable, experiment_config: Dict) -> Path:
logger.debug(f"Resuming the latest experiment {experiment}")
else:
experiment = experiment_config["resume_experiment"]
- assert (
- str(experiment_dir / experiment) in available_experiments
- ), "Experiment does not exist."
+ if not str(experiment_dir / experiment) in available_experiments:
+ raise FileNotFoundError("Experiment does not exist.")
logger.debug(f"Resuming the experiment {experiment}")
experiment_dir = experiment_dir / experiment
@@ -87,6 +88,13 @@ def load_modules_and_arguments(experiment_config: Dict) -> Tuple[Callable, Dict]
optimizer_ = getattr(torch.optim, experiment_config["optimizer"])
optimizer_args = experiment_config.get("optimizer_args", {})
+ # Callbacks
+ callback_modules = importlib.import_module("training.callbacks")
+ callbacks = []
+ for callback in experiment_config["callbacks"]:
+ args = experiment_config["callback_args"][callback] or {}
+ callbacks.append(getattr(callback_modules, callback)(**args))
+
# Learning rate scheduler
if experiment_config["lr_scheduler"] is not None:
lr_scheduler_ = getattr(
@@ -111,7 +119,7 @@ def load_modules_and_arguments(experiment_config: Dict) -> Tuple[Callable, Dict]
"lr_scheduler_args": lr_scheduler_args,
}
- return model_class_, model_args
+ return model_class_, model_args, callbacks
def run_experiment(
@@ -120,11 +128,14 @@ def run_experiment(
"""Runs an experiment."""
# Load the modules and model arguments.
- model_class_, model_args = load_modules_and_arguments(experiment_config)
+ model_class_, model_args, callbacks = load_modules_and_arguments(experiment_config)
# Initializes the model with experiment config.
model = model_class_(**model_args, device=device)
+ # Instantiate a CallbackList.
+ callbacks = CallbackList(model, callbacks)
+
# Create new experiment.
experiment_dir = create_experiment_dir(model, experiment_config)
@@ -132,6 +143,9 @@ def run_experiment(
log_dir = experiment_dir / "log"
model_dir = experiment_dir / "model"
+ # Set the model dir to be able to save checkpoints.
+ model.model_dir = model_dir
+
# Get checkpoint path.
checkpoint_path = model_dir / "last.pt"
if not checkpoint_path.exists():
@@ -162,6 +176,13 @@ def run_experiment(
logger.info(f"The class mapping is {model.mapping}")
+ # Initializes Weights & Biases
+ if use_wandb:
+ wandb.init(project="text-recognizer", config=experiment_config)
+
+ # Lets W&B save the model and track the gradients and optional parameters.
+ wandb.watch(model.network)
+
# Prints a summary of the network in terminal.
model.summary()
@@ -181,21 +202,26 @@ def run_experiment(
with open(str(config_path), "w") as f:
yaml.dump(experiment_config, f)
- # TODO: wandb
trainer = Trainer(
model=model,
model_dir=model_dir,
- epochs=experiment_config["train_args"]["epochs"],
- val_metric=experiment_config["train_args"]["val_metric"],
+ train_args=experiment_config["train_args"],
+ callbacks=callbacks,
checkpoint_path=checkpoint_path,
)
trainer.fit()
+ logger.info("Loading checkpoint with the best weights.")
+ model.load_checkpoint(model_dir / "best.pt")
+
score = trainer.validate()
logger.info(f"Validation set evaluation: {score}")
+ if use_wandb:
+ wandb.log({"validation_metric": score["val_accuracy"]})
+
if save_weights:
model.save_weights(model_dir)
@@ -220,12 +246,11 @@ def main(experiment_config: str, gpu: int, save: bool, nowandb: bool) -> None:
if gpu < 0:
gpu_manager = GPUManager(True)
gpu = gpu_manager.get_free_gpu()
-
device = "cuda:" + str(gpu)
experiment_config = json.loads(experiment_config)
os.environ["CUDA_VISIBLE_DEVICES"] = f"{gpu}"
- run_experiment(experiment_config, save, device, nowandb)
+ run_experiment(experiment_config, save, device, use_wandb=not nowandb)
if __name__ == "__main__":
diff --git a/src/training/train.py b/src/training/train.py
index 8cd5110..3334c2e 100644
--- a/src/training/train.py
+++ b/src/training/train.py
@@ -2,17 +2,19 @@
from pathlib import Path
import time
-from typing import Dict, Optional, Type
+from typing import Dict, List, Optional, Tuple, Type
from loguru import logger
import numpy as np
import torch
from tqdm import tqdm, trange
+from training.callbacks import Callback, CallbackList
from training.util import RunningAverage
import wandb
from text_recognizer.models import Model
+
torch.backends.cudnn.benchmark = True
np.random.seed(4711)
torch.manual_seed(4711)
@@ -22,51 +24,82 @@ torch.cuda.manual_seed(4711)
class Trainer:
"""Trainer for training PyTorch models."""
- # TODO implement wandb.
- # TODO implement Bayesian parameter search.
-
def __init__(
self,
model: Type[Model],
model_dir: Path,
- epochs: int,
- val_metric: str = "accuracy",
+ train_args: Dict,
+ callbacks: CallbackList,
checkpoint_path: Optional[Path] = None,
- use_wandb: Optional[bool] = False,
) -> None:
"""Initialization of the Trainer.
Args:
model (Type[Model]): A model object.
model_dir (Path): Path to the model directory.
- epochs (int): Number of epochs to train.
- val_metric (str): The validation metric to evaluate the model on. Defaults to "accuracy".
+ train_args (Dict): The training arguments.
+ callbacks (CallbackList): List of callbacks to be called.
checkpoint_path (Optional[Path]): The path to a previously trained model. Defaults to None.
- use_wandb (Optional[bool]): Sync training to wandb.
"""
self.model = model
self.model_dir = model_dir
- self.epochs = epochs
self.checkpoint_path = checkpoint_path
- self.start_epoch = 0
+ self.start_epoch = 1
+ self.epochs = train_args["epochs"] + self.start_epoch
+ self.callbacks = callbacks
if self.checkpoint_path is not None:
- self.start_epoch = self.model.load_checkpoint(self.checkpoint_path)
-
- if use_wandb:
- # TODO implement wandb logging.
- pass
-
- self.val_metric = val_metric
- self.best_val_metric = 0.0
+ self.start_epoch = self.model.load_checkpoint(self.checkpoint_path) + 1
# Parse the name of the experiment.
experiment_dir = str(self.model_dir.parents[1]).split("/")
self.experiment_name = experiment_dir[-2] + "/" + experiment_dir[-1]
+ def training_step(
+ self,
+ batch: int,
+ samples: Tuple[torch.Tensor, torch.Tensor],
+ loss_avg: Type[RunningAverage],
+ ) -> Dict:
+ """Performs the training step."""
+ # Pass the tensor to the device for computation.
+ data, targets = samples
+ data, targets = (
+ data.to(self.model.device),
+ targets.to(self.model.device),
+ )
+
+ # Forward pass.
+ # Get the network prediction.
+ output = self.model.network(data)
+
+ # Compute the loss.
+ loss = self.model.criterion(output, targets)
+
+ # Backward pass.
+ # Clear the previous gradients.
+ self.model.optimizer.zero_grad()
+
+ # Compute the gradients.
+ loss.backward()
+
+ # Perform updates using calculated gradients.
+ self.model.optimizer.step()
+
+ # Compute metrics.
+ loss_avg.update(loss.item())
+ output = output.data.cpu()
+ targets = targets.data.cpu()
+ metrics = {
+ metric: self.model.metrics[metric](output, targets)
+ for metric in self.model.metrics
+ }
+ metrics["loss"] = loss_avg()
+ return metrics
+
def train(self) -> None:
- """Training loop."""
+ """Runs the training loop for one epoch."""
# Set model to training mode.
self.model.train()
@@ -79,57 +112,54 @@ class Trainer:
total=len(data_loader),
leave=False,
unit="step",
- bar_format="{n_fmt}/{total_fmt} |{bar:20}| {remaining} {rate_inv_fmt}{postfix}",
+ bar_format="{n_fmt}/{total_fmt} |{bar:30}| {remaining} {rate_inv_fmt}{postfix}",
) as t:
- for data, targets in data_loader:
+ for batch, samples in enumerate(data_loader):
+ self.callbacks.on_train_batch_begin(batch)
- data, targets = (
- data.to(self.model.device),
- targets.to(self.model.device),
- )
+ metrics = self.training_step(batch, samples, loss_avg)
- # Forward pass.
- # Get the network prediction.
- output = self.model.network(data)
-
- # Compute the loss.
- loss = self.model.criterion(output, targets)
-
- # Backward pass.
- # Clear the previous gradients.
- self.model.optimizer.zero_grad()
-
- # Compute the gradients.
- loss.backward()
-
- # Perform updates using calculated gradients.
- self.model.optimizer.step()
-
- # Compute metrics.
- loss_avg.update(loss.item())
- output = output.data.cpu()
- targets = targets.data.cpu()
- metrics = {
- metric: self.model.metrics[metric](output, targets)
- for metric in self.model.metrics
- }
- metrics["loss"] = loss_avg()
+ self.callbacks.on_train_batch_end(batch, logs=metrics)
# Update Tqdm progress bar.
t.set_postfix(**metrics)
t.update()
- # If the model has a learning rate scheduler, compute a step.
- if self.model.lr_scheduler is not None:
- self.model.lr_scheduler.step()
-
- def validate(self) -> Dict:
- """Evaluation loop.
+ def validation_step(
+ self,
+ batch: int,
+ samples: Tuple[torch.Tensor, torch.Tensor],
+ loss_avg: Type[RunningAverage],
+ ) -> Dict:
+ """Performs the validation step."""
+ # Pass the tensor to the device for computation.
+ data, targets = samples
+ data, targets = (
+ data.to(self.model.device),
+ targets.to(self.model.device),
+ )
+
+ # Forward pass.
+ # Get the network prediction.
+ output = self.model.network(data)
+
+ # Compute the loss.
+ loss = self.model.criterion(output, targets)
+
+ # Compute metrics.
+ loss_avg.update(loss.item())
+ output = output.data.cpu()
+ targets = targets.data.cpu()
+ metrics = {
+ metric: self.model.metrics[metric](output, targets)
+ for metric in self.model.metrics
+ }
+ metrics["loss"] = loss.item()
- Returns:
- Dict: A dictionary of evaluation metrics.
+ return metrics
- """
+ def validate(self, epoch: Optional[int] = None) -> Dict:
+ """Runs the validation loop for one epoch."""
# Set model to eval mode.
self.model.eval()
@@ -146,44 +176,37 @@ class Trainer:
total=len(data_loader),
leave=False,
unit="step",
- bar_format="{n_fmt}/{total_fmt} |{bar:20}| {remaining} {rate_inv_fmt}{postfix}",
+ bar_format="{n_fmt}/{total_fmt} |{bar:30}| {remaining} {rate_inv_fmt}{postfix}",
) as t:
- for data, targets in data_loader:
- data, targets = (
- data.to(self.model.device),
- targets.to(self.model.device),
- )
-
- with torch.no_grad():
- # Forward pass.
- # Get the network prediction.
- output = self.model.network(data)
-
- # Compute the loss.
- loss = self.model.criterion(output, targets)
-
- # Compute metrics.
- loss_avg.update(loss.item())
- output = output.data.cpu()
- targets = targets.data.cpu()
- metrics = {
- metric: self.model.metrics[metric](output, targets)
- for metric in self.model.metrics
- }
- metrics["loss"] = loss.item()
-
- summary.append(metrics)
+ with torch.no_grad():
+ for batch, samples in enumerate(data_loader):
+ self.callbacks.on_validation_batch_begin(batch)
- # Update Tqdm progress bar.
- t.set_postfix(**metrics)
- t.update()
+ metrics = self.validation_step(batch, samples, loss_avg)
+
+ self.callbacks.on_validation_batch_end(batch, logs=metrics)
+
+ summary.append(metrics)
+
+ # Update Tqdm progress bar.
+ t.set_postfix(**metrics)
+ t.update()
# Compute mean of all metrics.
metrics_mean = {
- metric: np.mean([x[metric] for x in summary]) for metric in summary[0]
+ "val_" + metric: np.mean([x[metric] for x in summary])
+ for metric in summary[0]
}
- metrics_str = " - ".join(f"{k}: {v}" for k, v in metrics_mean.items())
- logger.debug(metrics_str)
+ if epoch:
+ logger.debug(
+ f"Validation metrics at epoch {epoch} - "
+ + " - ".join(f"{k}: {v:.4f}" for k, v in metrics_mean.items())
+ )
+ else:
+ logger.debug(
+ "Validation metrics - "
+ + " - ".join(f"{k}: {v:.4f}" for k, v in metrics_mean.items())
+ )
return metrics_mean
@@ -192,31 +215,35 @@ class Trainer:
logger.debug(f"Running an experiment called {self.experiment_name}.")
t_start = time.time()
+
+ self.callbacks.on_fit_begin()
+
+ # TODO: fix progress bar as callback.
# Run the training loop.
for epoch in trange(
+ self.start_epoch,
self.epochs,
- initial=self.start_epoch,
leave=False,
- bar_format="{desc}: {n_fmt}/{total_fmt} |{bar:10}| {remaining}{postfix}",
+ bar_format="{desc}: {n_fmt}/{total_fmt} |{bar:30}| {remaining}{postfix}",
desc="Epoch",
):
+ self.callbacks.on_epoch_begin(epoch)
+
# Perform one training pass over the training set.
self.train()
# Evaluate the model on the validation set.
- val_metrics = self.validate()
+ val_metrics = self.validate(epoch)
- # The validation metric to evaluate the model on, e.g. accuracy.
- val_metric = val_metrics[self.val_metric]
- is_best = val_metric >= self.best_val_metric
- self.best_val_metric = val_metric if is_best else self.best_val_metric
- # Save checkpoint.
- self.model.save_checkpoint(self.model_dir, is_best, epoch, self.val_metric)
+ self.callbacks.on_epoch_end(epoch, logs=val_metrics)
- if self.start_epoch > 0 and epoch + self.start_epoch == self.epochs:
- logger.debug(f"Trained the model for {self.epochs} number of epochs.")
+ if self.model.stop_training:
break
+ # Calculate the total training time.
t_end = time.time()
t_training = t_end - t_start
+
+ self.callbacks.on_fit_end()
+
logger.info(f"Training took {t_training:.2f} s.")