1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
|
"""Callback checkpoint for training models."""
from enum import Enum
from pathlib import Path
from typing import Callable, Dict, List, Optional, Type, Union
from loguru import logger
import numpy as np
import torch
from training.trainer.callbacks import Callback
from text_recognizer.models import Model
class Checkpoint(Callback):
    """Saves the model parameters at the end of each epoch.

    The callback tracks one monitored metric and tells the model whether the
    current epoch produced the best value seen so far.
    """

    # Maps a mode name to the comparison that decides whether a new score
    # beats the best score seen so far.
    mode_dict = {
        "min": torch.lt,
        "max": torch.gt,
    }

    def __init__(
        self,
        checkpoint_path: Union[str, Path],
        monitor: str = "accuracy",
        mode: str = "auto",
        min_delta: float = 0.0,
    ) -> None:
        """Monitors a quantity that will allow us to determine the best model weights.

        Args:
            checkpoint_path (Union[str, Path]): Path to the experiment with the checkpoint.
            monitor (str): Name of the quantity to monitor. Defaults to "accuracy".
            mode (str): One of "auto", "min" or "max". With "min" a lower monitored value
                counts as better, with "max" a higher one. With "auto" the mode becomes
                "max" when the monitor name contains "accuracy" and "min" otherwise.
                Unknown values fall back to "auto". Defaults to "auto".
            min_delta (float): Margin by which the monitored value has to beat the best
                score before it is considered an improvement. Defaults to 0.0.

        """
        super().__init__()
        self.checkpoint_path = Path(checkpoint_path)
        self.monitor = monitor
        self.mode = mode
        self.min_delta = torch.tensor(min_delta)

        if mode not in ("auto", "min", "max"):
            logger.warning(f"Checkpoint mode {mode} is unkown, fallback to auto mode.")
            self.mode = "auto"

        if self.mode == "auto":
            # Accuracy-like metrics should grow; everything else is treated as a loss.
            self.mode = "max" if "accuracy" in self.monitor else "min"
            logger.debug(
                f"Checkpoint mode set to {self.mode} for monitoring {self.monitor}."
            )

        torch_inf = torch.tensor(np.inf)
        if self.monitor_op == torch.gt:
            # Maximizing: start from -inf, min_delta keeps its sign.
            self.best_score = -torch_inf
        else:
            # Minimizing: flip the sign so that `current - min_delta` in
            # `on_epoch_end` becomes `current + |min_delta|`, and start from +inf.
            self.min_delta *= -1
            self.best_score = torch_inf

    @property
    def monitor_op(self) -> Callable:
        """Returns the comparison function (`torch.lt` or `torch.gt`) for the current mode."""
        return self.mode_dict[self.mode]

    def on_epoch_end(self, epoch: int, logs: Dict) -> None:
        """Saves a checkpoint for the network parameters.

        Nothing is saved when the monitored metric is missing from `logs`.
        Otherwise a checkpoint is written every epoch, flagged as best when the
        monitored value improved on the best score by more than `min_delta`.

        Args:
            epoch (int): The current epoch.
            logs (Dict): The log containing the monitored metrics.

        """
        current = self.get_monitor_value(logs)
        if current is None:
            return

        is_best = bool(self.monitor_op(current - self.min_delta, self.best_score))
        if is_best:
            self.best_score = current

        # NOTE(review): `self.model` is not set in this class; presumably the
        # Callback base class or the trainer attaches it - confirm.
        self.model.save_checkpoint(self.checkpoint_path, is_best, epoch, self.monitor)

    def get_monitor_value(self, logs: Dict) -> Optional[float]:
        """Extracts the monitored value.

        Args:
            logs (Dict): The log containing the monitored metrics.

        Returns:
            Optional[float]: The value stored under `self.monitor`, or None (after
                logging a warning) when the metric is not present in `logs`.

        """
        monitor_value = logs.get(self.monitor)
        if monitor_value is None:
            # `map(str, ...)` keeps the warning from raising on non-string keys.
            logger.warning(
                f"Checkpoint is conditioned on metric {self.monitor} which is not available. Available"
                + f" metrics are: {','.join(map(str, logs))}"
            )
            return None
        return monitor_value
|