diff options
Diffstat (limited to 'src/text_recognizer/networks')
-rw-r--r-- | src/text_recognizer/networks/__init__.py | 2 | ||||
-rw-r--r-- | src/text_recognizer/networks/sparse_mlp.py | 78 | ||||
-rw-r--r-- | src/text_recognizer/networks/unet.py | 158 |
3 files changed, 158 insertions, 80 deletions
"""UNet for segmentation."""
# Content of src/text_recognizer/networks/unet.py, the file added by this
# change. The same change removes the SparseMLP import and "SparseMLP" entry of
# __all__ from src/text_recognizer/networks/__init__.py and deletes
# src/text_recognizer/networks/sparse_mlp.py.
from typing import List, Optional, Tuple, Union

import torch
from torch import nn
from torch import Tensor

from text_recognizer.networks.util import activation_function


class ConvBlock(nn.Module):
    """Basic UNet convolutional block.

    Builds one ``Conv2d -> BatchNorm2d -> activation`` stage for every pair of
    consecutive entries in ``channels``.
    """

    def __init__(self, channels: List[int], activation: str) -> None:
        """Create the block.

        Args:
            channels: Channel sizes; entry ``i`` is the input width and entry
                ``i + 1`` the output width of stage ``i``.
            activation: Name passed to ``activation_function`` to obtain the
                activation module.
        """
        super().__init__()
        self.channels = channels
        self.activation = activation_function(activation)
        self.block = self._configure_block()

    def _configure_block(self) -> nn.Sequential:
        """Assemble the conv/batch-norm/activation stages into a Sequential."""
        layers: List[nn.Module] = []
        for in_channels, out_channels in zip(self.channels[:-1], self.channels[1:]):
            layers += [
                # 3x3 kernel with padding 1 and default stride 1.
                nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1),
                nn.BatchNorm2d(out_channels),
                # NOTE: the same activation instance is reused in every stage.
                self.activation,
            ]
        return nn.Sequential(*layers)

    def forward(self, x: Tensor) -> Tensor:
        """Apply the convolutional block."""
        return self.block(x)


class DownSamplingBlock(nn.Module):
    """Basic down sampling block: a ConvBlock followed by optional max pooling."""

    def __init__(
        self,
        channels: List[int],
        activation: str,
        pooling_kernel: Union[int, bool] = 2,
    ) -> None:
        """Create the block.

        Args:
            channels: Channel sizes forwarded to ``ConvBlock``.
            activation: Activation name forwarded to ``ConvBlock``.
            pooling_kernel: Kernel size for ``nn.MaxPool2d``. Any falsy value
                (``False``, ``0``) disables pooling.
        """
        super().__init__()
        self.conv_block = ConvBlock(channels, activation)
        self.down_sampling = nn.MaxPool2d(pooling_kernel) if pooling_kernel else None

    def forward(self, x: Tensor) -> Tuple[Optional[Tensor], Tensor]:
        """Return the down sampled tensor and the convolutional block output.

        Returns:
            A tuple ``(x_down, x)``. ``x`` is the ConvBlock output and
            ``x_down`` its max-pooled version, or ``None`` when pooling is
            disabled for this block.
        """
        x = self.conv_block(x)
        x_down = self.down_sampling(x) if self.down_sampling is not None else None
        return x_down, x


class UpSamplingBlock(nn.Module):
    """The upsampling block of the UNet."""

    def __init__(
        self, channels: List[int], activation: str, scale_factor: int = 2
    ) -> None:
        """Create the block.

        Args:
            channels: Channel sizes forwarded to ``ConvBlock``. The first entry
                must equal the channel count seen after the optional skip
                concatenation in ``forward``.
            activation: Activation name forwarded to ``ConvBlock``.
            scale_factor: Spatial scale factor of the bilinear upsampling.
        """
        super().__init__()
        self.conv_block = ConvBlock(channels, activation)
        self.up_sampling = nn.Upsample(
            scale_factor=scale_factor, mode="bilinear", align_corners=True
        )

    def forward(self, x: Tensor, x_skip: Optional[Tensor] = None) -> Tensor:
        """Apply the up sampling and convolutional block.

        Args:
            x: Tensor to upsample.
            x_skip: Optional skip connection, concatenated with the upsampled
                ``x`` along the channel dimension (``dim=1``).
        """
        x = self.up_sampling(x)
        if x_skip is not None:
            x = torch.cat((x, x_skip), dim=1)
        return self.conv_block(x)


class UNet(nn.Module):
    """UNet architecture.

    An encoder of ``depth`` down sampling blocks, a decoder of ``depth - 1``
    up sampling blocks with skip connections, and a 1x1 convolutional head.
    """

    def __init__(
        self,
        in_channels: int = 1,
        base_channels: int = 64,
        num_classes: int = 3,
        depth: int = 4,
        out_channels: int = 3,
        activation: str = "relu",
        pooling_kernel: int = 2,
        scale_factor: int = 2,
    ) -> None:
        """Build the encoder, decoder and classification head.

        Args:
            in_channels: Number of channels of the input image.
            base_channels: Width of the first encoder block; block ``i`` has
                ``base_channels * 2 ** i`` channels.
            num_classes: Number of output channels of the 1x1 head.
            depth: Number of encoder blocks. Must be at least 1.
            out_channels: Currently unused; kept so existing callers and
                configurations that pass it keep working.
            activation: Activation name passed to ``activation_function``.
            pooling_kernel: Max-pool kernel size used in the encoder.
            scale_factor: Upsampling factor used in the decoder.

        Raises:
            ValueError: If ``depth`` is smaller than 1.
        """
        super().__init__()
        if depth < 1:
            raise ValueError(f"depth must be at least 1, got {depth}")
        self.depth = depth
        # The first entry is the input width. It was previously hard-coded to 1,
        # which silently ignored the in_channels argument.
        channels = [in_channels] + [base_channels * 2 ** i for i in range(depth)]
        self.encoder_blocks = self._configure_down_sampling_blocks(
            channels, activation, pooling_kernel
        )
        self.decoder_blocks = self._configure_up_sampling_blocks(
            channels, activation, scale_factor
        )

        self.head = nn.Conv2d(base_channels, num_classes, kernel_size=1)

    def _configure_down_sampling_blocks(
        self, channels: List[int], activation: str, pooling_kernel: int
    ) -> nn.ModuleList:
        """Create one DownSamplingBlock per consecutive pair in ``channels``.

        Only the first ``self.depth - 1`` blocks pool, so the deepest block
        keeps its resolution.
        """
        blocks = nn.ModuleList()
        channel_pairs = zip(channels[:-1], channels[1:])
        for i, (in_channels, out_channels) in enumerate(channel_pairs):
            kernel: Union[int, bool] = pooling_kernel if i < self.depth - 1 else False
            blocks.append(
                DownSamplingBlock(
                    [in_channels, out_channels, out_channels], activation, kernel
                )
            )
        return blocks

    def _configure_up_sampling_blocks(
        self,
        channels: List[int],
        activation: str,
        scale_factor: int,
    ) -> nn.ModuleList:
        """Create the decoder blocks, ordered from the deepest level upwards.

        ``channels`` is left unmodified.
        """
        # Encoder widths from deepest to shallowest. The leading input-channel
        # entry is dropped because no decoder block produces it.
        widths = channels[:0:-1]
        return nn.ModuleList(
            [
                # Input width is deep + shallow: the upsampled deep feature map
                # is concatenated with the skip connection of the level above.
                UpSamplingBlock(
                    [deep + shallow, shallow, shallow], activation, scale_factor
                )
                for deep, shallow in zip(widths[:-1], widths[1:])
            ]
        )

    def encode(self, x: Tensor) -> Tuple[Tensor, List[Tensor]]:
        """Run the encoder.

        Returns:
            A tuple ``(x, x_skips)``. ``x_skips`` holds the ConvBlock output of
            every encoder block, shallowest first. ``x`` is the output of the
            last block, which is also ``x_skips[-1]`` for the encoder built in
            ``__init__``.
        """
        x_skips: List[Tensor] = []
        for block in self.encoder_blocks:
            x_down, x_skip = block(x)
            x_skips.append(x_skip)
            # The deepest block does not pool, so carry its feature map on
            # instead of None.
            x = x_skip if x_down is None else x_down
        return x, x_skips

    def decode(self, x: Tensor, x_skips: List[Tensor]) -> Tensor:
        """Run the decoder.

        Args:
            x: Ignored; decoding always starts from ``x_skips[-1]``. The
                parameter is kept for interface compatibility.
            x_skips: Encoder feature maps as returned by ``encode``.
        """
        x = x_skips[-1]
        for i, block in enumerate(self.decoder_blocks):
            x = block(x, x_skips[-(i + 2)])
        return x

    def forward(self, x: Tensor) -> Tensor:
        """Apply encoder, decoder and head to ``x``.

        NOTE: the decoder concatenates each upsampled map with an encoder map,
        so their spatial sizes must match. With the default
        ``pooling_kernel == scale_factor == 2`` this presumably requires the
        input height and width to be divisible by ``2 ** (depth - 1)``.
        """
        x, x_skips = self.encode(x)
        x = self.decode(x, x_skips)
        return self.head(x)