summary refs log tree commit diff
path: root/src/text_recognizer
diff options
context:
space:
mode:
Diffstat (limited to 'src/text_recognizer')
-rw-r--r-- src/text_recognizer/networks/__init__.py 2
-rw-r--r-- src/text_recognizer/networks/sparse_mlp.py 78
-rw-r--r-- src/text_recognizer/networks/unet.py 158
3 files changed, 158 insertions, 80 deletions
diff --git a/src/text_recognizer/networks/__init__.py b/src/text_recognizer/networks/__init__.py
index 078d771..1635039 100644
--- a/src/text_recognizer/networks/__init__.py
+++ b/src/text_recognizer/networks/__init__.py
@@ -7,7 +7,6 @@ from .lenet import LeNet
from .metrics import accuracy, accuracy_ignore_pad, cer, wer
from .mlp import MLP
from .residual_network import ResidualNetwork, ResidualNetworkEncoder
-from .sparse_mlp import SparseMLP
from .transformer import Transformer
from .util import sliding_window
from .wide_resnet import WideResidualNetwork
@@ -26,7 +25,6 @@ __all__ = [
"ResidualNetworkEncoder",
"sliding_window",
"Transformer",
- "SparseMLP",
"wer",
"WideResidualNetwork",
]
diff --git a/src/text_recognizer/networks/sparse_mlp.py b/src/text_recognizer/networks/sparse_mlp.py
deleted file mode 100644
index 53cf166..0000000
--- a/src/text_recognizer/networks/sparse_mlp.py
+++ /dev/null
@@ -1,78 +0,0 @@
-"""Defines the Sparse MLP network."""
-from typing import Callable, Dict, List, Optional, Union
-import warnings
-
-from einops.layers.torch import Rearrange
-from pytorch_block_sparse import BlockSparseLinear
-import torch
-from torch import nn
-
-from text_recognizer.networks.util import activation_function
-
-warnings.filterwarnings("ignore", category=DeprecationWarning)
-
-
-class SparseMLP(nn.Module):
- """Sparse multi layered perceptron network."""
-
- def __init__(
- self,
- input_size: int = 784,
- num_classes: int = 10,
- hidden_size: Union[int, List] = 128,
- num_layers: int = 3,
- density: float = 0.1,
- activation_fn: str = "relu",
- ) -> None:
- """Initialization of the MLP network.
-
- Args:
- input_size (int): The input shape of the network. Defaults to 784.
- num_classes (int): Number of classes in the dataset. Defaults to 10.
- hidden_size (Union[int, List]): The number of `neurons` in each hidden layer. Defaults to 128.
- num_layers (int): The number of hidden layers. Defaults to 3.
- density (float): The density of activation at each layer. Default to 0.1.
- activation_fn (str): Name of the activation function in the hidden layers. Defaults to
- relu.
-
- """
- super().__init__()
-
- activation_fn = activation_function(activation_fn)
-
- if isinstance(hidden_size, int):
- hidden_size = [hidden_size] * num_layers
-
- self.layers = [
- Rearrange("b c h w -> b (c h w)"),
- nn.Linear(in_features=input_size, out_features=hidden_size[0]),
- activation_fn,
- ]
-
- for i in range(num_layers - 1):
- self.layers += [
- BlockSparseLinear(
- in_features=hidden_size[i],
- out_features=hidden_size[i + 1],
- density=density,
- ),
- activation_fn,
- ]
-
- self.layers.append(
- nn.Linear(in_features=hidden_size[-1], out_features=num_classes)
- )
-
- self.layers = nn.Sequential(*self.layers)
-
- def forward(self, x: torch.Tensor) -> torch.Tensor:
- """The feedforward pass."""
- # If batch dimenstion is missing, it needs to be added.
- if len(x.shape) < 4:
- x = x[(None,) * (4 - len(x.shape))]
- return self.layers(x)
-
- @property
- def __name__(self) -> str:
- """Returns the name of the network."""
- return "mlp"
diff --git a/src/text_recognizer/networks/unet.py b/src/text_recognizer/networks/unet.py
new file mode 100644
index 0000000..51f242a
--- /dev/null
+++ b/src/text_recognizer/networks/unet.py
@@ -0,0 +1,255 @@
+"""UNet for segmentation."""
+from typing import List, Optional, Tuple, Union
+
+import torch
+from torch import nn
+from torch import Tensor
+
+from text_recognizer.networks.util import activation_function
+
+
+class ConvBlock(nn.Module):
+    """Basic UNet convolutional block.
+
+    Stacks one `Conv2d(kernel_size=3, padding=1) -> BatchNorm2d -> activation`
+    stage for every consecutive pair of entries in `channels`.
+    """
+
+    def __init__(self, channels: List[int], activation: str) -> None:
+        """Build the block.
+
+        Args:
+            channels (List[int]): Channel sizes; stage `i` maps `channels[i]` to
+                `channels[i + 1]` channels.
+            activation (str): Name handed to `activation_function`.
+
+        """
+        super().__init__()
+        self.channels = channels
+        # NOTE(review): the same activation object is reused by every stage, which
+        # presumably assumes a stateless module such as ReLU - confirm before using
+        # an activation with learnable parameters.
+        self.activation = activation_function(activation)
+        self.block = self._configure_block()
+
+    def _configure_block(self) -> nn.Sequential:
+        """Create the conv/norm/activation stages from `self.channels`."""
+        block = []
+        for in_channels, out_channels in zip(self.channels, self.channels[1:]):
+            block += [
+                nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1),
+                nn.BatchNorm2d(out_channels),
+                self.activation,
+            ]
+
+        return nn.Sequential(*block)
+
+    def forward(self, x: Tensor) -> Tensor:
+        """Apply the convolutional block."""
+        return self.block(x)
+
+
+class DownSamplingBlock(nn.Module):
+    """Basic down sampling block."""
+
+    def __init__(
+        self,
+        channels: List[int],
+        activation: str,
+        pooling_kernel: Union[int, bool] = 2,
+    ) -> None:
+        """Build the block.
+
+        Args:
+            channels (List[int]): Channel sizes forwarded to `ConvBlock`.
+            activation (str): Name of the activation function.
+            pooling_kernel (Union[int, bool]): Kernel size of the max pooling; a
+                falsy value disables the pooling. Defaults to 2.
+
+        """
+        super().__init__()
+        self.conv_block = ConvBlock(channels, activation)
+        self.down_sampling = nn.MaxPool2d(pooling_kernel) if pooling_kernel else None
+
+    def forward(self, x: Tensor) -> Tuple[Optional[Tensor], Tensor]:
+        """Return the down sampled tensor and the convolutional block output.
+
+        The first element is None when the block was built without pooling.
+        """
+        x = self.conv_block(x)
+        x_down = self.down_sampling(x) if self.down_sampling is not None else None
+        return x_down, x
+
+
+class UpSamplingBlock(nn.Module):
+    """The upsampling block of the UNet."""
+
+    def __init__(
+        self, channels: List[int], activation: str, scale_factor: int = 2
+    ) -> None:
+        """Build the block.
+
+        Args:
+            channels (List[int]): Channel sizes forwarded to `ConvBlock`; the first
+                entry is the channel count after the skip concatenation.
+            activation (str): Name of the activation function.
+            scale_factor (int): Spatial upsampling factor. Defaults to 2.
+
+        """
+        super().__init__()
+        self.conv_block = ConvBlock(channels, activation)
+        self.up_sampling = nn.Upsample(
+            scale_factor=scale_factor, mode="bilinear", align_corners=True
+        )
+
+    def forward(self, x: Tensor, x_skip: Optional[Tensor] = None) -> Tensor:
+        """Apply the up sampling and convolutional block.
+
+        Args:
+            x (Tensor): Tensor to upsample.
+            x_skip (Optional[Tensor]): Skip connection, concatenated to the
+                upsampled `x` along dim 1 when given.
+
+        Returns:
+            Tensor: Output of the convolutional block.
+
+        """
+        x = self.up_sampling(x)
+        if x_skip is not None:
+            x = torch.cat((x, x_skip), dim=1)
+        return self.conv_block(x)
+
+
+class UNet(nn.Module):
+    """UNet architecture.
+
+    NOTE(review): the encoder pools with floor division while the decoder scales
+    by a fixed factor, so the input height and width presumably have to be
+    divisible by `pooling_kernel ** (depth - 1)` for the skip connections to
+    line up - confirm against the data pipeline.
+    """
+
+    def __init__(
+        self,
+        in_channels: int = 1,
+        base_channels: int = 64,
+        num_classes: int = 3,
+        depth: int = 4,
+        out_channels: int = 3,
+        activation: str = "relu",
+        pooling_kernel: int = 2,
+        scale_factor: int = 2,
+    ) -> None:
+        """Initialization of the UNet.
+
+        Args:
+            in_channels (int): Number of channels of the input. Defaults to 1.
+            base_channels (int): Channels of the first encoder block, doubled in
+                every following block. Defaults to 64.
+            num_classes (int): Number of channels produced by the 1x1 head.
+                Defaults to 3.
+            depth (int): Number of encoder blocks. Defaults to 4.
+            out_channels (int): Unused; kept so existing callers keep working.
+                Defaults to 3.
+            activation (str): Name of the activation function. Defaults to relu.
+            pooling_kernel (int): Max pooling kernel of the encoder. Defaults to 2.
+            scale_factor (int): Upsampling factor of the decoder. Defaults to 2.
+
+        Raises:
+            ValueError: If `depth` is smaller than 1.
+
+        """
+        super().__init__()
+        if depth < 1:
+            raise ValueError(f"depth must be at least 1, got {depth}.")
+        self.depth = depth
+        # The input channel count leads the list so the first block matches the data.
+        channels = [in_channels] + [base_channels * 2 ** i for i in range(depth)]
+        self.encoder_blocks = self._configure_down_sampling_blocks(
+            channels, activation, pooling_kernel
+        )
+        self.decoder_blocks = self._configure_up_sampling_blocks(
+            channels, activation, scale_factor
+        )
+
+        self.head = nn.Conv2d(base_channels, num_classes, kernel_size=1)
+
+    def _configure_down_sampling_blocks(
+        self, channels: List[int], activation: str, pooling_kernel: int
+    ) -> nn.ModuleList:
+        """Create one encoder block per consecutive pair in `channels`.
+
+        Blocks from index `self.depth - 1` on are built without pooling.
+        """
+        blocks = nn.ModuleList([])
+        for i, (in_channels, out_channels) in enumerate(zip(channels, channels[1:])):
+            blocks.append(
+                DownSamplingBlock(
+                    [in_channels, out_channels, out_channels],
+                    activation,
+                    pooling_kernel if i < self.depth - 1 else False,
+                )
+            )
+
+        return blocks
+
+    def _configure_up_sampling_blocks(
+        self,
+        channels: List[int],
+        activation: str,
+        scale_factor: int,
+    ) -> nn.ModuleList:
+        """Create the decoder blocks from the deepest level to the shallowest.
+
+        Every block consumes the upsampled deeper features concatenated with the
+        skip connection one level up. `channels[0]` gets no decoder block.
+        """
+        # Work on a reversed copy; `list.reverse()` would mutate the caller's list.
+        reversed_channels = channels[::-1]
+        return nn.ModuleList(
+            [
+                UpSamplingBlock(
+                    [deep + shallow, shallow, shallow], activation, scale_factor
+                )
+                for deep, shallow in zip(reversed_channels, reversed_channels[1:-1])
+            ]
+        )
+
+    def encode(self, x: Tensor) -> Tuple[Optional[Tensor], List[Tensor]]:
+        """Run the encoder blocks.
+
+        Returns:
+            Tuple[Optional[Tensor], List[Tensor]]: The down sampled output of the
+                last block (None when that block has no pooling) and the
+                convolutional output of every block, shallowest first.
+
+        """
+        x_skips = []
+        for block in self.encoder_blocks:
+            x, x_skip = block(x)
+            x_skips.append(x_skip)
+        return x, x_skips
+
+    def decode(self, x: Optional[Tensor], x_skips: List[Tensor]) -> Tensor:
+        """Run the decoder blocks on the encoder features.
+
+        Args:
+            x (Optional[Tensor]): Ignored; decoding always starts from
+                `x_skips[-1]`. Kept for interface compatibility.
+            x_skips (List[Tensor]): Encoder features as returned by `encode`,
+                shallowest first.
+
+        Returns:
+            Tensor: Output of the last decoder block.
+
+        """
+        x = x_skips[-1]
+        for i, block in enumerate(self.decoder_blocks):
+            x = block(x, x_skips[-(i + 2)])
+        return x
+
+    def forward(self, x: Tensor) -> Tensor:
+        """Encode and decode the input, then apply the 1x1 head."""
+        x, x_skips = self.encode(x)
+        x = self.decode(x, x_skips)
+        return self.head(x)