summary | refs | log | tree | commit | diff
path: root/src/text_recognizer/networks/unet.py
diff options
context:
space:
mode:
Diffstat (limited to 'src/text_recognizer/networks/unet.py')
-rw-r--r-- src/text_recognizer/networks/unet.py 159
1 files changed, 128 insertions, 31 deletions
diff --git a/src/text_recognizer/networks/unet.py b/src/text_recognizer/networks/unet.py
index 51f242a..510910f 100644
--- a/src/text_recognizer/networks/unet.py
+++ b/src/text_recognizer/networks/unet.py
@@ -8,64 +8,118 @@ from torch import Tensor
from text_recognizer.networks.util import activation_function
-class ConvBlock(nn.Module):
- """Basic UNet convolutional block."""
+class _ConvBlock(nn.Module):
+ """Modified UNet convolutional block with dilation."""
- def __init__(self, channels: List[int], activation: str) -> None:
+ def __init__(
+ self,
+ channels: List[int],
+ activation: str,
+ num_groups: int,
+ dropout_rate: float = 0.1,
+ kernel_size: int = 3,
+ dilation: int = 1,
+ padding: int = 0,
+ ) -> None:
super().__init__()
self.channels = channels
+ self.dropout_rate = dropout_rate
+ self.kernel_size = kernel_size
+ self.dilation = dilation
+ self.padding = padding
+ self.num_groups = num_groups
self.activation = activation_function(activation)
self.block = self._configure_block()
+ self.residual_conv = nn.Sequential(
+ nn.Conv2d(
+ self.channels[0], self.channels[-1], kernel_size=3, stride=1, padding=1
+ ),
+ self.activation,
+ )
def _configure_block(self) -> nn.Sequential:
block = []
for i in range(len(self.channels) - 1):
block += [
+ nn.Dropout(p=self.dropout_rate),
+ nn.GroupNorm(self.num_groups, self.channels[i]),
+ self.activation,
nn.Conv2d(
- self.channels[i], self.channels[i + 1], kernel_size=3, padding=1
+ self.channels[i],
+ self.channels[i + 1],
+ kernel_size=self.kernel_size,
+ padding=self.padding,
+ stride=1,
+ dilation=self.dilation,
),
- nn.BatchNorm2d(self.channels[i + 1]),
- self.activation,
]
return nn.Sequential(*block)
def forward(self, x: Tensor) -> Tensor:
"""Apply the convolutional block."""
- return self.block(x)
+ residual = self.residual_conv(x)
+ return self.block(x) + residual
-class DownSamplingBlock(nn.Module):
+class _DownSamplingBlock(nn.Module):
"""Basic down sampling block."""
def __init__(
self,
channels: List[int],
activation: str,
+ num_groups: int,
pooling_kernel: Union[int, bool] = 2,
+ dropout_rate: float = 0.1,
+ kernel_size: int = 3,
+ dilation: int = 1,
+ padding: int = 0,
) -> None:
super().__init__()
- self.conv_block = ConvBlock(channels, activation)
+ self.conv_block = _ConvBlock(
+ channels,
+ activation,
+ num_groups,
+ dropout_rate,
+ kernel_size,
+ dilation,
+ padding,
+ )
self.down_sampling = nn.MaxPool2d(pooling_kernel) if pooling_kernel else None
def forward(self, x: Tensor) -> Tuple[Tensor, Tensor]:
"""Return the convolutional block output and a down sampled tensor."""
x = self.conv_block(x)
- if self.down_sampling is not None:
- x_down = self.down_sampling(x)
- else:
- x_down = None
+ x_down = self.down_sampling(x) if self.down_sampling is not None else x
+
return x_down, x
-class UpSamplingBlock(nn.Module):
+class _UpSamplingBlock(nn.Module):
"""The upsampling block of the UNet."""
def __init__(
- self, channels: List[int], activation: str, scale_factor: int = 2
+ self,
+ channels: List[int],
+ activation: str,
+ num_groups: int,
+ scale_factor: int = 2,
+ dropout_rate: float = 0.1,
+ kernel_size: int = 3,
+ dilation: int = 1,
+ padding: int = 0,
) -> None:
super().__init__()
- self.conv_block = ConvBlock(channels, activation)
+ self.conv_block = _ConvBlock(
+ channels,
+ activation,
+ num_groups,
+ dropout_rate,
+ kernel_size,
+ dilation,
+ padding,
+ )
self.up_sampling = nn.Upsample(
scale_factor=scale_factor, mode="bilinear", align_corners=True
)
@@ -87,14 +141,43 @@ class UNet(nn.Module):
base_channels: int = 64,
num_classes: int = 3,
depth: int = 4,
- out_channels: int = 3,
activation: str = "relu",
+ num_groups: int = 8,
+ dropout_rate: float = 0.1,
pooling_kernel: int = 2,
scale_factor: int = 2,
+ kernel_size: Optional[List[int]] = None,
+ dilation: Optional[List[int]] = None,
+ padding: Optional[List[int]] = None,
) -> None:
super().__init__()
self.depth = depth
- channels = [1] + [base_channels * 2 ** i for i in range(depth)]
+ self.num_groups = num_groups
+
+ if kernel_size is not None and dilation is not None and padding is not None:
+ if (
+ len(kernel_size) != depth
+ or len(dilation) != depth
+ or len(padding) != depth
+ ):
+ raise RuntimeError(
+ "Length of convolutional parameters does not match the depth."
+ )
+ self.kernel_size = kernel_size
+ self.padding = padding
+ self.dilation = dilation
+
+ else:
+ self.kernel_size = [3] * depth
+ self.padding = [1] * depth
+ self.dilation = [1] * depth
+
+ self.dropout_rate = dropout_rate
+ self.conv = nn.Conv2d(
+ in_channels, base_channels, kernel_size=3, stride=1, padding=1
+ )
+
+ channels = [base_channels] + [base_channels * 2 ** i for i in range(depth)]
self.encoder_blocks = self._configure_down_sampling_blocks(
channels, activation, pooling_kernel
)
@@ -110,49 +193,63 @@ class UNet(nn.Module):
blocks = nn.ModuleList([])
for i in range(len(channels) - 1):
pooling_kernel = pooling_kernel if i < self.depth - 1 else False
+ dropout_rate = self.dropout_rate if i > 0 else 0
blocks += [
- DownSamplingBlock(
+ _DownSamplingBlock(
[channels[i], channels[i + 1], channels[i + 1]],
activation,
+ self.num_groups,
pooling_kernel,
+ dropout_rate,
+ self.kernel_size[i],
+ self.dilation[i],
+ self.padding[i],
)
]
return blocks
def _configure_up_sampling_blocks(
- self,
- channels: List[int],
- activation: str,
- scale_factor: int,
+ self, channels: List[int], activation: str, scale_factor: int,
) -> nn.ModuleList:
channels.reverse()
+ self.kernel_size = self.kernel_size[::-1]
+ self.dilation = self.dilation[::-1]
+ self.padding = self.padding[::-1]
return nn.ModuleList(
[
- UpSamplingBlock(
+ _UpSamplingBlock(
[channels[i] + channels[i + 1], channels[i + 1], channels[i + 1]],
activation,
+ self.num_groups,
scale_factor,
+ self.dropout_rate,
+ self.kernel_size[i],
+ self.dilation[i],
+ self.padding[i],
)
for i in range(len(channels) - 2)
]
)
- def encode(self, x: Tensor) -> Tuple[Tensor, List[Tensor]]:
+ def _encode(self, x: Tensor) -> List[Tensor]:
x_skips = []
for block in self.encoder_blocks:
x, x_skip = block(x)
- if x_skip is not None:
- x_skips.append(x_skip)
- return x, x_skips
+ x_skips.append(x_skip)
+ return x_skips
- def decode(self, x: Tensor, x_skips: List[Tensor]) -> Tensor:
+ def _decode(self, x_skips: List[Tensor]) -> Tensor:
x = x_skips[-1]
for i, block in enumerate(self.decoder_blocks):
x = block(x, x_skips[-(i + 2)])
return x
def forward(self, x: Tensor) -> Tensor:
- x, x_skips = self.encode(x)
- x = self.decode(x, x_skips)
+ """Forward pass with the UNet model."""
+ if len(x.shape) < 4:
+ x = x[(None,) * (4 - len(x.shape))]
+ x = self.conv(x)
+ x_skips = self._encode(x)
+ x = self._decode(x_skips)
return self.head(x)