Diffstat (limited to 'src/text_recognizer/networks/unet.py')
-rw-r--r--    src/text_recognizer/networks/unet.py    159
1 file changed, 128 insertions, 31 deletions
diff --git a/src/text_recognizer/networks/unet.py b/src/text_recognizer/networks/unet.py
index 51f242a..510910f 100644
--- a/src/text_recognizer/networks/unet.py
+++ b/src/text_recognizer/networks/unet.py
@@ -8,64 +8,118 @@ from torch import Tensor
 
 from text_recognizer.networks.util import activation_function
 
 
-class ConvBlock(nn.Module):
-    """Basic UNet convolutional block."""
+class _ConvBlock(nn.Module):
+    """Modified UNet convolutional block with dilation."""
 
-    def __init__(self, channels: List[int], activation: str) -> None:
+    def __init__(
+        self,
+        channels: List[int],
+        activation: str,
+        num_groups: int,
+        dropout_rate: float = 0.1,
+        kernel_size: int = 3,
+        dilation: int = 1,
+        padding: int = 0,
+    ) -> None:
         super().__init__()
         self.channels = channels
+        self.dropout_rate = dropout_rate
+        self.kernel_size = kernel_size
+        self.dilation = dilation
+        self.padding = padding
+        self.num_groups = num_groups
         self.activation = activation_function(activation)
         self.block = self._configure_block()
+        self.residual_conv = nn.Sequential(
+            nn.Conv2d(
+                self.channels[0], self.channels[-1], kernel_size=3, stride=1, padding=1
+            ),
+            self.activation,
+        )
 
     def _configure_block(self) -> nn.Sequential:
         block = []
         for i in range(len(self.channels) - 1):
             block += [
+                nn.Dropout(p=self.dropout_rate),
+                nn.GroupNorm(self.num_groups, self.channels[i]),
+                self.activation,
                 nn.Conv2d(
-                    self.channels[i], self.channels[i + 1], kernel_size=3, padding=1
+                    self.channels[i],
+                    self.channels[i + 1],
+                    kernel_size=self.kernel_size,
+                    padding=self.padding,
+                    stride=1,
+                    dilation=self.dilation,
                 ),
-                nn.BatchNorm2d(self.channels[i + 1]),
-                self.activation,
             ]
 
         return nn.Sequential(*block)
 
     def forward(self, x: Tensor) -> Tensor:
         """Apply the convolutional block."""
-        return self.block(x)
+        residual = self.residual_conv(x)
+        return self.block(x) + residual
 
 
-class DownSamplingBlock(nn.Module):
+class _DownSamplingBlock(nn.Module):
     """Basic down sampling block."""
 
     def __init__(
         self,
         channels: List[int],
         activation: str,
+        num_groups: int,
         pooling_kernel: Union[int, bool] = 2,
+        dropout_rate: float = 0.1,
+        kernel_size: int = 3,
+        dilation: int = 1,
+        padding: int = 0,
     ) -> None:
         super().__init__()
-        self.conv_block = ConvBlock(channels, activation)
+        self.conv_block = _ConvBlock(
+            channels,
+            activation,
+            num_groups,
+            dropout_rate,
+            kernel_size,
+            dilation,
+            padding,
+        )
         self.down_sampling = nn.MaxPool2d(pooling_kernel) if pooling_kernel else None
 
     def forward(self, x: Tensor) -> Tuple[Tensor, Tensor]:
         """Return the convolutional block output and a down sampled tensor."""
         x = self.conv_block(x)
-        if self.down_sampling is not None:
-            x_down = self.down_sampling(x)
-        else:
-            x_down = None
+        x_down = self.down_sampling(x) if self.down_sampling is not None else x
+
         return x_down, x
 
 
-class UpSamplingBlock(nn.Module):
+class _UpSamplingBlock(nn.Module):
     """The upsampling block of the UNet."""
 
     def __init__(
-        self, channels: List[int], activation: str, scale_factor: int = 2
+        self,
+        channels: List[int],
+        activation: str,
+        num_groups: int,
+        scale_factor: int = 2,
+        dropout_rate: float = 0.1,
+        kernel_size: int = 3,
+        dilation: int = 1,
+        padding: int = 0,
     ) -> None:
         super().__init__()
-        self.conv_block = ConvBlock(channels, activation)
+        self.conv_block = _ConvBlock(
+            channels,
+            activation,
+            num_groups,
+            dropout_rate,
+            kernel_size,
+            dilation,
+            padding,
+        )
         self.up_sampling = nn.Upsample(
             scale_factor=scale_factor, mode="bilinear", align_corners=True
         )
@@ -87,14 +141,43 @@ class UNet(nn.Module):
         base_channels: int = 64,
         num_classes: int = 3,
         depth: int = 4,
-        out_channels: int = 3,
         activation: str = "relu",
+        num_groups: int = 8,
+        dropout_rate: float = 0.1,
         pooling_kernel: int = 2,
         scale_factor: int = 2,
+        kernel_size: Optional[List[int]] = None,
+        dilation: Optional[List[int]] = None,
+        padding: Optional[List[int]] = None,
     ) -> None:
         super().__init__()
         self.depth = depth
-        channels = [1] + [base_channels * 2 ** i for i in range(depth)]
+        self.num_groups = num_groups
+
+        if kernel_size is not None and dilation is not None and padding is not None:
+            if (
+                len(kernel_size) != depth
+                and len(dilation) != depth
+                and len(padding) != depth
+            ):
+                raise RuntimeError(
+                    "Length of convolutional parameters does not match the depth."
+                )
+            self.kernel_size = kernel_size
+            self.padding = padding
+            self.dilation = dilation
+
+        else:
+            self.kernel_size = [3] * depth
+            self.padding = [1] * depth
+            self.dilation = [1] * depth
+
+        self.dropout_rate = dropout_rate
+        self.conv = nn.Conv2d(
+            in_channels, base_channels, kernel_size=3, stride=1, padding=1
+        )
+
+        channels = [base_channels] + [base_channels * 2 ** i for i in range(depth)]
         self.encoder_blocks = self._configure_down_sampling_blocks(
             channels, activation, pooling_kernel
         )
@@ -110,49 +193,63 @@ class UNet(nn.Module):
         blocks = nn.ModuleList([])
         for i in range(len(channels) - 1):
             pooling_kernel = pooling_kernel if i < self.depth - 1 else False
+            dropout_rate = self.dropout_rate if i < 0 else 0
             blocks += [
-                DownSamplingBlock(
+                _DownSamplingBlock(
                     [channels[i], channels[i + 1], channels[i + 1]],
                     activation,
+                    self.num_groups,
                     pooling_kernel,
+                    dropout_rate,
+                    self.kernel_size[i],
+                    self.dilation[i],
+                    self.padding[i],
                 )
             ]
 
         return blocks
 
     def _configure_up_sampling_blocks(
-        self,
-        channels: List[int],
-        activation: str,
-        scale_factor: int,
+        self, channels: List[int], activation: str, scale_factor: int,
     ) -> nn.ModuleList:
         channels.reverse()
+        self.kernel_size.reverse()
+        self.dilation.reverse()
+        self.padding.reverse()
         return nn.ModuleList(
             [
-                UpSamplingBlock(
+                _UpSamplingBlock(
                     [channels[i] + channels[i + 1], channels[i + 1], channels[i + 1]],
                     activation,
+                    self.num_groups,
                     scale_factor,
+                    self.dropout_rate,
+                    self.kernel_size[i],
+                    self.dilation[i],
+                    self.padding[i],
                 )
                 for i in range(len(channels) - 2)
            ]
        )
 
-    def encode(self, x: Tensor) -> Tuple[Tensor, List[Tensor]]:
+    def _encode(self, x: Tensor) -> List[Tensor]:
         x_skips = []
         for block in self.encoder_blocks:
             x, x_skip = block(x)
-            if x_skip is not None:
-                x_skips.append(x_skip)
-        return x, x_skips
+            x_skips.append(x_skip)
+        return x_skips
 
-    def decode(self, x: Tensor, x_skips: List[Tensor]) -> Tensor:
+    def _decode(self, x_skips: List[Tensor]) -> Tensor:
         x = x_skips[-1]
         for i, block in enumerate(self.decoder_blocks):
             x = block(x, x_skips[-(i + 2)])
         return x
 
     def forward(self, x: Tensor) -> Tensor:
-        x, x_skips = self.encode(x)
-        x = self.decode(x, x_skips)
+        """Forward pass with the UNet model."""
+        if len(x.shape) < 4:
+            x = x[(None,) * (4 - len(x.shape))]
+        x = self.conv(x)
+        x_skips = self._encode(x)
+        x = self._decode(x_skips)
         return self.head(x)
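The commit replaces the Conv-BatchNorm-activation ordering with a pre-activation layout (Dropout, GroupNorm, activation, Conv2d) plus a learned 3x3 residual projection, and lets the UNet take per-depth kernel_size, dilation, and padding lists. Below is a minimal usage sketch of the updated class; it is not part of the commit. The import path text_recognizer.networks.unet is inferred from the in-file import of text_recognizer.networks.util, the in_channels argument and the head layer sit outside the hunk context, and all hyperparameter values are illustrative only.

import torch

from text_recognizer.networks.unet import UNet  # assumed import path

depth = 4

# Each (kernel_size, dilation, padding) triple is chosen so convolutions
# preserve spatial size (padding = dilation * (kernel_size - 1) // 2),
# keeping the decoder's skip concatenations aligned with the upsampled
# feature maps.
net = UNet(
    in_channels=1,         # assumed constructor parameter, used by self.conv
    base_channels=64,      # every stage width stays divisible by num_groups
    num_classes=3,
    depth=depth,
    activation="relu",
    num_groups=8,
    dropout_rate=0.1,
    pooling_kernel=2,
    scale_factor=2,
    kernel_size=[3, 3, 3, 3],  # one entry per depth level
    dilation=[1, 1, 2, 2],
    padding=[1, 1, 2, 2],
)

# The encoder max-pools at every level except the deepest, so height and
# width should be divisible by 2 ** (depth - 1) = 8 here.
x = torch.randn(1, 1, 32, 32)
logits = net(x)
print(logits.shape)  # (1, 3, 32, 32), assuming head maps to num_classes channels

Note that _configure_up_sampling_blocks reverses self.kernel_size, self.dilation, and self.padding in place, so the decoder consumes the per-depth settings deepest-first.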