5 files changed, 494 insertions, 81 deletions
diff --git a/src/notebooks/00-testing-stuff-out.ipynb b/src/notebooks/00-testing-stuff-out.ipynb
index 3686dcd..96a0c5a 100644
--- a/src/notebooks/00-testing-stuff-out.ipynb
+++ b/src/notebooks/00-testing-stuff-out.ipynb
@@ -1352,7 +1352,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.7.4"
+   "version": "3.8.2"
   }
  },
  "nbformat": 4,
diff --git a/src/notebooks/05a-UNet.ipynb b/src/notebooks/05a-UNet.ipynb
new file mode 100644
index 0000000..c25865a
--- /dev/null
+++ b/src/notebooks/05a-UNet.ipynb
@@ -0,0 +1,335 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%load_ext autoreload\n",
+    "%autoreload 2\n",
+    "\n",
+    "%matplotlib inline\n",
+    "import matplotlib.pyplot as plt\n",
+    "import numpy as np\n",
+    "from PIL import Image\n",
+    "import torch\n",
+    "from torch import nn\n",
+    "from importlib.util import find_spec\n",
+    "if find_spec(\"text_recognizer\") is None:\n",
+    "    import sys\n",
+    "    sys.path.append('..')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "x = 64\n",
+    "depth = 4\n",
+    "channels = [x * 2 ** i for i in range(4)]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "channels.reverse()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[512, 256, 128, 64]"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "channels"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "m = nn.ModuleList([nn.Conv2d(1,3,2), nn.Linear(1, 5)])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "ModuleAttributeError",
+     "evalue": "'ModuleList' object has no attribute 'reverse'",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mModuleAttributeError\u001b[0m                      Traceback (most recent call last)",
+      "\u001b[0;32m<ipython-input-12-56d7987510bf>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mm\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreverse\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
+      "\u001b[0;32m~/Library/Caches/pypoetry/virtualenvs/text-recognizer-cxOiES-R-py3.8/lib/python3.8/site-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m__getattr__\u001b[0;34m(self, name)\u001b[0m\n\u001b[1;32m    769\u001b[0m             \u001b[0;32mif\u001b[0m \u001b[0mname\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mmodules\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    770\u001b[0m                 \u001b[0;32mreturn\u001b[0m \u001b[0mmodules\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 771\u001b[0;31m         raise ModuleAttributeError(\"'{}' object has no attribute '{}'\".format(\n\u001b[0m\u001b[1;32m    772\u001b[0m             type(self).__name__, name))\n\u001b[1;32m    773\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;31mModuleAttributeError\u001b[0m: 'ModuleList' object has no attribute 'reverse'"
+     ]
+    }
+   ],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 40,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from text_recognizer.networks.unet import UNet"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 99,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "net = UNet()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 100,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "x = torch.rand(1, 1, 256, 256)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 101,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "ModuleList(\n",
+       "  (0): DownSamplingBlock(\n",
+       "    (conv_block): ConvBlock(\n",
+       "      (activation): ReLU(inplace=True)\n",
+       "      (block): Sequential(\n",
+       "        (0): Conv2d(1, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
+       "        (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
+       "        (2): ReLU(inplace=True)\n",
+       "        (3): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
+       "        (4): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
+       "        (5): ReLU(inplace=True)\n",
+       "      )\n",
+       "    )\n",
+       "    (down_sampling): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)\n",
+       "  )\n",
+       "  (1): DownSamplingBlock(\n",
+       "    (conv_block): ConvBlock(\n",
+       "      (activation): ReLU(inplace=True)\n",
+       "      (block): Sequential(\n",
+       "        (0): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
+       "        (1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
+       "        (2): ReLU(inplace=True)\n",
+       "        (3): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
+       "        (4): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
+       "        (5): ReLU(inplace=True)\n",
+       "      )\n",
+       "    )\n",
+       "    (down_sampling): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)\n",
+       "  )\n",
+       "  (2): DownSamplingBlock(\n",
+       "    (conv_block): ConvBlock(\n",
+       "      (activation): ReLU(inplace=True)\n",
+       "      (block): Sequential(\n",
+       "        (0): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
+       "        (1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
+       "        (2): ReLU(inplace=True)\n",
+       "        (3): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
+       "        (4): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
+       "        (5): ReLU(inplace=True)\n",
+       "      )\n",
+       "    )\n",
+       "    (down_sampling): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)\n",
+       "  )\n",
+       "  (3): DownSamplingBlock(\n",
+       "    (conv_block): ConvBlock(\n",
+       "      (activation): ReLU(inplace=True)\n",
+       "      (block): Sequential(\n",
+       "        (0): Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
+       "        (1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
+       "        (2): ReLU(inplace=True)\n",
+       "        (3): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
+       "        (4): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
+       "        (5): ReLU(inplace=True)\n",
+       "      )\n",
+       "    )\n",
+       "  )\n",
+       ")"
+      ]
+     },
+     "execution_count": 101,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "net.encoder_blocks"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 102,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "ModuleList(\n",
+       "  (0): UpSamplingBlock(\n",
+       "    (conv_block): ConvBlock(\n",
+       "      (activation): ReLU(inplace=True)\n",
+       "      (block): Sequential(\n",
+       "        (0): Conv2d(768, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
+       "        (1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
+       "        (2): ReLU(inplace=True)\n",
+       "        (3): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
+       "        (4): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
+       "        (5): ReLU(inplace=True)\n",
+       "      )\n",
+       "    )\n",
+       "    (up_sampling): Upsample(scale_factor=2.0, mode=bilinear)\n",
+       "  )\n",
+       "  (1): UpSamplingBlock(\n",
+       "    (conv_block): ConvBlock(\n",
+       "      (activation): ReLU(inplace=True)\n",
+       "      (block): Sequential(\n",
+       "        (0): Conv2d(384, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
+       "        (1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
+       "        (2): ReLU(inplace=True)\n",
+       "        (3): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
+       "        (4): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
+       "        (5): ReLU(inplace=True)\n",
+       "      )\n",
+       "    )\n",
+       "    (up_sampling): Upsample(scale_factor=2.0, mode=bilinear)\n",
+       "  )\n",
+       "  (2): UpSamplingBlock(\n",
+       "    (conv_block): ConvBlock(\n",
+       "      (activation): ReLU(inplace=True)\n",
+       "      (block): Sequential(\n",
+       "        (0): Conv2d(192, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
+       "        (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
+       "        (2): ReLU(inplace=True)\n",
+       "        (3): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
+       "        (4): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
+       "        (5): ReLU(inplace=True)\n",
+       "      )\n",
+       "    )\n",
+       "    (up_sampling): Upsample(scale_factor=2.0, mode=bilinear)\n",
+       "  )\n",
+       ")"
+      ]
+     },
+     "execution_count": 102,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "net.decoder_blocks"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 104,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Conv2d(64, 3, kernel_size=(1, 1), stride=(1, 1))"
+      ]
+     },
+     "execution_count": 104,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "net.head"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 103,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "torch.Size([1, 3, 256, 256])"
+      ]
+     },
+     "execution_count": 103,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "net(x).shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.2"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/src/text_recognizer/networks/__init__.py b/src/text_recognizer/networks/__init__.py
index 078d771..1635039 100644
--- a/src/text_recognizer/networks/__init__.py
+++ b/src/text_recognizer/networks/__init__.py
@@ -7,7 +7,6 @@ from .lenet import LeNet
 from .metrics import accuracy, accuracy_ignore_pad, cer, wer
 from .mlp import MLP
 from .residual_network import ResidualNetwork, ResidualNetworkEncoder
-from .sparse_mlp import SparseMLP
 from .transformer import Transformer
 from .util import sliding_window
 from .wide_resnet import WideResidualNetwork
@@ -26,7 +25,6 @@ __all__ = [
     "ResidualNetworkEncoder",
     "sliding_window",
     "Transformer",
-    "SparseMLP",
     "wer",
     "WideResidualNetwork",
 ]
diff --git a/src/text_recognizer/networks/sparse_mlp.py b/src/text_recognizer/networks/sparse_mlp.py
deleted file mode 100644
index 53cf166..0000000
--- a/src/text_recognizer/networks/sparse_mlp.py
+++ /dev/null
@@ -1,78 +0,0 @@
-"""Defines the Sparse MLP network."""
-from typing import Callable, Dict, List, Optional, Union
-import warnings
-
-from einops.layers.torch import Rearrange
-from pytorch_block_sparse import BlockSparseLinear
-import torch
-from torch import nn
-
-from text_recognizer.networks.util import activation_function
-
-warnings.filterwarnings("ignore", category=DeprecationWarning)
-
-
-class SparseMLP(nn.Module):
-    """Sparse multi layered perceptron network."""
-
-    def __init__(
-        self,
-        input_size: int = 784,
-        num_classes: int = 10,
-        hidden_size: Union[int, List] = 128,
-        num_layers: int = 3,
-        density: float = 0.1,
-        activation_fn: str = "relu",
-    ) -> None:
-        """Initialization of the MLP network.
-
-        Args:
-            input_size (int): The input shape of the network. Defaults to 784.
-            num_classes (int): Number of classes in the dataset. Defaults to 10.
-            hidden_size (Union[int, List]): The number of `neurons` in each hidden layer. Defaults to 128.
-            num_layers (int): The number of hidden layers. Defaults to 3.
-            density (float): The density of activation at each layer. Default to 0.1.
-            activation_fn (str): Name of the activation function in the hidden layers. Defaults to
-                relu.
-
-        """
-        super().__init__()
-
-        activation_fn = activation_function(activation_fn)
-
-        if isinstance(hidden_size, int):
-            hidden_size = [hidden_size] * num_layers
-
-        self.layers = [
-            Rearrange("b c h w -> b (c h w)"),
-            nn.Linear(in_features=input_size, out_features=hidden_size[0]),
-            activation_fn,
-        ]
-
-        for i in range(num_layers - 1):
-            self.layers += [
-                BlockSparseLinear(
-                    in_features=hidden_size[i],
-                    out_features=hidden_size[i + 1],
-                    density=density,
-                ),
-                activation_fn,
-            ]
-
-        self.layers.append(
-            nn.Linear(in_features=hidden_size[-1], out_features=num_classes)
-        )
-
-        self.layers = nn.Sequential(*self.layers)
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        """The feedforward pass."""
-        # If batch dimenstion is missing, it needs to be added.
-        if len(x.shape) < 4:
-            x = x[(None,) * (4 - len(x.shape))]
-        return self.layers(x)
-
-    @property
-    def __name__(self) -> str:
-        """Returns the name of the network."""
-        return "mlp"
diff --git a/src/text_recognizer/networks/unet.py b/src/text_recognizer/networks/unet.py
new file mode 100644
index 0000000..51f242a
--- /dev/null
+++ b/src/text_recognizer/networks/unet.py
@@ -0,0 +1,158 @@
+"""UNet for segmentation."""
+from typing import List, Optional, Tuple, Union
+
+import torch
+from torch import nn
+from torch import Tensor
+
+from text_recognizer.networks.util import activation_function
+
+
+class ConvBlock(nn.Module):
+    """Basic UNet convolutional block."""
+
+    def __init__(self, channels: List[int], activation: str) -> None:
+        super().__init__()
+        self.channels = channels
+        self.activation = activation_function(activation)
+        self.block = self._configure_block()
+
+    def _configure_block(self) -> nn.Sequential:
+        block = []
+        for i in range(len(self.channels) - 1):
+            block += [
+                nn.Conv2d(
+                    self.channels[i], self.channels[i + 1], kernel_size=3, padding=1
+                ),
+                nn.BatchNorm2d(self.channels[i + 1]),
+                self.activation,
+            ]
+
+        return nn.Sequential(*block)
+
+    def forward(self, x: Tensor) -> Tensor:
+        """Apply the convolutional block."""
+        return self.block(x)
+
+
+class DownSamplingBlock(nn.Module):
+    """Basic down sampling block."""
+
+    def __init__(
+        self,
+        channels: List[int],
+        activation: str,
+        pooling_kernel: Union[int, bool] = 2,
+    ) -> None:
+        super().__init__()
+        self.conv_block = ConvBlock(channels, activation)
+        self.down_sampling = nn.MaxPool2d(pooling_kernel) if pooling_kernel else None
+
+    def forward(self, x: Tensor) -> Tuple[Tensor, Tensor]:
+        """Return the convolutional block output and a down sampled tensor."""
+        x = self.conv_block(x)
+        if self.down_sampling is not None:
+            x_down = self.down_sampling(x)
+        else:
+            x_down = None
+        return x_down, x
+
+
+class UpSamplingBlock(nn.Module):
+    """The upsampling block of the UNet."""
+
+    def __init__(
+        self, channels: List[int], activation: str, scale_factor: int = 2
+    ) -> None:
+        super().__init__()
+        self.conv_block = ConvBlock(channels, activation)
+        self.up_sampling = nn.Upsample(
+            scale_factor=scale_factor, mode="bilinear", align_corners=True
+        )
+
+    def forward(self, x: Tensor, x_skip: Optional[Tensor] = None) -> Tensor:
+        """Apply the up sampling and convolutional block."""
+        x = self.up_sampling(x)
+        if x_skip is not None:
+            x = torch.cat((x, x_skip), dim=1)
+        return self.conv_block(x)
+
+
+class UNet(nn.Module):
+    """UNet architecture."""
+
+    def __init__(
+        self,
+        in_channels: int = 1,
+        base_channels: int = 64,
+        num_classes: int = 3,
+        depth: int = 4,
+        out_channels: int = 3,
+        activation: str = "relu",
+        pooling_kernel: int = 2,
+        scale_factor: int = 2,
+    ) -> None:
+        super().__init__()
+        self.depth = depth
+        channels = [1] + [base_channels * 2 ** i for i in range(depth)]
+        self.encoder_blocks = self._configure_down_sampling_blocks(
+            channels, activation, pooling_kernel
+        )
+        self.decoder_blocks = self._configure_up_sampling_blocks(
+            channels, activation, scale_factor
+        )
+
+        self.head = nn.Conv2d(base_channels, num_classes, kernel_size=1)
+
+    def _configure_down_sampling_blocks(
+        self, channels: List[int], activation: str, pooling_kernel: int
+    ) -> nn.ModuleList:
+        blocks = nn.ModuleList([])
+        for i in range(len(channels) - 1):
+            pooling_kernel = pooling_kernel if i < self.depth - 1 else False
+            blocks += [
+                DownSamplingBlock(
+                    [channels[i], channels[i + 1], channels[i + 1]],
+                    activation,
+                    pooling_kernel,
+                )
+            ]
+
+        return blocks
+
+    def _configure_up_sampling_blocks(
+        self,
+        channels: List[int],
+        activation: str,
+        scale_factor: int,
+    ) -> nn.ModuleList:
+        channels.reverse()
+        return nn.ModuleList(
+            [
+                UpSamplingBlock(
+                    [channels[i] + channels[i + 1], channels[i + 1], channels[i + 1]],
+                    activation,
+                    scale_factor,
+                )
+                for i in range(len(channels) - 2)
+            ]
+        )
+
+    def encode(self, x: Tensor) -> Tuple[Tensor, List[Tensor]]:
+        x_skips = []
+        for block in self.encoder_blocks:
+            x, x_skip = block(x)
+            if x_skip is not None:
+                x_skips.append(x_skip)
+        return x, x_skips
+
+    def decode(self, x: Tensor, x_skips: List[Tensor]) -> Tensor:
+        x = x_skips[-1]
+        for i, block in enumerate(self.decoder_blocks):
+            x = block(x, x_skips[-(i + 2)])
+        return x
+
+    def forward(self, x: Tensor) -> Tensor:
+        x, x_skips = self.encode(x)
+        x = self.decode(x, x_skips)
+        return self.head(x)