summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/notebooks/00-testing-stuff-out.ipynb2
-rw-r--r--src/notebooks/05a-UNet.ipynb335
-rw-r--r--src/text_recognizer/networks/__init__.py2
-rw-r--r--src/text_recognizer/networks/sparse_mlp.py78
-rw-r--r--src/text_recognizer/networks/unet.py64
5 files changed, 380 insertions, 101 deletions
diff --git a/src/notebooks/00-testing-stuff-out.ipynb b/src/notebooks/00-testing-stuff-out.ipynb
index 3686dcd..96a0c5a 100644
--- a/src/notebooks/00-testing-stuff-out.ipynb
+++ b/src/notebooks/00-testing-stuff-out.ipynb
@@ -1352,7 +1352,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.7.4"
+ "version": "3.8.2"
}
},
"nbformat": 4,
diff --git a/src/notebooks/05a-UNet.ipynb b/src/notebooks/05a-UNet.ipynb
new file mode 100644
index 0000000..c25865a
--- /dev/null
+++ b/src/notebooks/05a-UNet.ipynb
@@ -0,0 +1,335 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%load_ext autoreload\n",
+ "%autoreload 2\n",
+ "\n",
+ "%matplotlib inline\n",
+ "import matplotlib.pyplot as plt\n",
+ "import numpy as np\n",
+ "from PIL import Image\n",
+ "import torch\n",
+ "from torch import nn\n",
+ "from importlib.util import find_spec\n",
+ "if find_spec(\"text_recognizer\") is None:\n",
+ " import sys\n",
+ " sys.path.append('..')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "x = 64\n",
+ "depth = 4\n",
+ "channels = [x * 2 ** i for i in range(4)]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "channels.reverse()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[512, 256, 128, 64]"
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "channels"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "m = nn.ModuleList([nn.Conv2d(1,3,2), nn.Linear(1, 5)])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "ename": "ModuleAttributeError",
+ "evalue": "'ModuleList' object has no attribute 'reverse'",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[0;31mModuleAttributeError\u001b[0m Traceback (most recent call last)",
+ "\u001b[0;32m<ipython-input-12-56d7987510bf>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mm\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreverse\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
+ "\u001b[0;32m~/Library/Caches/pypoetry/virtualenvs/text-recognizer-cxOiES-R-py3.8/lib/python3.8/site-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m__getattr__\u001b[0;34m(self, name)\u001b[0m\n\u001b[1;32m 769\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mname\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mmodules\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 770\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mmodules\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 771\u001b[0;31m raise ModuleAttributeError(\"'{}' object has no attribute '{}'\".format(\n\u001b[0m\u001b[1;32m 772\u001b[0m type(self).__name__, name))\n\u001b[1;32m 773\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;31mModuleAttributeError\u001b[0m: 'ModuleList' object has no attribute 'reverse'"
+ ]
+ }
+ ],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 40,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from text_recognizer.networks.unet import UNet"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 99,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "net = UNet()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 100,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "x = torch.rand(1, 1, 256, 256)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 101,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "ModuleList(\n",
+ " (0): DownSamplingBlock(\n",
+ " (conv_block): ConvBlock(\n",
+ " (activation): ReLU(inplace=True)\n",
+ " (block): Sequential(\n",
+ " (0): Conv2d(1, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
+ " (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
+ " (2): ReLU(inplace=True)\n",
+ " (3): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
+ " (4): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
+ " (5): ReLU(inplace=True)\n",
+ " )\n",
+ " )\n",
+ " (down_sampling): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)\n",
+ " )\n",
+ " (1): DownSamplingBlock(\n",
+ " (conv_block): ConvBlock(\n",
+ " (activation): ReLU(inplace=True)\n",
+ " (block): Sequential(\n",
+ " (0): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
+ " (1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
+ " (2): ReLU(inplace=True)\n",
+ " (3): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
+ " (4): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
+ " (5): ReLU(inplace=True)\n",
+ " )\n",
+ " )\n",
+ " (down_sampling): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)\n",
+ " )\n",
+ " (2): DownSamplingBlock(\n",
+ " (conv_block): ConvBlock(\n",
+ " (activation): ReLU(inplace=True)\n",
+ " (block): Sequential(\n",
+ " (0): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
+ " (1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
+ " (2): ReLU(inplace=True)\n",
+ " (3): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
+ " (4): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
+ " (5): ReLU(inplace=True)\n",
+ " )\n",
+ " )\n",
+ " (down_sampling): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)\n",
+ " )\n",
+ " (3): DownSamplingBlock(\n",
+ " (conv_block): ConvBlock(\n",
+ " (activation): ReLU(inplace=True)\n",
+ " (block): Sequential(\n",
+ " (0): Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
+ " (1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
+ " (2): ReLU(inplace=True)\n",
+ " (3): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
+ " (4): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
+ " (5): ReLU(inplace=True)\n",
+ " )\n",
+ " )\n",
+ " )\n",
+ ")"
+ ]
+ },
+ "execution_count": 101,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "net.encoder_blocks"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 102,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "ModuleList(\n",
+ " (0): UpSamplingBlock(\n",
+ " (conv_block): ConvBlock(\n",
+ " (activation): ReLU(inplace=True)\n",
+ " (block): Sequential(\n",
+ " (0): Conv2d(768, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
+ " (1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
+ " (2): ReLU(inplace=True)\n",
+ " (3): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
+ " (4): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
+ " (5): ReLU(inplace=True)\n",
+ " )\n",
+ " )\n",
+ " (up_sampling): Upsample(scale_factor=2.0, mode=bilinear)\n",
+ " )\n",
+ " (1): UpSamplingBlock(\n",
+ " (conv_block): ConvBlock(\n",
+ " (activation): ReLU(inplace=True)\n",
+ " (block): Sequential(\n",
+ " (0): Conv2d(384, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
+ " (1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
+ " (2): ReLU(inplace=True)\n",
+ " (3): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
+ " (4): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
+ " (5): ReLU(inplace=True)\n",
+ " )\n",
+ " )\n",
+ " (up_sampling): Upsample(scale_factor=2.0, mode=bilinear)\n",
+ " )\n",
+ " (2): UpSamplingBlock(\n",
+ " (conv_block): ConvBlock(\n",
+ " (activation): ReLU(inplace=True)\n",
+ " (block): Sequential(\n",
+ " (0): Conv2d(192, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
+ " (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
+ " (2): ReLU(inplace=True)\n",
+ " (3): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
+ " (4): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
+ " (5): ReLU(inplace=True)\n",
+ " )\n",
+ " )\n",
+ " (up_sampling): Upsample(scale_factor=2.0, mode=bilinear)\n",
+ " )\n",
+ ")"
+ ]
+ },
+ "execution_count": 102,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "net.decoder_blocks"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 104,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Conv2d(64, 3, kernel_size=(1, 1), stride=(1, 1))"
+ ]
+ },
+ "execution_count": 104,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "net.head"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 103,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "torch.Size([1, 3, 256, 256])"
+ ]
+ },
+ "execution_count": 103,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "net(x).shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.2"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/src/text_recognizer/networks/__init__.py b/src/text_recognizer/networks/__init__.py
index 2cc1137..67e245c 100644
--- a/src/text_recognizer/networks/__init__.py
+++ b/src/text_recognizer/networks/__init__.py
@@ -6,7 +6,6 @@ from .densenet import DenseNet
from .lenet import LeNet
from .mlp import MLP
from .residual_network import ResidualNetwork, ResidualNetworkEncoder
-from .sparse_mlp import SparseMLP
from .transformer import Transformer
from .util import sliding_window
from .wide_resnet import WideResidualNetwork
@@ -22,6 +21,5 @@ __all__ = [
"ResidualNetworkEncoder",
"sliding_window",
"Transformer",
- "SparseMLP",
"WideResidualNetwork",
]
diff --git a/src/text_recognizer/networks/sparse_mlp.py b/src/text_recognizer/networks/sparse_mlp.py
deleted file mode 100644
index 53cf166..0000000
--- a/src/text_recognizer/networks/sparse_mlp.py
+++ /dev/null
@@ -1,78 +0,0 @@
-"""Defines the Sparse MLP network."""
-from typing import Callable, Dict, List, Optional, Union
-import warnings
-
-from einops.layers.torch import Rearrange
-from pytorch_block_sparse import BlockSparseLinear
-import torch
-from torch import nn
-
-from text_recognizer.networks.util import activation_function
-
-warnings.filterwarnings("ignore", category=DeprecationWarning)
-
-
-class SparseMLP(nn.Module):
- """Sparse multi layered perceptron network."""
-
- def __init__(
- self,
- input_size: int = 784,
- num_classes: int = 10,
- hidden_size: Union[int, List] = 128,
- num_layers: int = 3,
- density: float = 0.1,
- activation_fn: str = "relu",
- ) -> None:
- """Initialization of the MLP network.
-
- Args:
- input_size (int): The input shape of the network. Defaults to 784.
- num_classes (int): Number of classes in the dataset. Defaults to 10.
- hidden_size (Union[int, List]): The number of `neurons` in each hidden layer. Defaults to 128.
- num_layers (int): The number of hidden layers. Defaults to 3.
- density (float): The density of activation at each layer. Default to 0.1.
- activation_fn (str): Name of the activation function in the hidden layers. Defaults to
- relu.
-
- """
- super().__init__()
-
- activation_fn = activation_function(activation_fn)
-
- if isinstance(hidden_size, int):
- hidden_size = [hidden_size] * num_layers
-
- self.layers = [
- Rearrange("b c h w -> b (c h w)"),
- nn.Linear(in_features=input_size, out_features=hidden_size[0]),
- activation_fn,
- ]
-
- for i in range(num_layers - 1):
- self.layers += [
- BlockSparseLinear(
- in_features=hidden_size[i],
- out_features=hidden_size[i + 1],
- density=density,
- ),
- activation_fn,
- ]
-
- self.layers.append(
- nn.Linear(in_features=hidden_size[-1], out_features=num_classes)
- )
-
- self.layers = nn.Sequential(*self.layers)
-
- def forward(self, x: torch.Tensor) -> torch.Tensor:
- """The feedforward pass."""
- # If batch dimenstion is missing, it needs to be added.
- if len(x.shape) < 4:
- x = x[(None,) * (4 - len(x.shape))]
- return self.layers(x)
-
- @property
- def __name__(self) -> str:
- """Returns the name of the network."""
- return "mlp"
diff --git a/src/text_recognizer/networks/unet.py b/src/text_recognizer/networks/unet.py
index eb4188b..51f242a 100644
--- a/src/text_recognizer/networks/unet.py
+++ b/src/text_recognizer/networks/unet.py
@@ -1,5 +1,5 @@
"""UNet for segmentation."""
-from typing import List, Tuple
+from typing import List, Optional, Tuple, Union
import torch
from torch import nn
@@ -39,16 +39,23 @@ class DownSamplingBlock(nn.Module):
"""Basic down sampling block."""
def __init__(
- self, channels: List[int], activation: str, pooling_kernel: int = 2
+ self,
+ channels: List[int],
+ activation: str,
+ pooling_kernel: Union[int, bool] = 2,
) -> None:
super().__init__()
self.conv_block = ConvBlock(channels, activation)
- self.down_sampling = nn.MaxPool2d(pooling_kernel)
+ self.down_sampling = nn.MaxPool2d(pooling_kernel) if pooling_kernel else None
def forward(self, x: Tensor) -> Tuple[Tensor, Tensor]:
"""Return the convolutional block output and a down sampled tensor."""
x = self.conv_block(x)
- return self.down_sampling(x), x
+ if self.down_sampling is not None:
+ x_down = self.down_sampling(x)
+ else:
+ x_down = None
+ return x_down, x
class UpSamplingBlock(nn.Module):
@@ -63,10 +70,11 @@ class UpSamplingBlock(nn.Module):
scale_factor=scale_factor, mode="bilinear", align_corners=True
)
- def forward(self, x: Tensor, x_skip: Tensor) -> Tensor:
+ def forward(self, x: Tensor, x_skip: Optional[Tensor] = None) -> Tensor:
"""Apply the up sampling and convolutional block."""
x = self.up_sampling(x)
- x = torch.cat((x, x_skip), dim=1)
+ if x_skip is not None:
+ x = torch.cat((x, x_skip), dim=1)
return self.conv_block(x)
@@ -77,6 +85,7 @@ class UNet(nn.Module):
self,
in_channels: int = 1,
base_channels: int = 64,
+ num_classes: int = 3,
depth: int = 4,
out_channels: int = 3,
activation: str = "relu",
@@ -84,27 +93,32 @@ class UNet(nn.Module):
scale_factor: int = 2,
) -> None:
super().__init__()
- channels = [base_channels * 2 ** i for i in range(depth)]
- self.down_sampling_blocks = self._configure_down_sampling_blocks(
+ self.depth = depth
+ channels = [1] + [base_channels * 2 ** i for i in range(depth)]
+ self.encoder_blocks = self._configure_down_sampling_blocks(
channels, activation, pooling_kernel
)
- self.up_sampling_blocks = self._configure_up_sampling_blocks(
+ self.decoder_blocks = self._configure_up_sampling_blocks(
channels, activation, scale_factor
)
+ self.head = nn.Conv2d(base_channels, num_classes, kernel_size=1)
+
def _configure_down_sampling_blocks(
self, channels: List[int], activation: str, pooling_kernel: int
) -> nn.ModuleList:
- return nn.ModuleList(
- [
+ blocks = nn.ModuleList([])
+ for i in range(len(channels) - 1):
+ pooling_kernel = pooling_kernel if i < self.depth - 1 else False
+ blocks += [
DownSamplingBlock(
[channels[i], channels[i + 1], channels[i + 1]],
activation,
pooling_kernel,
)
- for i in range(len(channels))
]
- )
+
+ return blocks
def _configure_up_sampling_blocks(
self,
@@ -112,23 +126,33 @@ class UNet(nn.Module):
activation: str,
scale_factor: int,
) -> nn.ModuleList:
+ channels.reverse()
return nn.ModuleList(
[
UpSamplingBlock(
- [channels[i], channels[i + 1], channels[i + 1]],
+ [channels[i] + channels[i + 1], channels[i + 1], channels[i + 1]],
activation,
scale_factor,
)
+ for i in range(len(channels) - 2)
]
- for i in range(len(channels))
)
- def down_sampling(self, x: Tensor) -> List[Tensor]:
+ def encode(self, x: Tensor) -> Tuple[Tensor, List[Tensor]]:
x_skips = []
- for block in self.down_sampling_blocks:
+ for block in self.encoder_blocks:
x, x_skip = block(x)
- x_skips.append(x_skip)
+ if x_skip is not None:
+ x_skips.append(x_skip)
return x, x_skips
- def up_sampling(self, x: Tensor, x_skips: List[Tensor]) -> Tensor:
- pass
+ def decode(self, x: Tensor, x_skips: List[Tensor]) -> Tensor:
+ x = x_skips[-1]
+ for i, block in enumerate(self.decoder_blocks):
+ x = block(x, x_skips[-(i + 2)])
+ return x
+
+ def forward(self, x: Tensor) -> Tensor:
+ x, x_skips = self.encode(x)
+ x = self.decode(x, x_skips)
+ return self.head(x)