From 73ae250d7993fa48eccff4042ecd6bf768650bf3 Mon Sep 17 00:00:00 2001 From: aktersnurra Date: Wed, 18 Nov 2020 23:35:35 +0100 Subject: UNet implemented. --- src/notebooks/00-testing-stuff-out.ipynb | 2 +- src/notebooks/05a-UNet.ipynb | 335 +++++++++++++++++++++++++++++ src/text_recognizer/networks/__init__.py | 2 - src/text_recognizer/networks/sparse_mlp.py | 78 ------- src/text_recognizer/networks/unet.py | 64 ++++-- 5 files changed, 380 insertions(+), 101 deletions(-) create mode 100644 src/notebooks/05a-UNet.ipynb delete mode 100644 src/text_recognizer/networks/sparse_mlp.py (limited to 'src') diff --git a/src/notebooks/00-testing-stuff-out.ipynb b/src/notebooks/00-testing-stuff-out.ipynb index 3686dcd..96a0c5a 100644 --- a/src/notebooks/00-testing-stuff-out.ipynb +++ b/src/notebooks/00-testing-stuff-out.ipynb @@ -1352,7 +1352,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.4" + "version": "3.8.2" } }, "nbformat": 4, diff --git a/src/notebooks/05a-UNet.ipynb b/src/notebooks/05a-UNet.ipynb new file mode 100644 index 0000000..c25865a --- /dev/null +++ b/src/notebooks/05a-UNet.ipynb @@ -0,0 +1,335 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "\n", + "%matplotlib inline\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "from PIL import Image\n", + "import torch\n", + "from torch import nn\n", + "from importlib.util import find_spec\n", + "if find_spec(\"text_recognizer\") is None:\n", + " import sys\n", + " sys.path.append('..')" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "x = 64\n", + "depth = 4\n", + "channels = [x * 2 ** i for i in range(4)]" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "channels.reverse()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[512, 256, 128, 64]" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "channels" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "m = nn.ModuleList([nn.Conv2d(1,3,2), nn.Linear(1, 5)])" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "ename": "ModuleAttributeError", + "evalue": "'ModuleList' object has no attribute 'reverse'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mModuleAttributeError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mm\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreverse\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m~/Library/Caches/pypoetry/virtualenvs/text-recognizer-cxOiES-R-py3.8/lib/python3.8/site-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m__getattr__\u001b[0;34m(self, name)\u001b[0m\n\u001b[1;32m 769\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mname\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mmodules\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 770\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mmodules\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 771\u001b[0;31m raise ModuleAttributeError(\"'{}' object has no attribute '{}'\".format(\n\u001b[0m\u001b[1;32m 772\u001b[0m type(self).__name__, name))\n\u001b[1;32m 773\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mModuleAttributeError\u001b[0m: 'ModuleList' object has no attribute 'reverse'" + ] + } + ], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [], + "source": [ + "from text_recognizer.networks.unet import UNet" + ] + }, + { + "cell_type": "code", + "execution_count": 99, + "metadata": {}, + "outputs": [], + "source": [ + "net = UNet()" + ] + }, + { + "cell_type": "code", + "execution_count": 100, + "metadata": {}, + "outputs": [], + "source": [ + "x = torch.rand(1, 1, 256, 256)" + ] + }, + { + "cell_type": "code", + "execution_count": 101, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "ModuleList(\n", + " (0): DownSamplingBlock(\n", + " (conv_block): ConvBlock(\n", + " (activation): ReLU(inplace=True)\n", + " (block): Sequential(\n", + " (0): Conv2d(1, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", + " (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (2): ReLU(inplace=True)\n", + " (3): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", + " (4): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (5): ReLU(inplace=True)\n", + " )\n", + " )\n", + " (down_sampling): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)\n", + " )\n", + " (1): DownSamplingBlock(\n", + " (conv_block): ConvBlock(\n", + " (activation): ReLU(inplace=True)\n", + " (block): Sequential(\n", + " (0): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", + " (1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (2): ReLU(inplace=True)\n", + " (3): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", + " (4): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (5): ReLU(inplace=True)\n", + " )\n", + " )\n", + " (down_sampling): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)\n", + " )\n", + " (2): DownSamplingBlock(\n", + " (conv_block): ConvBlock(\n", + " (activation): ReLU(inplace=True)\n", + " (block): Sequential(\n", + " (0): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", + " (1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (2): ReLU(inplace=True)\n", + " (3): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", + " (4): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (5): ReLU(inplace=True)\n", + " )\n", + " )\n", + " (down_sampling): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)\n", + " )\n", + " (3): DownSamplingBlock(\n", + " (conv_block): ConvBlock(\n", + " (activation): ReLU(inplace=True)\n", + " (block): Sequential(\n", + " (0): Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", + " (1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (2): ReLU(inplace=True)\n", + " (3): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", + " (4): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (5): ReLU(inplace=True)\n", + " )\n", + " )\n", + " )\n", + ")" + ] + }, + "execution_count": 101, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "net.encoder_blocks" + ] + }, + { + "cell_type": "code", + "execution_count": 102, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "ModuleList(\n", + " (0): UpSamplingBlock(\n", + " (conv_block): ConvBlock(\n", + " (activation): ReLU(inplace=True)\n", + " (block): Sequential(\n", + " (0): Conv2d(768, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", + " (1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (2): ReLU(inplace=True)\n", + " (3): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", + " (4): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (5): ReLU(inplace=True)\n", + " )\n", + " )\n", + " (up_sampling): Upsample(scale_factor=2.0, mode=bilinear)\n", + " )\n", + " (1): UpSamplingBlock(\n", + " (conv_block): ConvBlock(\n", + " (activation): ReLU(inplace=True)\n", + " (block): Sequential(\n", + " (0): Conv2d(384, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", + " (1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (2): ReLU(inplace=True)\n", + " (3): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", + " (4): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (5): ReLU(inplace=True)\n", + " )\n", + " )\n", + " (up_sampling): Upsample(scale_factor=2.0, mode=bilinear)\n", + " )\n", + " (2): UpSamplingBlock(\n", + " (conv_block): ConvBlock(\n", + " (activation): ReLU(inplace=True)\n", + " (block): Sequential(\n", + " (0): Conv2d(192, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", + " (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (2): ReLU(inplace=True)\n", + " (3): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", + " (4): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (5): ReLU(inplace=True)\n", + " )\n", + " )\n", + " (up_sampling): Upsample(scale_factor=2.0, mode=bilinear)\n", + " )\n", + ")" + ] + }, + "execution_count": 102, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "net.decoder_blocks" + ] + }, + { + "cell_type": "code", + "execution_count": 104, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Conv2d(64, 3, kernel_size=(1, 1), stride=(1, 1))" + ] + }, + "execution_count": 104, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "net.head" + ] + }, + { + "cell_type": "code", + "execution_count": 103, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "torch.Size([1, 3, 256, 256])" + ] + }, + "execution_count": 103, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "net(x).shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.2" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/src/text_recognizer/networks/__init__.py b/src/text_recognizer/networks/__init__.py index 2cc1137..67e245c 100644 --- a/src/text_recognizer/networks/__init__.py +++ b/src/text_recognizer/networks/__init__.py @@ -6,7 +6,6 @@ from .densenet import DenseNet from .lenet import LeNet from .mlp import MLP from .residual_network import ResidualNetwork, ResidualNetworkEncoder -from .sparse_mlp import SparseMLP from .transformer import Transformer from .util import sliding_window from .wide_resnet import WideResidualNetwork @@ -22,6 +21,5 @@ __all__ = [ "ResidualNetworkEncoder", "sliding_window", "Transformer", - "SparseMLP", "WideResidualNetwork", ] diff --git a/src/text_recognizer/networks/sparse_mlp.py b/src/text_recognizer/networks/sparse_mlp.py deleted file mode 100644 index 53cf166..0000000 --- a/src/text_recognizer/networks/sparse_mlp.py +++ /dev/null @@ -1,78 +0,0 @@ -"""Defines the Sparse MLP network.""" -from typing import Callable, Dict, List, Optional, Union -import warnings - -from einops.layers.torch import Rearrange -from pytorch_block_sparse import BlockSparseLinear -import torch -from torch import nn - -from text_recognizer.networks.util import activation_function - -warnings.filterwarnings("ignore", category=DeprecationWarning) - - -class SparseMLP(nn.Module): - """Sparse multi layered perceptron network.""" - - def __init__( - self, - input_size: int = 784, - num_classes: int = 10, - hidden_size: Union[int, List] = 128, - num_layers: int = 3, - density: float = 0.1, - activation_fn: str = "relu", - ) -> None: - """Initialization of the MLP network. - - Args: - input_size (int): The input shape of the network. Defaults to 784. - num_classes (int): Number of classes in the dataset. Defaults to 10. - hidden_size (Union[int, List]): The number of `neurons` in each hidden layer. Defaults to 128. - num_layers (int): The number of hidden layers. Defaults to 3. - density (float): The density of activation at each layer. Default to 0.1. - activation_fn (str): Name of the activation function in the hidden layers. Defaults to - relu. - - """ - super().__init__() - - activation_fn = activation_function(activation_fn) - - if isinstance(hidden_size, int): - hidden_size = [hidden_size] * num_layers - - self.layers = [ - Rearrange("b c h w -> b (c h w)"), - nn.Linear(in_features=input_size, out_features=hidden_size[0]), - activation_fn, - ] - - for i in range(num_layers - 1): - self.layers += [ - BlockSparseLinear( - in_features=hidden_size[i], - out_features=hidden_size[i + 1], - density=density, - ), - activation_fn, - ] - - self.layers.append( - nn.Linear(in_features=hidden_size[-1], out_features=num_classes) - ) - - self.layers = nn.Sequential(*self.layers) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """The feedforward pass.""" - # If batch dimenstion is missing, it needs to be added. - if len(x.shape) < 4: - x = x[(None,) * (4 - len(x.shape))] - return self.layers(x) - - @property - def __name__(self) -> str: - """Returns the name of the network.""" - return "mlp" diff --git a/src/text_recognizer/networks/unet.py b/src/text_recognizer/networks/unet.py index eb4188b..51f242a 100644 --- a/src/text_recognizer/networks/unet.py +++ b/src/text_recognizer/networks/unet.py @@ -1,5 +1,5 @@ """UNet for segmentation.""" -from typing import List, Tuple +from typing import List, Optional, Tuple, Union import torch from torch import nn @@ -39,16 +39,23 @@ class DownSamplingBlock(nn.Module): """Basic down sampling block.""" def __init__( - self, channels: List[int], activation: str, pooling_kernel: int = 2 + self, + channels: List[int], + activation: str, + pooling_kernel: Union[int, bool] = 2, ) -> None: super().__init__() self.conv_block = ConvBlock(channels, activation) - self.down_sampling = nn.MaxPool2d(pooling_kernel) + self.down_sampling = nn.MaxPool2d(pooling_kernel) if pooling_kernel else None def forward(self, x: Tensor) -> Tuple[Tensor, Tensor]: """Return the convolutional block output and a down sampled tensor.""" x = self.conv_block(x) - return self.down_sampling(x), x + if self.down_sampling is not None: + x_down = self.down_sampling(x) + else: + x_down = None + return x_down, x class UpSamplingBlock(nn.Module): @@ -63,10 +70,11 @@ class UpSamplingBlock(nn.Module): scale_factor=scale_factor, mode="bilinear", align_corners=True ) - def forward(self, x: Tensor, x_skip: Tensor) -> Tensor: + def forward(self, x: Tensor, x_skip: Optional[Tensor] = None) -> Tensor: """Apply the up sampling and convolutional block.""" x = self.up_sampling(x) - x = torch.cat((x, x_skip), dim=1) + if x_skip is not None: + x = torch.cat((x, x_skip), dim=1) return self.conv_block(x) @@ -77,6 +85,7 @@ class UNet(nn.Module): self, in_channels: int = 1, base_channels: int = 64, + num_classes: int = 3, depth: int = 4, out_channels: int = 3, activation: str = "relu", @@ -84,27 +93,32 @@ class UNet(nn.Module): scale_factor: int = 2, ) -> None: super().__init__() - channels = [base_channels * 2 ** i for i in range(depth)] - self.down_sampling_blocks = self._configure_down_sampling_blocks( + self.depth = depth + channels = [1] + [base_channels * 2 ** i for i in range(depth)] + self.encoder_blocks = self._configure_down_sampling_blocks( channels, activation, pooling_kernel ) - self.up_sampling_blocks = self._configure_up_sampling_blocks( + self.decoder_blocks = self._configure_up_sampling_blocks( channels, activation, scale_factor ) + self.head = nn.Conv2d(base_channels, num_classes, kernel_size=1) + def _configure_down_sampling_blocks( self, channels: List[int], activation: str, pooling_kernel: int ) -> nn.ModuleList: - return nn.ModuleList( - [ + blocks = nn.ModuleList([]) + for i in range(len(channels) - 1): + pooling_kernel = pooling_kernel if i < self.depth - 1 else False + blocks += [ DownSamplingBlock( [channels[i], channels[i + 1], channels[i + 1]], activation, pooling_kernel, ) - for i in range(len(channels)) ] - ) + + return blocks def _configure_up_sampling_blocks( self, @@ -112,23 +126,33 @@ class UNet(nn.Module): activation: str, scale_factor: int, ) -> nn.ModuleList: + channels.reverse() return nn.ModuleList( [ UpSamplingBlock( - [channels[i], channels[i + 1], channels[i + 1]], + [channels[i] + channels[i + 1], channels[i + 1], channels[i + 1]], activation, scale_factor, ) + for i in range(len(channels) - 2) ] - for i in range(len(channels)) ) - def down_sampling(self, x: Tensor) -> List[Tensor]: + def encode(self, x: Tensor) -> Tuple[Tensor, List[Tensor]]: x_skips = [] - for block in self.down_sampling_blocks: + for block in self.encoder_blocks: x, x_skip = block(x) - x_skips.append(x_skip) + if x_skip is not None: + x_skips.append(x_skip) return x, x_skips - def up_sampling(self, x: Tensor, x_skips: List[Tensor]) -> Tensor: - pass + def decode(self, x: Tensor, x_skips: List[Tensor]) -> Tensor: + x = x_skips[-1] + for i, block in enumerate(self.decoder_blocks): + x = block(x, x_skips[-(i + 2)]) + return x + + def forward(self, x: Tensor) -> Tensor: + x, x_skips = self.encode(x) + x = self.decode(x, x_skips) + return self.head(x) -- cgit v1.2.3-70-g09d2