 notebooks/00-scratch-pad.ipynb                                  | 198
 text_recognizer/networks/encoders/efficientnet/efficientnet.py |  68
 text_recognizer/networks/encoders/efficientnet/mbconv.py       |  64
 text_recognizer/networks/encoders/efficientnet/utils.py        | 107
 4 files changed, 243 insertions(+), 194 deletions(-)
diff --git a/notebooks/00-scratch-pad.ipynb b/notebooks/00-scratch-pad.ipynb
index 1816021..32f96d1 100644
--- a/notebooks/00-scratch-pad.ipynb
+++ b/notebooks/00-scratch-pad.ipynb
@@ -2,18 +2,9 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 1,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "The autoreload extension is already loaded. To reload it, use:\n",
-      "  %reload_ext autoreload\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "%load_ext autoreload\n",
    "%autoreload 2\n",
@@ -39,13 +30,180 @@
   },
   {
    "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from text_recognizer.networks.encoders.efficientnet.efficientnet import EfficientNet"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 63,
+   "metadata": {
+    "scrolled": false
+   },
+   "outputs": [],
+   "source": [
+    "en = EfficientNet(\"b0\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 64,
+   "metadata": {
+    "scrolled": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "==========================================================================================\n",
+      "Layer (type:depth-idx)                   Output Shape              Param #\n",
+      "==========================================================================================\n",
+      "├─Sequential: 1-1                        [-1, 32, 111, 111]        --\n",
+      "|    └─Conv2d: 2-1                       [-1, 32, 111, 111]        288\n",
+      "|    └─BatchNorm2d: 2-2                  [-1, 32, 111, 111]        64\n",
+      "|    └─Mish: 2-3                         [-1, 32, 111, 111]        --\n",
+      "├─ModuleList: 1                          []                        --\n",
+      "|    └─MBConvBlock: 2-4                  [-1, 16, 111, 111]        --\n",
+      "|    |    └─Sequential: 3-1              [-1, 32, 111, 111]        352\n",
+      "|    |    └─Sequential: 3-2              [-1, 32, 111, 111]        552\n",
+      "|    |    └─Sequential: 3-3              [-1, 16, 111, 111]        544\n",
+      "|    └─MBConvBlock: 2-5                  [-1, 24, 55, 55]          --\n",
+      "|    |    └─Sequential: 3-4              [-1, 96, 111, 111]        1,728\n",
+      "|    |    └─Sequential: 3-5              [-1, 96, 55, 55]          1,056\n",
+      "|    |    └─Sequential: 3-6              [-1, 96, 55, 55]          4,728\n",
+      "|    |    └─Sequential: 3-7              [-1, 24, 55, 55]          2,352\n",
+      "|    └─MBConvBlock: 2-6                  [-1, 24, 55, 55]          --\n",
+      "|    |    └─Sequential: 3-8              [-1, 144, 55, 55]         3,744\n",
+      "|    |    └─Sequential: 3-9              [-1, 144, 55, 55]         1,584\n",
+      "|    |    └─Sequential: 3-10             [-1, 144, 55, 55]         10,548\n",
+      "|    |    └─Sequential: 3-11             [-1, 24, 55, 55]          3,504\n",
+      "|    └─MBConvBlock: 2-7                  [-1, 40, 27, 27]          --\n",
+      "|    |    └─Sequential: 3-12             [-1, 144, 55, 55]         3,744\n",
+      "|    |    └─Sequential: 3-13             [-1, 144, 27, 27]         3,888\n",
+      "|    |    └─Sequential: 3-14             [-1, 144, 27, 27]         10,548\n",
+      "|    |    └─Sequential: 3-15             [-1, 40, 27, 27]          5,840\n",
+      "|    └─MBConvBlock: 2-8                  [-1, 40, 27, 27]          --\n",
+      "|    |    └─Sequential: 3-16             [-1, 240, 27, 27]         10,080\n",
+      "|    |    └─Sequential: 3-17             [-1, 240, 27, 27]         6,480\n",
+      "|    |    └─Sequential: 3-18             [-1, 240, 27, 27]         29,100\n",
+      "|    |    └─Sequential: 3-19             [-1, 40, 27, 27]          9,680\n",
+      "|    └─MBConvBlock: 2-9                  [-1, 80, 13, 13]          --\n",
+      "|    |    └─Sequential: 3-20             [-1, 240, 27, 27]         10,080\n",
+      "|    |    └─Sequential: 3-21             [-1, 240, 13, 13]         2,640\n",
+      "|    |    └─Sequential: 3-22             [-1, 240, 13, 13]         29,100\n",
+      "|    |    └─Sequential: 3-23             [-1, 80, 13, 13]          19,360\n",
+      "|    └─MBConvBlock: 2-10                 [-1, 80, 13, 13]          --\n",
+      "|    |    └─Sequential: 3-24             [-1, 480, 13, 13]         39,360\n",
+      "|    |    └─Sequential: 3-25             [-1, 480, 13, 13]         5,280\n",
+      "|    |    └─Sequential: 3-26             [-1, 480, 13, 13]         115,800\n",
+      "|    |    └─Sequential: 3-27             [-1, 80, 13, 13]          38,560\n",
+      "|    └─MBConvBlock: 2-11                 [-1, 80, 13, 13]          --\n",
+      "|    |    └─Sequential: 3-28             [-1, 480, 13, 13]         39,360\n",
+      "|    |    └─Sequential: 3-29             [-1, 480, 13, 13]         5,280\n",
+      "|    |    └─Sequential: 3-30             [-1, 480, 13, 13]         115,800\n",
+      "|    |    └─Sequential: 3-31             [-1, 80, 13, 13]          38,560\n",
+      "|    └─MBConvBlock: 2-12                 [-1, 112, 13, 13]         --\n",
+      "|    |    └─Sequential: 3-32             [-1, 480, 13, 13]         39,360\n",
+      "|    |    └─Sequential: 3-33             [-1, 480, 13, 13]         12,960\n",
+      "|    |    └─Sequential: 3-34             [-1, 480, 13, 13]         115,800\n",
+      "|    |    └─Sequential: 3-35             [-1, 112, 13, 13]         53,984\n",
+      "|    └─MBConvBlock: 2-13                 [-1, 112, 13, 13]         --\n",
+      "|    |    └─Sequential: 3-36             [-1, 672, 13, 13]         76,608\n",
+      "|    |    └─Sequential: 3-37             [-1, 672, 13, 13]         18,144\n",
+      "|    |    └─Sequential: 3-38             [-1, 672, 13, 13]         226,632\n",
+      "|    |    └─Sequential: 3-39             [-1, 112, 13, 13]         75,488\n",
+      "|    └─MBConvBlock: 2-14                 [-1, 112, 13, 13]         --\n",
+      "|    |    └─Sequential: 3-40             [-1, 672, 13, 13]         76,608\n",
+      "|    |    └─Sequential: 3-41             [-1, 672, 13, 13]         18,144\n",
+      "|    |    └─Sequential: 3-42             [-1, 672, 13, 13]         226,632\n",
+      "|    |    └─Sequential: 3-43             [-1, 112, 13, 13]         75,488\n",
+      "|    └─MBConvBlock: 2-15                 [-1, 192, 6, 6]           --\n",
+      "|    |    └─Sequential: 3-44             [-1, 672, 13, 13]         76,608\n",
+      "|    |    └─Sequential: 3-45             [-1, 672, 6, 6]           18,144\n",
+      "|    |    └─Sequential: 3-46             [-1, 672, 6, 6]           226,632\n",
+      "|    |    └─Sequential: 3-47             [-1, 192, 6, 6]           129,408\n",
+      "|    └─MBConvBlock: 2-16                 [-1, 192, 6, 6]           --\n",
+      "|    |    └─Sequential: 3-48             [-1, 1152, 6, 6]          223,488\n",
+      "|    |    └─Sequential: 3-49             [-1, 1152, 6, 6]          31,104\n",
+      "|    |    └─Sequential: 3-50             [-1, 1152, 6, 6]          664,992\n",
+      "|    |    └─Sequential: 3-51             [-1, 192, 6, 6]           221,568\n",
+      "|    └─MBConvBlock: 2-17                 [-1, 192, 6, 6]           --\n",
+      "|    |    └─Sequential: 3-52             [-1, 1152, 6, 6]          223,488\n",
+      "|    |    └─Sequential: 3-53             [-1, 1152, 6, 6]          31,104\n",
+      "|    |    └─Sequential: 3-54             [-1, 1152, 6, 6]          664,992\n",
+      "|    |    └─Sequential: 3-55             [-1, 192, 6, 6]           221,568\n",
+      "|    └─MBConvBlock: 2-18                 [-1, 192, 6, 6]           --\n",
+      "|    |    └─Sequential: 3-56             [-1, 1152, 6, 6]          223,488\n",
+      "|    |    └─Sequential: 3-57             [-1, 1152, 6, 6]          31,104\n",
+      "|    |    └─Sequential: 3-58             [-1, 1152, 6, 6]          664,992\n",
+      "|    |    └─Sequential: 3-59             [-1, 192, 6, 6]           221,568\n",
+      "|    └─MBConvBlock: 2-19                 [-1, 320, 6, 6]           --\n",
+      "|    |    └─Sequential: 3-60             [-1, 1152, 6, 6]          223,488\n",
+      "|    |    └─Sequential: 3-61             [-1, 1152, 6, 6]          12,672\n",
+      "|    |    └─Sequential: 3-62             [-1, 1152, 6, 6]          664,992\n",
+      "|    |    └─Sequential: 3-63             [-1, 320, 6, 6]           369,280\n",
+      "├─Sequential: 1-2                        [-1, 1280, 6, 6]          --\n",
+      "|    └─Conv2d: 2-20                      [-1, 1280, 6, 6]          409,600\n",
+      "|    └─BatchNorm2d: 2-21                 [-1, 1280, 6, 6]          2,560\n",
+      "==========================================================================================\n",
+      "Total params: 7,142,272\n",
+      "Trainable params: 7,142,272\n",
+      "Non-trainable params: 0\n",
+      "Total mult-adds (M): 657.05\n",
+      "==========================================================================================\n",
+      "Input size (MB): 0.19\n",
+      "Forward/backward pass size (MB): 115.14\n",
+      "Params size (MB): 27.25\n",
+      "Estimated Total Size (MB): 142.58\n",
+      "==========================================================================================\n"
+     ]
+    }
+   ],
+   "source": [
+    "summary(en, (1, 224, 224));"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 54,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(2, 2)"
+      ]
+     },
+     "execution_count": 54,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "(2,) * 2"
+   ]
+  },
+  {
+   "cell_type": "code",
    "execution_count": 4,
    "metadata": {},
    "outputs": [
     {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/aktersnurra/.cache/pypoetry/virtualenvs/text-recognizer-ejNaVa9M-py3.9/lib/python3.9/site-packages/torch/cuda/__init__.py:52: UserWarning: CUDA initialization: CUDA unknown error - this may be due to an incorrectly set up environment, e.g. changing env variable CUDA_VISIBLE_DEVICES after program start. Setting the available devices to be zero. (Triggered internally at /pytorch/c10/cuda/CUDAFunctions.cpp:115.)\n",
+      "  return torch._C._cuda_getDeviceCount() > 0\n"
+     ]
+    },
+    {
      "data": {
       "text/plain": [
-       "True"
+       "False"
       ]
      },
      "execution_count": 4,
@@ -195,21 +353,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "ename": "ModuleNotFoundError",
-     "evalue": "No module named 'text_recognizer.networks.encoders.efficientnet.efficientnet'",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mModuleNotFoundError\u001b[0m                       Traceback (most recent call last)",
-      "\u001b[0;32m<ipython-input-6-c209d858dfb1>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mtext_recognizer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnetworks\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mencoders\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mefficientnet\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mefficientnet\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mEfficientNet\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
-      "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'text_recognizer.networks.encoders.efficientnet.efficientnet'"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "from text_recognizer.networks.encoders.efficientnet.efficientnet import EfficientNet"
    ]
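
Note on the torchsummary output above: the stem halves 224 x 224 to 111 x 111 rather than 112 x 112 because nn.Conv2d defaults to padding=0; only the MBConv blocks pad explicitly (see mbconv.py below). A minimal shape-check sketch of that arithmetic (illustrative only, not part of the commit):

    import torch
    from torch import nn

    # The stem as built in efficientnet.py below: kernel 3, stride 2, no padding.
    stem = nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3, stride=2, bias=False)
    out = stem(torch.randn(1, 1, 224, 224))
    print(out.shape)  # torch.Size([1, 32, 111, 111]); floor((224 - 3) / 2) + 1 = 111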
diff --git a/text_recognizer/networks/encoders/efficientnet/efficientnet.py b/text_recognizer/networks/encoders/efficientnet/efficientnet.py
index 98d58fd..b527d90 100644
--- a/text_recognizer/networks/encoders/efficientnet/efficientnet.py
+++ b/text_recognizer/networks/encoders/efficientnet/efficientnet.py
@@ -1,13 +1,9 @@
 """Efficient net."""
-from typing import Tuple
-
 from torch import nn, Tensor
 
 from .mbconv import MBConvBlock
 from .utils import (
     block_args,
-    calculate_output_image_size,
-    get_same_padding_conv2d,
     round_filters,
     round_repeats,
 )
@@ -28,11 +24,19 @@ class EfficientNet(nn.Module):
         "l2": (4.3, 5.3, 0.5),
     }
 
-    def __init__(self, arch: str, image_size: Tuple[int, int]) -> None:
+    def __init__(
+        self,
+        arch: str,
+        stochastic_dropout_rate: float = 0.2,
+        bn_momentum: float = 0.99,
+        bn_eps: float = 1.0e-3,
+    ) -> None:
         super().__init__()
         assert arch in self.archs, f"{arch} not a valid efficient net architecture!"
         self.arch = self.archs[arch]
-        self.image_size = image_size
+        self.stochastic_dropout_rate = stochastic_dropout_rate
+        self.bn_momentum = 1 - bn_momentum
+        self.bn_eps = bn_eps
         self._conv_stem: nn.Sequential = None
         self._blocks: nn.Sequential = None
         self._conv_head: nn.Sequential = None
@@ -42,57 +46,53 @@ class EfficientNet(nn.Module):
         _block_args = block_args()
         in_channels = 1  # BW
         out_channels = round_filters(32, self.arch)
-        Conv2d = get_same_padding_conv2d(image_size=self.image_size)
         self._conv_stem = nn.Sequential(
-            Conv2d(
+            nn.Conv2d(
                 in_channels=in_channels,
                 out_channels=out_channels,
                 kernel_size=3,
-                stride=2,
+                stride=(2, 2),
                 bias=False,
             ),
-            nn.BatchNorm2d(num_features=out_channels, momentum=bn_momentum, eps=bn_eps),
+            nn.BatchNorm2d(
+                num_features=out_channels, momentum=self.bn_momentum, eps=self.bn_eps
+            ),
             nn.Mish(inplace=True),
         )
-        image_size = calculate_output_image_size(self.image_size, 2)
         self._blocks = nn.ModuleList([])
         for args in _block_args:
             args.in_channels = round_filters(args.in_channels, self.arch)
             args.out_channels = round_filters(args.out_channels, self.arch)
-            args.num_repeat = round_repeats(args.num_repeat, self.arch)
-
-            self._blocks.append(
-                MBConvBlock(
-                    **args,
-                    bn_momentum=bn_momentum,
-                    bn_eps=bn_eps,
-                    image_size=image_size,
-                )
-            )
-            image_size = calculate_output_image_size(image_size, args.stride)
-            if args.num_repeat > 1:
-                args.in_channels = args.out_channels
-                args.stride = 1
-            for _ in range(args.num_repeat - 1):
+            args.num_repeats = round_repeats(args.num_repeats, self.arch)
+            for _ in range(args.num_repeats):
                 self._blocks.append(
                     MBConvBlock(
                         **args,
-                        bn_momentum=bn_momentum,
-                        bn_eps=bn_eps,
-                        image_size=image_size,
+                        bn_momentum=self.bn_momentum,
+                        bn_eps=self.bn_eps,
                     )
                 )
+                args.in_channels = args.out_channels
+                args.stride = 1
 
-        in_channels = args.out_channels
+        in_channels = round_filters(320, self.arch)
         out_channels = round_filters(1280, self.arch)
-        Conv2d = get_same_padding_conv2d(image_size=image_size)
         self._conv_head = nn.Sequential(
-            Conv2d(in_channels, out_channels, kernel_size=1, bias=False),
-            nn.BatchNorm2d(num_features=out_channels, momentum=bn_momentum, eps=bn_eps),
+            nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, bias=False),
+            nn.BatchNorm2d(
+                num_features=out_channels, momentum=self.bn_momentum, eps=self.bn_eps
+            ),
         )
 
     def extract_features(self, x: Tensor) -> Tensor:
         x = self._conv_stem(x)
+        for i, block in enumerate(self._blocks):
+            stochastic_dropout_rate = self.stochastic_dropout_rate
+            if self.stochastic_dropout_rate:
+                stochastic_dropout_rate *= i / len(self._blocks)
+            x = block(x, stochastic_dropout_rate=stochastic_dropout_rate)
+        x = self._conv_head(x)
+        return x
 
     def forward(self, x: Tensor) -> Tensor:
-        pass
+        return self.extract_features(x)
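
The rewritten extract_features scales the stochastic depth rate linearly with block index: the first block is never dropped, and the drop probability approaches the configured stochastic_dropout_rate for the last block. A standalone sketch of that schedule (illustrative, not part of the commit):

    # Linear drop-rate schedule over the 16 MBConv blocks of the b0 config above.
    num_blocks = 16
    max_rate = 0.2  # the default stochastic_dropout_rate
    rates = [max_rate * i / num_blocks for i in range(num_blocks)]
    print(rates[0], rates[-1])  # 0.0 0.1875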
diff --git a/text_recognizer/networks/encoders/efficientnet/mbconv.py b/text_recognizer/networks/encoders/efficientnet/mbconv.py
index fbb3f22..e43771a 100644
--- a/text_recognizer/networks/encoders/efficientnet/mbconv.py
+++ b/text_recognizer/networks/encoders/efficientnet/mbconv.py
@@ -1,11 +1,11 @@
 """Mobile inverted residual block."""
-from typing import Any, Optional, Tuple
+from typing import Any, Optional, Union, Tuple
 
 import torch
 from torch import nn, Tensor
-from torch.nn import functional as F
+import torch.nn.functional as F
 
-from .utils import calculate_output_image_size, drop_connection, get_same_padding_conv2d
+from .utils import stochastic_depth
 
 
 class MBConvBlock(nn.Module):
@@ -16,22 +16,30 @@ class MBConvBlock(nn.Module):
         in_channels: int,
         out_channels: int,
         kernel_size: int,
-        stride: int,
+        stride: Union[Tuple[int, int], int],
         bn_momentum: float,
         bn_eps: float,
         se_ratio: float,
         expand_ratio: int,
-        image_size: Optional[Tuple[int, int]],
         *args: Any,
         **kwargs: Any,
     ) -> None:
         super().__init__()
         self.kernel_size = kernel_size
+        self.stride = (stride,) * 2 if isinstance(stride, int) else stride
         self.bn_momentum = bn_momentum
         self.bn_eps = bn_eps
-        self.in_channels = self.in_channels
+        self.in_channels = in_channels
         self.out_channels = out_channels
 
+        if self.stride == (2, 2):
+            self.pad = [
+                (self.kernel_size - 1) // 2 - 1,
+                (self.kernel_size - 1) // 2,
+            ] * 2
+        else:
+            self.pad = [(self.kernel_size - 1) // 2] * 4
+
         # Placeholders for layers.
         self._inverted_bottleneck: nn.Sequential = None
         self._depthwise: nn.Sequential = None
@@ -39,7 +47,6 @@ class MBConvBlock(nn.Module):
         self._pointwise: nn.Sequential = None
 
         self._build(
-            image_size=image_size,
             in_channels=in_channels,
             out_channels=out_channels,
             kernel_size=kernel_size,
@@ -50,11 +57,10 @@ class MBConvBlock(nn.Module):
 
     def _build(
         self,
-        image_size: Optional[Tuple[int, int]],
         in_channels: int,
         out_channels: int,
         kernel_size: int,
-        stride: int,
+        stride: Union[Tuple[int, int], int],
         expand_ratio: int,
         se_ratio: float,
     ) -> None:
@@ -62,7 +68,6 @@ class MBConvBlock(nn.Module):
         inner_channels = in_channels * expand_ratio
         self._inverted_bottleneck = (
             self._configure_inverted_bottleneck(
-                image_size=image_size,
                 in_channels=in_channels,
                 out_channels=inner_channels,
             )
@@ -71,7 +76,6 @@ class MBConvBlock(nn.Module):
         )
 
         self._depthwise = self._configure_depthwise(
-            image_size=image_size,
             in_channels=inner_channels,
             out_channels=inner_channels,
             groups=inner_channels,
@@ -79,7 +83,6 @@ class MBConvBlock(nn.Module):
             stride=stride,
         )
 
-        image_size = calculate_output_image_size(image_size, stride)
         self._squeeze_excite = (
             self._configure_squeeze_excite(
                 in_channels=inner_channels,
@@ -91,19 +94,17 @@ class MBConvBlock(nn.Module):
         )
 
         self._pointwise = self._configure_pointwise(
-            image_size=image_size, in_channels=inner_channels, out_channels=out_channels
+            in_channels=inner_channels, out_channels=out_channels
         )
 
     def _configure_inverted_bottleneck(
         self,
-        image_size: Optional[Tuple[int, int]],
         in_channels: int,
         out_channels: int,
     ) -> nn.Sequential:
         """Expansion phase."""
-        Conv2d = get_same_padding_conv2d(image_size=image_size)
         return nn.Sequential(
-            Conv2d(
+            nn.Conv2d(
                 in_channels=in_channels,
                 out_channels=out_channels,
                 kernel_size=1,
@@ -117,16 +118,14 @@ class MBConvBlock(nn.Module):
 
     def _configure_depthwise(
         self,
-        image_size: Optional[Tuple[int, int]],
         in_channels: int,
         out_channels: int,
         groups: int,
         kernel_size: int,
-        stride: int,
+        stride: Union[Tuple[int, int], int],
     ) -> nn.Sequential:
-        Conv2d = get_same_padding_conv2d(image_size=image_size)
         return nn.Sequential(
-            Conv2d(
+            nn.Conv2d(
                 in_channels=in_channels,
                 out_channels=out_channels,
                 kernel_size=kernel_size,
@@ -143,16 +142,15 @@ class MBConvBlock(nn.Module):
     def _configure_squeeze_excite(
         self, in_channels: int, out_channels: int, se_ratio: float
     ) -> nn.Sequential:
-        Conv2d = get_same_padding_conv2d(image_size=(1, 1))
         num_squeezed_channels = max(1, int(in_channels * se_ratio))
         return nn.Sequential(
-            Conv2d(
+            nn.Conv2d(
                 in_channels=in_channels,
                 out_channels=num_squeezed_channels,
                 kernel_size=1,
             ),
             nn.Mish(inplace=True),
-            Conv2d(
+            nn.Conv2d(
                 in_channels=num_squeezed_channels,
                 out_channels=out_channels,
                 kernel_size=1,
@@ -160,11 +158,10 @@ class MBConvBlock(nn.Module):
         )
 
     def _configure_pointwise(
-        self, image_size: Optional[Tuple[int, int]], in_channels: int, out_channels: int
+        self, in_channels: int, out_channels: int
     ) -> nn.Sequential:
-        Conv2d = get_same_padding_conv2d(image_size=image_size)
         return nn.Sequential(
-            Conv2d(
+            nn.Conv2d(
                 in_channels=in_channels,
                 out_channels=out_channels,
                 kernel_size=1,
@@ -176,20 +173,23 @@ class MBConvBlock(nn.Module):
         )
 
     def _stochastic_depth(
-        self, x: Tensor, residual: Tensor, drop_connection_rate: Optional[float]
+        self, x: Tensor, residual: Tensor, stochastic_dropout_rate: Optional[float]
    ) -> Tensor:
-        if self.id_skip and self.stride == 1 and self.in_channels == self.out_channels:
-            if drop_connection_rate:
-                x = drop_connection(x, p=drop_connection_rate, training=self.training)
+        if self.stride == (1, 1) and self.in_channels == self.out_channels:
+            if stochastic_dropout_rate:
+                x = stochastic_depth(
+                    x, p=stochastic_dropout_rate, training=self.training
+                )
             x += residual
         return x
 
     def forward(
-        self, x: Tensor, drop_connection_rate: Optional[float] = None
+        self, x: Tensor, stochastic_dropout_rate: Optional[float] = None
     ) -> Tensor:
         residual = x
         if self._inverted_bottleneck is not None:
             x = self._inverted_bottleneck(x)
 
+        x = F.pad(x, self.pad)
         x = self._depthwise(x)
 
@@ -201,5 +201,5 @@ class MBConvBlock(nn.Module):
         x = self._pointwise(x)
 
         # Stochastic depth
-        x = self._stochastic_depth(x, residual, drop_connection_rate)
+        x = self._stochastic_depth(x, residual, stochastic_dropout_rate)
         return x
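
MBConvBlock now applies TF-style "same" padding itself via F.pad before the depthwise convolution, instead of the image-size-aware Conv2d wrappers deleted from utils.py below. For stride (2, 2) the padding is asymmetric ([0, 1, 0, 1] for a 3 x 3 kernel, in F.pad's left/right/top/bottom order), which halves even input sizes exactly. A quick shape check (hypothetical values, not part of the commit):

    import torch
    import torch.nn.functional as F

    kernel_size, stride = 3, (2, 2)
    if stride == (2, 2):
        pad = [(kernel_size - 1) // 2 - 1, (kernel_size - 1) // 2] * 2  # [0, 1, 0, 1]
    else:
        pad = [(kernel_size - 1) // 2] * 4

    x = F.pad(torch.randn(1, 32, 112, 112), pad)
    out = F.conv2d(x, torch.randn(32, 32, kernel_size, kernel_size), stride=stride)
    print(out.shape)  # torch.Size([1, 32, 56, 56]), i.e. ceil(112 / 2)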
diff --git a/text_recognizer/networks/encoders/efficientnet/utils.py b/text_recognizer/networks/encoders/efficientnet/utils.py
index ff52485..6f293db 100644
--- a/text_recognizer/networks/encoders/efficientnet/utils.py
+++ b/text_recognizer/networks/encoders/efficientnet/utils.py
@@ -1,27 +1,15 @@
 """Util functions for efficient net."""
 from functools import partial
 import math
-from typing import Any, Optional, Tuple, Type
+from typing import Any, Optional, Union, Tuple, Type
 
 from omegaconf import OmegaConf
 import torch
-from torch import nn, Tensor
-import torch.functional as F
+from torch import Tensor
 
 
-def calculate_output_image_size(
-    image_size: Optional[Tuple[int, int]], stride: int
-) -> Optional[Tuple[int, int]]:
-    """Calculates the output image size when using conv2d with same padding."""
-    if image_size is None:
-        return None
-    height = int(math.ceil(image_size[0] / stride))
-    width = int(math.ceil(image_size[1] / stride))
-    return height, width
-
-
-def drop_connection(x: Tensor, p: float, training: bool) -> Tensor:
-    """Drop connection.
+def stochastic_depth(x: Tensor, p: float, training: bool) -> Tensor:
+    """Stochastic depth.
 
     Drops the entire convolution with a given survival probability.
@@ -57,91 +45,6 @@ def drop_connection(x: Tensor, p: float, training: bool) -> Tensor:
     return out
 
 
-def get_same_padding_conv2d(image_size: Optional[Tuple[int, int]]) -> Type[nn.Conv2d]:
-    if image_size is None:
-        return Conv2dDynamicSamePadding
-    return partial(Conv2dStaticSamePadding, image_size=image_size)
-
-
-class Conv2dDynamicSamePadding(nn.Conv2d):
-    def __init__(
-        self,
-        in_channels: int,
-        out_channels: int,
-        kernel_size: int,
-        stride: int = 1,
-        dilation: int = 1,
-        groups: int = 1,
-        bias: bool = True,
-    ) -> None:
-        super().__init__(
-            in_channels, out_channels, kernel_size, stride, 0, dilation, groups, bias
-        )
-        self.stride = [self.stride] * 2
-
-    def forward(self, x: Tensor) -> Tensor:
-        ih, iw = x.shape[-2:]
-        kh, kw = self.weight.shape[-2:]
-        sh, sw = self.stride
-        oh, ow = math.ceil(ih / sh), math.ceil(iw / sw)
-        pad_h = max((oh - 1) * self.stride[0] + (kh - 1) * self.dilation[0] + 1 - ih, 0)
-        pad_w = max((ow - 1) * self.stride[1] + (kw - 1) * self.dilation[1] + 1 - iw, 0)
-        if pad_h > 0 or pad_w > 0:
-            x = F.pad(
-                x, [pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2]
-            )
-        return F.conv2d(
-            x,
-            self.weight,
-            self.bias,
-            self.stride,
-            self.padding,
-            self.dilation,
-            self.groups,
-        )
-
-
-class Conv2dStaticSamePadding(nn.Conv2d):
-    def __init__(
-        self,
-        in_channels: int,
-        out_channels: int,
-        kernel_size: int,
-        image_size: Tuple[int, int],
-        stride: int = 1,
-        **kwargs: Any
-    ):
-        super().__init__(in_channels, out_channels, kernel_size, stride, **kwargs)
-        self.stride = [self.stride] * 2
-
-        # Calculate padding based on image size and save it.
-        ih, iw = image_size
-        kh, kw = self.weight.shape[-2:]
-        sh, sw = self.stride
-        oh, ow = math.ceil(ih / sh), math.ceil(iw / sw)
-        pad_h = max((oh - 1) * self.stride[0] + (kh - 1) * self.dilation[0] + 1 - ih, 0)
-        pad_w = max((ow - 1) * self.stride[1] + (kw - 1) * self.dilation[1] + 1 - iw, 0)
-        if pad_h > 0 or pad_w > 0:
-            self.static_padding = nn.ZeroPad2d(
-                (pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2)
-            )
-        else:
-            self.static_padding = nn.Identity()
-
-    def forward(self, x: Tensor) -> Tensor:
-        x = self.static_padding(x)
-        x = F.pad(
-            x,
-            self.weight,
-            self.bias,
-            self.stride,
-            self.padding,
-            self.dilation,
-            self.groups,
-        )
-        return x
-
-
 def round_filters(filters: int, arch: Tuple[float, float, float]) -> int:
     multiplier = arch[0]
     divisor = 8
@@ -160,7 +63,7 @@ def block_args():
     keys = [
         "num_repeats",
         "kernel_size",
-        "strides",
+        "stride",
         "expand_ratio",
         "in_channels",
         "out_channels",
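
The body of the renamed stochastic_depth falls outside the hunks above (only its docstring and "return out" are visible). For reference, the conventional formulation it is expected to implement, sketched under that assumption (the committed body may differ in detail):

    import torch
    from torch import Tensor

    def stochastic_depth(x: Tensor, p: float, training: bool) -> Tensor:
        # Drop the entire residual branch of a sample with probability p.
        if not training or p == 0.0:
            return x
        survival_prob = 1.0 - p
        # One Bernoulli draw per sample, broadcast over channels and spatial dims.
        mask = torch.rand([x.shape[0], 1, 1, 1], device=x.device) < survival_prob
        return x / survival_prob * mask  # rescale to keep the expected activation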