diff options
Diffstat (limited to 'text_recognizer/networks/encoders/efficientnet')
-rw-r--r-- | text_recognizer/networks/encoders/efficientnet/efficientnet.py | 9 | ||||
-rw-r--r-- | text_recognizer/networks/encoders/efficientnet/mbconv.py (renamed from text_recognizer/networks/encoders/efficientnet/mbconv_block.py) | 56 | ||||
-rw-r--r-- | text_recognizer/networks/encoders/efficientnet/utils.py | 141 |
3 files changed, 185 insertions, 21 deletions
diff --git a/text_recognizer/networks/encoders/efficientnet/efficientnet.py b/text_recognizer/networks/encoders/efficientnet/efficientnet.py new file mode 100644 index 0000000..d953c10 --- /dev/null +++ b/text_recognizer/networks/encoders/efficientnet/efficientnet.py @@ -0,0 +1,9 @@ +"""Efficient net.""" +from torch import nn, Tensor + + +class EfficientNet(nn.Module): + def __init__( + self, + ) -> None: + super().__init__() diff --git a/text_recognizer/networks/encoders/efficientnet/mbconv_block.py b/text_recognizer/networks/encoders/efficientnet/mbconv.py index c501777..602aeb7 100644 --- a/text_recognizer/networks/encoders/efficientnet/mbconv_block.py +++ b/text_recognizer/networks/encoders/efficientnet/mbconv.py @@ -5,7 +5,7 @@ import torch from torch import nn, Tensor from torch.nn import functional as F -from .utils import get_same_padding_conv2d +from .utils import calculate_output_image_size, drop_connection, get_same_padding_conv2d class MBConvBlock(nn.Module): @@ -22,19 +22,23 @@ class MBConvBlock(nn.Module): se_ratio: float, id_skip: bool, expand_ratio: int, - image_size: Tuple[int, int], + image_size: Optional[Tuple[int, int]], ) -> None: super().__init__() self.kernel_size = kernel_size self.bn_momentum = bn_momentum self.bn_eps = bn_eps self.id_skip = id_skip - ( - self._inverted_bottleneck, - self._depthwise, - self._squeeze_excite, - self._pointwise, - ) = self._build( + self.in_channels = self.in_channels + self.out_channels = out_channels + + # Placeholders for layers. 
+ self._inverted_bottleneck: nn.Sequential = None + self._depthwise: nn.Sequential = None + self._squeeze_excite: nn.Sequential = None + self._pointwise: nn.Sequential = None + + self._build( image_size=image_size, in_channels=in_channels, out_channels=out_channels, @@ -46,19 +50,17 @@ class MBConvBlock(nn.Module): def _build( self, - image_size: Tuple[int, int], + image_size: Optional[Tuple[int, int]], in_channels: int, out_channels: int, kernel_size: int, stride: int, expand_ratio: int, se_ratio: float, - ) -> Tuple[ - Optional[nn.Sequential], nn.Sequential, Optional[nn.Sequential], nn.Sequential - ]: + ) -> None: has_se = se_ratio is not None and 0.0 < se_ratio < 1.0 inner_channels = in_channels * expand_ratio - inverted_bottleneck = ( + self._inverted_bottleneck = ( self._configure_inverted_bottleneck( image_size=image_size, in_channels=in_channels, @@ -68,7 +70,7 @@ class MBConvBlock(nn.Module): else None ) - depthwise = self._configure_depthwise( + self._depthwise = self._configure_depthwise( image_size=image_size, in_channels=inner_channels, out_channels=inner_channels, @@ -78,7 +80,7 @@ class MBConvBlock(nn.Module): ) image_size = calculate_output_image_size(image_size, stride) - squeeze_excite = ( + self._squeeze_excite = ( self._configure_squeeze_excite( in_channels=inner_channels, out_channels=inner_channels, @@ -88,14 +90,13 @@ class MBConvBlock(nn.Module): else None ) - pointwise = self._configure_pointwise( + self._pointwise = self._configure_pointwise( image_size=image_size, in_channels=inner_channels, out_channels=out_channels ) - return inverted_bottleneck, depthwise, squeeze_excite, pointwise def _configure_inverted_bottleneck( self, - image_size: Tuple[int, int], + image_size: Optional[Tuple[int, int]], in_channels: int, out_channels: int, ) -> nn.Sequential: @@ -116,7 +117,7 @@ class MBConvBlock(nn.Module): def _configure_depthwise( self, - image_size: Tuple[int, int], + image_size: Optional[Tuple[int, int]], in_channels: int, out_channels: int, 
groups: int, @@ -159,7 +160,7 @@ class MBConvBlock(nn.Module): ) def _configure_pointwise( - self, image_size: Tuple[int, int], in_channels: int, out_channels: int + self, image_size: Optional[Tuple[int, int]], in_channels: int, out_channels: int ) -> nn.Sequential: Conv2d = get_same_padding_conv2d(image_size=image_size) return nn.Sequential( @@ -174,7 +175,18 @@ class MBConvBlock(nn.Module): ), ) - def forward(self, x: Tensor, drop_connection_rate: Optional[float]) -> Tensor: + def _stochastic_depth( + self, x: Tensor, residual: Tensor, drop_connection_rate: Optional[float] + ) -> Tensor: + if self.id_skip and self.stride == 1 and self.in_channels == self.out_channels: + if drop_connection_rate: + x = drop_connection(x, p=drop_connection_rate, training=self.training) + x += residual + return x + + def forward( + self, x: Tensor, drop_connection_rate: Optional[float] = None + ) -> Tensor: residual = x if self._inverted_bottleneck is not None: x = self._inverted_bottleneck(x) @@ -189,3 +201,5 @@ class MBConvBlock(nn.Module): x = self._pointwise(x) # Stochastic depth + x = self._stochastic_depth(x, residual, drop_connection_rate) + return x diff --git a/text_recognizer/networks/encoders/efficientnet/utils.py b/text_recognizer/networks/encoders/efficientnet/utils.py new file mode 100644 index 0000000..4b4a787 --- /dev/null +++ b/text_recognizer/networks/encoders/efficientnet/utils.py @@ -0,0 +1,141 @@ +"""Util functions for efficient net.""" +from functools import partial +import math +from typing import Any, Optional, Tuple, Type + +import torch +from torch import nn, Tensor +import torch.functional as F + + +def calculate_output_image_size( + image_size: Optional[Tuple[int, int]], stride: int +) -> Optional[Tuple[int, int]]: + """Calculates the output image size when using conv2d with same padding.""" + if image_size is None: + return None + height = int(math.ceil(image_size[0] / stride)) + width = int(math.ceil(image_size[1] / stride)) + return height, width + + 
def drop_connection(x: Tensor, p: float, training: bool) -> Tensor:
    """Randomly drop whole samples of a batch (stochastic depth).

    During training each sample in the batch is zeroed with probability ``p``;
    the surviving samples are scaled by ``1 / (1 - p)`` so the expected value
    of the output matches the input. Outside of training the input is returned
    unchanged.

    Args:
        x (Tensor): Input tensor with the batch as its first of four dimensions.
        p (float): Drop probability between 0.0 and 1.0.
        training (bool): The running mode.

    Returns:
        Tensor: Output after drop connection, same shape as ``x``.
    """
    assert 0.0 <= p <= 1.0, "p must be in range of [0, 1]"

    if not training:
        return x

    survival_prob = 1 - p
    if survival_prob == 0.0:
        # Every sample is dropped; dividing by a zero survival probability
        # would otherwise fill the output with NaNs.
        return torch.zeros_like(x)

    bsz = x.shape[0]

    # Binary per-sample mask: floor(survival_prob + U[0, 1)) is 1 with
    # probability survival_prob and 0 with probability p. The mask is created
    # directly with the dtype and on the device of the input.
    random_tensor = survival_prob + torch.rand(
        [bsz, 1, 1, 1], dtype=x.dtype, device=x.device
    )
    binary_tensor = torch.floor(random_tensor)

    return x / survival_prob * binary_tensor


def _same_padding(
    image_size: Tuple[int, int],
    kernel_size: Tuple[int, int],
    stride: Tuple[int, int],
    dilation: Tuple[int, int],
) -> Tuple[int, int, int, int]:
    """Return (left, right, top, bottom) padding for a "same" convolution.

    The padding is chosen so that the output size is ``ceil(input / stride)``
    in both spatial dimensions; any odd remainder goes to the right/bottom.
    """
    ih, iw = image_size
    kh, kw = kernel_size
    sh, sw = stride
    dh, dw = dilation
    oh, ow = math.ceil(ih / sh), math.ceil(iw / sw)
    pad_h = max((oh - 1) * sh + (kh - 1) * dh + 1 - ih, 0)
    pad_w = max((ow - 1) * sw + (kw - 1) * dw + 1 - iw, 0)
    return pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2


def get_same_padding_conv2d(image_size: Optional[Tuple[int, int]]) -> Type[nn.Conv2d]:
    """Select a conv2d constructor that applies "same" padding.

    Args:
        image_size (Optional[Tuple[int, int]]): Input (height, width) if it is
            known up front, otherwise None.

    Returns:
        ``Conv2dDynamicSamePadding`` when ``image_size`` is None, otherwise a
        ``functools.partial`` of ``Conv2dStaticSamePadding`` with the image
        size already bound. Both are called like a conv2d class.
    """
    if image_size is None:
        return Conv2dDynamicSamePadding
    return partial(Conv2dStaticSamePadding, image_size=image_size)


class Conv2dDynamicSamePadding(nn.Conv2d):
    """Conv2d whose "same" padding is computed from each input at call time."""

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int,
        stride: int = 1,
        dilation: int = 1,
        groups: int = 1,
        bias: bool = True,
    ) -> None:
        super().__init__(
            in_channels, out_channels, kernel_size, stride, 0, dilation, groups, bias
        )
        # nn.Conv2d already stores an int stride as a 2-tuple; only a
        # one-element sequence still needs to be expanded.
        if len(self.stride) != 2:
            self.stride = (self.stride[0],) * 2

    def forward(self, x: Tensor) -> Tensor:
        """Pad the input for "same" output size, then convolve."""
        padding = _same_padding(
            x.shape[-2:], self.weight.shape[-2:], self.stride, self.dilation
        )
        if any(padding):
            x = nn.functional.pad(x, list(padding))
        return nn.functional.conv2d(
            x,
            self.weight,
            self.bias,
            self.stride,
            self.padding,
            self.dilation,
            self.groups,
        )


class Conv2dStaticSamePadding(nn.Conv2d):
    """Conv2d whose "same" padding is precomputed for a fixed image size."""

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int,
        image_size: Tuple[int, int],
        stride: int = 1,
        **kwargs: Any
    ) -> None:
        super().__init__(in_channels, out_channels, kernel_size, stride, **kwargs)
        # nn.Conv2d already stores an int stride as a 2-tuple; only a
        # one-element sequence still needs to be expanded.
        if len(self.stride) != 2:
            self.stride = (self.stride[0],) * 2

        # Calculate padding based on image size and save it.
        padding = _same_padding(
            image_size, self.weight.shape[-2:], self.stride, self.dilation
        )
        if any(padding):
            self.static_padding = nn.ZeroPad2d(padding)
        else:
            self.static_padding = nn.Identity()

    def forward(self, x: Tensor) -> Tensor:
        """Apply the precomputed padding, then convolve."""
        x = self.static_padding(x)
        return nn.functional.conv2d(
            x,
            self.weight,
            self.bias,
            self.stride,
            self.padding,
            self.dilation,
            self.groups,
        )