summary refs log tree commit diff
path: root/src/text_recognizer
diff options
context:
space:
mode:
author aktersnurra <grydholm@kth.se> 2020-12-02 23:48:52 +0100
committer aktersnurra <grydholm@kth.se> 2020-12-02 23:48:52 +0100
commit 5529e0fc9ca39e81fe0f08a54f257d32f0afe120 (patch)
tree f2be992554e278857db7d56786dba54a76d439c7 /src/text_recognizer
parent e3b039c9adb4bce42ede4cb682a3ae71e797539a (diff)
parent 8e3985c9cde6666e4314973312135ec1c7a025b9 (diff)
Merge branch 'master' of github.com:aktersnurra/text-recognizer
Diffstat (limited to 'src/text_recognizer')
-rw-r--r-- src/text_recognizer/datasets/transforms.py 15
-rw-r--r-- src/text_recognizer/line_predictor.py 4
-rw-r--r-- src/text_recognizer/models/__init__.py 5
-rw-r--r-- src/text_recognizer/networks/__init__.py 5
-rw-r--r-- src/text_recognizer/networks/crnn.py 12
-rw-r--r-- src/text_recognizer/networks/metrics.py (renamed from src/text_recognizer/models/metrics.py) 0
6 files changed, 28 insertions, 13 deletions
diff --git a/src/text_recognizer/datasets/transforms.py b/src/text_recognizer/datasets/transforms.py
index 1105f23..1ec23dc 100644
--- a/src/text_recognizer/datasets/transforms.py
+++ b/src/text_recognizer/datasets/transforms.py
@@ -4,7 +4,7 @@ from PIL import Image
import torch
from torch import Tensor
import torch.nn.functional as F
-from torchvision.transforms import Compose, ToPILImage, ToTensor
+from torchvision.transforms import Compose, RandomAffine, ToTensor
from text_recognizer.datasets.util import EmnistMapper
@@ -64,3 +64,16 @@ class AddTokens:
target = torch.cat([sos, target], dim=0)
return target
+
+
+class ApplyContrast:
+ """Zeroes every value not above a randomly drawn threshold, i.e. increases contrast."""
+
+ def __init__(self, low: float = 0.0, high: float = 0.25) -> None:
+ self.low = low
+ self.high = high
+
+ def __call__(self, x: Tensor) -> Tensor:
+ """Draw a threshold uniformly from [low, high) and zero the entries of x that do not exceed it."""
+ mask = x > np.random.RandomState().uniform(low=self.low, high=self.high)
+ return x * mask
diff --git a/src/text_recognizer/line_predictor.py b/src/text_recognizer/line_predictor.py
index 981e2c9..8e348fe 100644
--- a/src/text_recognizer/line_predictor.py
+++ b/src/text_recognizer/line_predictor.py
@@ -6,7 +6,7 @@ import numpy as np
from torch import nn
from text_recognizer import datasets, networks
-from text_recognizer.models import VisionTransformerModel
+from text_recognizer.models import TransformerModel
from text_recognizer.util import read_image
@@ -16,7 +16,7 @@ class LinePredictor:
def __init__(self, dataset: str, network_fn: str) -> None:
network_fn = getattr(networks, network_fn)
dataset = getattr(datasets, dataset)
- self.model = VisionTransformerModel(network_fn=network_fn, dataset=dataset)
+ self.model = TransformerModel(network_fn=network_fn, dataset=dataset)
self.model.eval()
def predict(self, image_or_filename: Union[np.ndarray, str]) -> Tuple[str, float]:
diff --git a/src/text_recognizer/models/__init__.py b/src/text_recognizer/models/__init__.py
index 53340f1..bf89404 100644
--- a/src/text_recognizer/models/__init__.py
+++ b/src/text_recognizer/models/__init__.py
@@ -2,16 +2,11 @@
from .base import Model
from .character_model import CharacterModel
from .crnn_model import CRNNModel
-from .metrics import accuracy, accuracy_ignore_pad, cer, wer
from .transformer_model import TransformerModel
__all__ = [
- "accuracy",
- "accuracy_ignore_pad",
- "cer",
"CharacterModel",
"CRNNModel",
"Model",
"TransformerModel",
- "wer",
]
diff --git a/src/text_recognizer/networks/__init__.py b/src/text_recognizer/networks/__init__.py
index 67e245c..1635039 100644
--- a/src/text_recognizer/networks/__init__.py
+++ b/src/text_recognizer/networks/__init__.py
@@ -4,6 +4,7 @@ from .crnn import ConvolutionalRecurrentNetwork
from .ctc import greedy_decoder
from .densenet import DenseNet
from .lenet import LeNet
+from .metrics import accuracy, accuracy_ignore_pad, cer, wer
from .mlp import MLP
from .residual_network import ResidualNetwork, ResidualNetworkEncoder
from .transformer import Transformer
@@ -11,6 +12,9 @@ from .util import sliding_window
from .wide_resnet import WideResidualNetwork
__all__ = [
+ "accuracy",
+ "accuracy_ignore_pad",
+ "cer",
"CNNTransformer",
"ConvolutionalRecurrentNetwork",
"DenseNet",
@@ -21,5 +25,6 @@ __all__ = [
"ResidualNetworkEncoder",
"sliding_window",
"Transformer",
+ "wer",
"WideResidualNetwork",
]
diff --git a/src/text_recognizer/networks/crnn.py b/src/text_recognizer/networks/crnn.py
index 9747429..778e232 100644
--- a/src/text_recognizer/networks/crnn.py
+++ b/src/text_recognizer/networks/crnn.py
@@ -1,4 +1,4 @@
-"""LSTM with CTC for handwritten text recognition within a line."""
+"""CRNN for handwritten text recognition."""
from typing import Dict, Tuple
from einops import rearrange, reduce
@@ -89,20 +89,22 @@ class ConvolutionalRecurrentNetwork(nn.Module):
x = self.backbone(x)
- # Avgerage pooling.
+ # Average pooling.
if self.avg_pool:
x = reduce(x, "(b t) c h w -> t b c", "mean", b=b, t=t)
else:
x = rearrange(x, "(b t) h -> t b h", b=b, t=t)
else:
# Encode the entire image with a CNN, and use the channels as temporal dimension.
- b = x.shape[0]
x = self.backbone(x)
- x = rearrange(x, "b c h w -> c b (h w)", b=b)
+ x = rearrange(x, "b c h w -> b w c h")
+ if self.adaptive_pool is not None:
+ x = self.adaptive_pool(x)
+ x = x.squeeze(3)
# Sequence predictions.
x, _ = self.rnn(x)
- # Sequence to classifcation layer.
+ # Sequence to classification layer.
x = self.decoder(x)
return x
diff --git a/src/text_recognizer/models/metrics.py b/src/text_recognizer/networks/metrics.py
index af9adb5..af9adb5 100644
--- a/src/text_recognizer/models/metrics.py
+++ b/src/text_recognizer/networks/metrics.py