summary | refs | log | tree | commit | diff
path: root/text_recognizer/data
diff options
context:
space:
mode:
Diffstat (limited to 'text_recognizer/data')
-rw-r--r-- text_recognizer/data/emnist_mapping.py 14
-rw-r--r-- text_recognizer/data/transforms.py 3
2 files changed, 13 insertions, 4 deletions
diff --git a/text_recognizer/data/emnist_mapping.py b/text_recognizer/data/emnist_mapping.py
index 925d214..3e91594 100644
--- a/text_recognizer/data/emnist_mapping.py
+++ b/text_recognizer/data/emnist_mapping.py
@@ -9,15 +9,23 @@ from text_recognizer.data.emnist import emnist_mapping
class EmnistMapping(AbstractMapping):
- def __init__(self, extra_symbols: Optional[Set[str]] = None) -> None:
+ def __init__(self, extra_symbols: Optional[Set[str]] = None, lower: bool = True) -> None:
self.extra_symbols = set(extra_symbols) if extra_symbols is not None else None
self.mapping, self.inverse_mapping, self.input_size = emnist_mapping(
self.extra_symbols
)
+ if lower:
+ self._to_lower()
super().__init__(self.input_size, self.mapping, self.inverse_mapping)
- def __attrs_post_init__(self) -> None:
- """Post init configuration."""
+ def _to_lower(self) -> None:
+ """Converts mapping to lowercase letters only."""
+ def _filter(x: int) -> int:
+ if 40 <= x:
+ return x - 26
+ return x
+ self.inverse_mapping = {v: _filter(k) for k, v in enumerate(self.mapping)}
+ self.mapping = [c for c in self.mapping if not c.isupper()]
def get_token(self, index: Union[int, Tensor]) -> str:
if (index := int(index)) <= len(self.mapping):
diff --git a/text_recognizer/data/transforms.py b/text_recognizer/data/transforms.py
index 047496f..51f52de 100644
--- a/text_recognizer/data/transforms.py
+++ b/text_recognizer/data/transforms.py
@@ -1,10 +1,11 @@
"""Transforms for PyTorch datasets."""
from pathlib import Path
-from typing import Optional, Union, Set
+from typing import Optional, Union, Type, Set
import torch
from torch import Tensor
+from text_recognizer.data.base_mapping import AbstractMapping
from text_recognizer.data.word_piece_mapping import WordPieceMapping