1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
|
"""Mapping to and from word pieces."""
from abc import ABC, abstractmethod
from pathlib import Path
from typing import List, Optional, Union, Sequence
from loguru import logger
import torch
from torch import Tensor
from text_recognizer.data.emnist import emnist_mapping
from text_recognizer.data.iam_preprocessor import Preprocessor
class AbstractMapping(ABC):
    """Interface for objects that translate between tokens/text and indices.

    Concrete subclasses must implement all four lookups below. The abstract
    signatures accept arbitrary arguments; each subclass defines the actual
    parameters it needs.
    """

    @abstractmethod
    def get_token(self, *args, **kwargs) -> str:
        """Return a single token as a string."""

    @abstractmethod
    def get_index(self, *args, **kwargs) -> Tensor:
        """Return the index of a single token as a tensor."""

    @abstractmethod
    def get_text(self, *args, **kwargs) -> str:
        """Return the text that a collection of indices decodes to."""

    @abstractmethod
    def get_indices(self, *args, **kwargs) -> Tensor:
        """Return the indices that a piece of text encodes to as a tensor."""
class EmnistMapping(AbstractMapping):
    """Character-level mapping between EMNIST tokens and their indices.

    The lookup tables are produced by ``emnist_mapping``:

    * ``mapping`` is looked up by integer index and yields a token.
    * ``inverse_mapping`` is looked up by token and yields an index.
    * ``input_size`` is stored as returned by ``emnist_mapping``.
    """

    def __init__(self, extra_symbols: Optional[Sequence[str]] = None) -> None:
        """Build the lookup tables.

        Args:
            extra_symbols: Optional additional symbols, forwarded unchanged to
                ``emnist_mapping``.
        """
        self.mapping, self.inverse_mapping, self.input_size = emnist_mapping(
            extra_symbols
        )

    def get_token(self, index: Union[int, Tensor]) -> str:
        """Return the token stored at ``index``.

        Args:
            index: An int, or a tensor convertible with ``int()``.

        Raises:
            KeyError: If ``index`` is not present in the mapping.
        """
        if (index := int(index)) in self.mapping:
            return self.mapping[index]
        raise KeyError(f"Index ({index}) not in mapping.")

    def get_index(self, token: str) -> Tensor:
        """Return the index of ``token`` as a one-element integer tensor.

        Raises:
            KeyError: If ``token`` is not present in the inverse mapping.
        """
        if token in self.inverse_mapping:
            # Wrap the index in a list: ``Tensor(n)`` with a bare int would
            # allocate an uninitialised tensor of *length* n rather than a
            # tensor holding the value n.
            return torch.LongTensor([self.inverse_mapping[token]])
        raise KeyError(f"Token ({token}) not found in inverse mapping.")

    def get_text(self, indices: Union[List[int], Tensor]) -> str:
        """Decode a sequence of indices into the concatenated token string.

        Raises:
            KeyError: If any index is missing from the mapping.
        """
        if isinstance(indices, Tensor):
            indices = indices.tolist()
        return "".join([self.mapping[index] for index in indices])

    def get_indices(self, text: str) -> Tensor:
        """Encode ``text`` character by character into a tensor of indices.

        Raises:
            KeyError: If any character is missing from the inverse mapping.
        """
        # NOTE(review): ``Tensor([...])`` yields the default (float) dtype,
        # unlike ``get_index`` which is integer-typed - confirm what callers
        # expect before changing it.
        return Tensor([self.inverse_mapping[token] for token in text])
class WordPieceMapping(EmnistMapping):
    """Mapping between word pieces and indices, backed by a ``Preprocessor``.

    The character-level EMNIST tables from the parent class are kept (they are
    used by ``emnist_to_wordpiece_indices``), while the token/index lookups
    are overridden to go through ``self.wordpiece_processor``.
    """

    def __init__(
        self,
        num_features: int = 1000,
        tokens: str = "iamdb_1kwp_tokens_1000.txt",
        lexicon: str = "iamdb_1kwp_lex_1000.txt",
        data_dir: Optional[Union[str, Path]] = None,
        use_words: bool = False,
        prepend_wordsep: bool = False,
        special_tokens: Sequence[str] = ("<s>", "<e>", "<p>"),
        extra_symbols: Optional[Sequence[str]] = ("\n",),
    ) -> None:
        """Create the EMNIST tables and the word piece processor.

        Args:
            num_features: Forwarded to ``Preprocessor``.
            tokens: File name of the tokens file, resolved inside the
                ``data/processed/iam_lines`` directory.
            lexicon: File name of the lexicon file, resolved inside the same
                directory as ``tokens``.
            data_dir: Directory of the iamdb data. Defaults to
                ``data/downloaded/iam/iamdb`` when None.
            use_words: Forwarded to ``Preprocessor``.
            prepend_wordsep: Forwarded to ``Preprocessor``.
            special_tokens: Special tokens handed to ``Preprocessor``.
            extra_symbols: Extra symbols for the EMNIST mapping; when not
                None they are also appended to ``special_tokens``.

        Raises:
            RuntimeError: If the resolved data directory does not exist.
        """
        super().__init__(extra_symbols)
        self.wordpiece_processor = self._configure_wordpiece_processor(
            num_features,
            tokens,
            lexicon,
            data_dir,
            use_words,
            prepend_wordsep,
            special_tokens,
            extra_symbols,
        )

    @staticmethod
    def _configure_wordpiece_processor(
        num_features: int,
        tokens: str,
        lexicon: str,
        data_dir: Optional[Union[str, Path]],
        use_words: bool,
        prepend_wordsep: bool,
        special_tokens: Optional[Sequence[str]],
        extra_symbols: Optional[Sequence[str]],
    ) -> Preprocessor:
        """Resolve the data paths and construct the ``Preprocessor``.

        Raises:
            RuntimeError: If the resolved data directory does not exist.
        """
        # Both default locations are relative to the directory two levels
        # above the one containing this file.
        base_dir = Path(__file__).resolve().parents[2]

        data_dir = (
            base_dir / "data" / "downloaded" / "iam" / "iamdb"
            if data_dir is None
            else Path(data_dir)
        )
        logger.debug(f"Using data dir: {data_dir}")
        if not data_dir.exists():
            raise RuntimeError(f"Could not locate iamdb directory at {data_dir}")

        processed_path = base_dir / "data" / "processed" / "iam_lines"
        tokens_path = processed_path / tokens
        lexicon_path = processed_path / lexicon

        if extra_symbols is not None:
            # Build a fresh tuple instead of ``+=``: this tolerates
            # ``special_tokens`` being None, works for any mix of list/tuple
            # inputs, and never mutates a list owned by the caller.
            special_tokens = (*(special_tokens or ()), *extra_symbols)

        return Preprocessor(
            data_dir,
            num_features,
            tokens_path,
            lexicon_path,
            use_words,
            prepend_wordsep,
            special_tokens,
        )

    def __len__(self) -> int:
        """Return the number of tokens held by the word piece processor."""
        return len(self.wordpiece_processor.tokens)

    def get_token(self, index: Union[int, Tensor]) -> str:
        """Return the word piece token stored at ``index``.

        Args:
            index: An int, or a tensor convertible with ``int()``.

        Raises:
            KeyError: If ``index`` is negative or not smaller than the number
                of tokens.
        """
        index = int(index)
        # Bound the index by the sequence that is actually indexed, so an
        # out-of-range value raises the documented KeyError (not IndexError)
        # and negative values do not silently wrap around.
        if 0 <= index < len(self.wordpiece_processor.tokens):
            return self.wordpiece_processor.tokens[index]
        raise KeyError(f"Index ({index}) not in mapping.")

    def get_index(self, token: str) -> Tensor:
        """Return the index of ``token`` as a one-element integer tensor.

        Raises:
            KeyError: If ``token`` is not among the processor's tokens.
        """
        if token in self.wordpiece_processor.tokens:
            return torch.LongTensor([self.wordpiece_processor.tokens_to_index[token]])
        raise KeyError(f"Token ({token}) not found in inverse mapping.")

    def get_text(self, indices: Union[List[int], Tensor]) -> str:
        """Decode indices to text via the processor's ``to_text``."""
        if isinstance(indices, Tensor):
            indices = indices.tolist()
        return self.wordpiece_processor.to_text(indices)

    def get_indices(self, text: str) -> Tensor:
        """Encode ``text`` via the processor's ``to_index``.

        The result of ``to_index`` is returned as is.
        """
        return self.wordpiece_processor.to_index(text)

    def emnist_to_wordpiece_indices(self, x: Tensor) -> Tensor:
        """Convert EMNIST character indices into word piece indices.

        The indices are decoded with the EMNIST ``mapping``, the text is
        lower-cased and spaces are replaced by "▁" before it is re-encoded
        with the word piece processor.

        Args:
            x: Sequence of EMNIST indices (assumed one-dimensional).

        Returns:
            An integer tensor with the word piece indices.
        """
        # Tensor elements hash by identity and therefore never match the
        # keys of ``mapping``; convert to plain Python ints first.
        emnist_indices = x.tolist() if isinstance(x, Tensor) else x
        text = "".join([self.mapping[int(i)] for i in emnist_indices])
        text = text.lower().replace(" ", "▁")
        return torch.LongTensor(self.wordpiece_processor.to_index(text))

    def __getitem__(self, x: Union[str, int, Tensor]) -> Union[str, Tensor]:
        """Look up an index for a string key, or a token for any other key."""
        if isinstance(x, str):
            return self.get_index(x)
        return self.get_token(x)
|