summaryrefslogtreecommitdiff
path: root/text_recognizer/data/utils
diff options
context:
space:
mode:
authorGustaf Rydholm <gustaf.rydholm@gmail.com>2021-10-25 22:31:36 +0200
committerGustaf Rydholm <gustaf.rydholm@gmail.com>2021-10-25 22:31:36 +0200
commitda29c1cf4d062087f1b29dc9402ee6384203b690 (patch)
treea19b035990864a0be3d23faf3cf21382d60c5eda /text_recognizer/data/utils
parent00dd3df9f2e29622248668662cb40ff0c8889145 (diff)
Make data dir static in preprocessor and word piece
Diffstat (limited to 'text_recognizer/data/utils')
-rw-r--r--text_recognizer/data/utils/iam_preprocessor.py28
1 file changed, 20 insertions, 8 deletions
diff --git a/text_recognizer/data/utils/iam_preprocessor.py b/text_recognizer/data/utils/iam_preprocessor.py
index 60ecff1..4f95007 100644
--- a/text_recognizer/data/utils/iam_preprocessor.py
+++ b/text_recognizer/data/utils/iam_preprocessor.py
@@ -47,19 +47,28 @@ class Preprocessor:
def __init__(
self,
- data_dir: Union[str, Path],
num_features: int,
- tokens_path: Optional[Union[str, Path]] = None,
- lexicon_path: Optional[Union[str, Path]] = None,
+ tokens: Optional[str] = None,
+ lexicon: Optional[str] = None,
use_words: bool = False,
prepend_wordsep: bool = False,
special_tokens: Optional[Set[str]] = None,
) -> None:
+ self.data_dir = (
+ Path(__file__).resolve().parents[3]
+ / "data"
+ / "downloaded"
+ / "iam"
+ / "iamdb"
+ )
+ log.debug(f"Using data dir: {self.data_dir}")
+ if not self.data_dir.exists():
+ raise RuntimeError(f"Could not locate iamdb directory at {self.data_dir}")
+
self.wordsep = "▁"
self._use_word = use_words
self._prepend_wordsep = prepend_wordsep
self.special_tokens = special_tokens if special_tokens is not None else None
- self.data_dir = Path(data_dir)
self.forms = load_metadata(self.data_dir, self.wordsep, use_words=use_words)
# Load the set of graphemes:
@@ -70,14 +79,17 @@ class Preprocessor:
self.graphemes = sorted(graphemes)
# Build the token-to-index and index-to-token maps.
- if tokens_path is not None:
- with open(tokens_path, "r") as f:
+ processed_path = (
+ Path(__file__).resolve().parents[3] / "data" / "processed" / "iam_lines"
+ )
+ if tokens is not None:
+ with open(processed_path / tokens, "r") as f:
self.tokens = [line.strip() for line in f]
else:
self.tokens = self.graphemes
- if lexicon_path is not None:
- with open(lexicon_path, "r") as f:
+ if lexicon is not None:
+ with open(processed_path / lexicon, "r") as f:
lexicon = (line.strip().split() for line in f)
lexicon = {line[0]: line[1:] for line in lexicon}
self.lexicon = lexicon