diff options
Diffstat (limited to 'text_recognizer/data/utils')
-rw-r--r-- | text_recognizer/data/utils/iam_preprocessor.py | 28 |
1 files changed, 20 insertions, 8 deletions
diff --git a/text_recognizer/data/utils/iam_preprocessor.py b/text_recognizer/data/utils/iam_preprocessor.py
index 60ecff1..4f95007 100644
--- a/text_recognizer/data/utils/iam_preprocessor.py
+++ b/text_recognizer/data/utils/iam_preprocessor.py
@@ -47,19 +47,28 @@ class Preprocessor:
     def __init__(
         self,
-        data_dir: Union[str, Path],
         num_features: int,
-        tokens_path: Optional[Union[str, Path]] = None,
-        lexicon_path: Optional[Union[str, Path]] = None,
+        tokens: Optional[str] = None,
+        lexicon: Optional[str] = None,
         use_words: bool = False,
         prepend_wordsep: bool = False,
         special_tokens: Optional[Set[str]] = None,
     ) -> None:
+        self.data_dir = (
+            Path(__file__).resolve().parents[3]
+            / "data"
+            / "downloaded"
+            / "iam"
+            / "iamdb"
+        )
+        log.debug(f"Using data dir: {self.data_dir}")
+        if not self.data_dir.exists():
+            raise RuntimeError(f"Could not locate iamdb directory at {self.data_dir}")
+
         self.wordsep = "▁"
         self._use_word = use_words
         self._prepend_wordsep = prepend_wordsep
         self.special_tokens = special_tokens if special_tokens is not None else None
 
-        self.data_dir = Path(data_dir)
         self.forms = load_metadata(self.data_dir, self.wordsep, use_words=use_words)
 
         # Load the set of graphemes:
@@ -70,14 +79,17 @@ class Preprocessor:
         self.graphemes = sorted(graphemes)
 
         # Build the token-to-index and index-to-token maps.
-        if tokens_path is not None:
-            with open(tokens_path, "r") as f:
+        processed_path = (
+            Path(__file__).resolve().parents[3] / "data" / "processed" / "iam_lines"
+        )
+        if tokens is not None:
+            with open(processed_path / tokens, "r") as f:
                 self.tokens = [line.strip() for line in f]
         else:
             self.tokens = self.graphemes
 
-        if lexicon_path is not None:
-            with open(lexicon_path, "r") as f:
+        if lexicon is not None:
+            with open(processed_path / lexicon, "r") as f:
                 lexicon = (line.strip().split() for line in f)
                 lexicon = {line[0]: line[1:] for line in lexicon}
                 self.lexicon = lexicon