diff options
Diffstat (limited to 'text_recognizer')
-rw-r--r-- | text_recognizer/data/utils/build_transitions.py | 20 |
1 file changed, 12 insertions, 8 deletions
diff --git a/text_recognizer/data/utils/build_transitions.py b/text_recognizer/data/utils/build_transitions.py
index 0f987ca..65485f0 100644
--- a/text_recognizer/data/utils/build_transitions.py
+++ b/text_recognizer/data/utils/build_transitions.py
@@ -165,19 +165,23 @@ def add_self_loops(pruned_ngrams: List) -> List:

 def parse_lines(lines: List, lexicon: Path) -> List:
     """Parses lines with a lexicon."""
-    with open(lexicon, "r") as f:
+    with (lexicon).open("r") as f:
         lex = (line.strip().split() for line in f)
         lex = {line[0]: line[1:] for line in lex}
-        print(len(lex))
-    return [[t for w in line.split(WORDSEP) for t in lex[w]] for line in lines]
+    return [[t for w in line.split(WORDSEP) for t in lex[w]] for line in lines if line]


 @click.command()
 @click.option("--data_dir", type=str, default=None, help="Path to dataset root.")
 @click.option(
-    "--tokens", type=str, help="Path to token list (in order used with training)."
+    "--tokens",
+    type=str,
+    default="iamdb_1kwp_tokens_1000.txt",
+    help="Path to token list (in order used with training).",
+)
+@click.option(
+    "--lexicon", type=str, default="iamdb_1kwp_lex_1000.txt", help="Path to lexicon"
 )
-@click.option("--lexicon", type=str, default=None, help="Path to lexicon")
 @click.option(
     "--prune",
     nargs=2,
@@ -210,7 +214,7 @@ def cli(

     if data_dir is None:
         data_dir = (
-            Path(__file__).resolve().parents[2] / "data" / "processed" / "iam_lines"
+            Path(__file__).resolve().parents[3] / "data" / "processed" / "iam_lines"
         )
         logger.debug(f"Using data dir: {data_dir}")
         if not data_dir.exists():
@@ -219,10 +223,10 @@ def cli(
         data_dir = Path(data_dir)

     # Build table of counts and the back-off if below threshold.
-    with open(data_dir / "train.txt", "r") as f:
+    with (data_dir / "train.txt").open("r") as f:
         lines = [line.strip() for line in f]

-    with open(data_dir / tokens, "r") as f:
+    with (data_dir / tokens).open("r") as f:
         tokens = [line.strip() for line in f]

     if lexicon is not None: