From c7475336442b767182afa5d2c63a02313e1a9a12 Mon Sep 17 00:00:00 2001
From: Gustaf Rydholm <gustaf.rydholm@gmail.com>
Date: Sun, 24 Oct 2021 00:56:16 +0200
Subject: Fix opening of files for transitions

---
 text_recognizer/data/utils/build_transitions.py | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

(limited to 'text_recognizer')

diff --git a/text_recognizer/data/utils/build_transitions.py b/text_recognizer/data/utils/build_transitions.py
index 0f987ca..65485f0 100644
--- a/text_recognizer/data/utils/build_transitions.py
+++ b/text_recognizer/data/utils/build_transitions.py
@@ -165,19 +165,23 @@ def add_self_loops(pruned_ngrams: List) -> List:
 
 def parse_lines(lines: List, lexicon: Path) -> List:
     """Parses lines with a lexicon."""
-    with open(lexicon, "r") as f:
+    with lexicon.open("r") as f:
         lex = (line.strip().split() for line in f)
         lex = {line[0]: line[1:] for line in lex}
-        print(len(lex))
-    return [[t for w in line.split(WORDSEP) for t in lex[w]] for line in lines]
+    return [[t for w in line.split(WORDSEP) for t in lex[w]] for line in lines if line]
 
 
 @click.command()
 @click.option("--data_dir", type=str, default=None, help="Path to dataset root.")
 @click.option(
-    "--tokens", type=str, help="Path to token list (in order used with training)."
+    "--tokens",
+    type=str,
+    default="iamdb_1kwp_tokens_1000.txt",
+    help="Path to token list (in order used with training).",
+)
+@click.option(
+    "--lexicon", type=str, default="iamdb_1kwp_lex_1000.txt", help="Path to lexicon"
 )
-@click.option("--lexicon", type=str, default=None, help="Path to lexicon")
 @click.option(
     "--prune",
     nargs=2,
@@ -210,7 +214,7 @@ def cli(
 
     if data_dir is None:
         data_dir = (
-            Path(__file__).resolve().parents[2] / "data" / "processed" / "iam_lines"
+            Path(__file__).resolve().parents[3] / "data" / "processed" / "iam_lines"
         )
         logger.debug(f"Using data dir: {data_dir}")
         if not data_dir.exists():
@@ -219,10 +223,10 @@ def cli(
         data_dir = Path(data_dir)
 
     # Build table of counts and the back-off if below threshold.
-    with open(data_dir / "train.txt", "r") as f:
+    with (data_dir / "train.txt").open("r") as f:
         lines = [line.strip() for line in f]
 
-    with open(data_dir / tokens, "r") as f:
+    with (data_dir / tokens).open("r") as f:
         tokens = [line.strip() for line in f]
 
     if lexicon is not None:
-- 
cgit v1.2.3-70-g09d2