author | Gustaf Rydholm <gustaf.rydholm@gmail.com> | 2021-03-29 21:39:58 +0200
committer | Gustaf Rydholm <gustaf.rydholm@gmail.com> | 2021-03-29 21:39:58 +0200
commit | d21594211e29c40c135b753e33b248b0737cd76f (patch)
tree | ea37e2701fdc45b27815d8831e6b60ff6888e168
parent | 46a1472d33d3a4180798492e819f2ec02bc3b1a3 (diff)
Refactor word piece scripts
-rw-r--r-- | README.md | 15
-rw-r--r-- | pyproject.toml | 3
-rw-r--r-- | text_recognizer/data/build_transitions.py (renamed from tasks/build_transitions.py) | 0
-rw-r--r-- | text_recognizer/data/iam_preprocessor.py | 2
-rw-r--r-- | text_recognizer/data/make_wordpieces.py (renamed from tasks/make_wordpieces.py) | 4
5 files changed, 21 insertions, 3 deletions
@@ -7,8 +7,23 @@ Implementing the text recognizer project from the course ["Full Stack Deep Learn
 
 TBC
 
+### Build word piece dataset
+Extract text from the iam dataset:
+```
+poetry run extract-iam-text --use_words --save_text train.txt --save_tokens letters.txt
+```
+Create word pieces from the extracted training text:
+```
+poetry run make-wordpieces --output_prefix iamdb_1kwp --text_file train.txt --num_pieces 100
+```
+
+Optionally, build a transition graph for word pieces:
+```
+poetry run build-transitions --tokens iamdb_1kwp_tokens_1000.txt --lexicon iamdb_1kwp_lex_1000.txt --blank optional --self_loops --save_path 1kwp_prune_0_10_optblank.bin --prune 0 10
+```
+(TODO: Not working atm, needed for GTN loss function)
 
 ## Todo
 - [x] create wordpieces
diff --git a/pyproject.toml b/pyproject.toml
index ffe9c68..e791dd9 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -72,6 +72,9 @@ download-nltk = "text_recognizer.data.sentence_generator:load_nltk_brown_corpus"
 generate-emnist-lines = "text_recognizer.data.emnist_lines:generate_emnist_lines"
 download-iam = "text_recognizer.data.iam:download_iam"
 generate-iam-lines = "text_recognizer.data.iam_lines:generate_iam_lines"
+extract-iam-text = "text_recognizer.data.iam_preprocessor:cli"
+make-wordpieces = "text_recognizer.data.make_wordpieces:cli"
+build-transitions = "text_recognizer.data.build_transitions:cli"
 create-emnist-support-files = "text_recognizer.tests.support.create_emnist_support_files:create_emnist_support_files"
 create-emnist-lines-datasets = "text_recognizer.datasets.emnist_lines_dataset:create_datasets"
diff --git a/tasks/build_transitions.py b/text_recognizer/data/build_transitions.py
index 91f8c1a..91f8c1a 100644
--- a/tasks/build_transitions.py
+++ b/text_recognizer/data/build_transitions.py
diff --git a/text_recognizer/data/iam_preprocessor.py b/text_recognizer/data/iam_preprocessor.py
index 5d0fad6..a47aeed 100644
--- a/text_recognizer/data/iam_preprocessor.py
+++ b/text_recognizer/data/iam_preprocessor.py
@@ -166,7 +166,7 @@ def cli(
     """CLI for extracting text data from the iam dataset."""
     if data_dir is None:
         data_dir = (
-            Path(__file__).resolve().parents[3] / "data" / "raw" / "iam" / "iamdb"
+            Path(__file__).resolve().parents[2] / "data" / "downloaded" / "iam" / "iamdb"
         )
     logger.debug(f"Using data dir: {data_dir}")
     if not data_dir.exists():
diff --git a/tasks/make_wordpieces.py b/text_recognizer/data/make_wordpieces.py
index 2ac0e2c..e062c4c 100644
--- a/tasks/make_wordpieces.py
+++ b/text_recognizer/data/make_wordpieces.py
@@ -13,7 +13,7 @@ import click
 from loguru import logger
 import sentencepiece as spm
 
-from text_recognizer.datasets.iam_preprocessor import load_metadata
+from text_recognizer.data.iam_preprocessor import load_metadata
 
 
 def iamdb_pieces(
@@ -99,7 +99,7 @@ def cli(
     """CLI for training the sentence piece model."""
     if data_dir is None:
         data_dir = (
-            Path(__file__).resolve().parents[2] / "data" / "processed" / "iam_lines"
+            Path(__file__).resolve().parents[2] / "data" / "processed" / "iam_lines"
         )
     logger.debug(f"Using data dir: {data_dir}")
    if not data_dir.exists():
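
The three new `[tool.poetry.scripts]` entries follow Poetry's `command = "module:function"` convention, so for example `poetry run make-wordpieces` ends up calling the `cli` function in `text_recognizer/data/make_wordpieces.py`. A minimal sketch of what such a click-based entry point could look like, assuming the option names from the README commands above; the defaults, types, and help strings are illustrative, not the module's actual code:
```
# Hypothetical sketch of the click entry point behind
# make-wordpieces = "text_recognizer.data.make_wordpieces:cli"
from pathlib import Path
from typing import Optional

import click


@click.command()
@click.option("--data_dir", type=str, default=None, help="Directory with the processed IAM lines.")
@click.option("--output_prefix", type=str, default="word_pieces", help="Prefix for the generated files.")
@click.option("--text_file", type=str, required=True, help="Training text, one example per line.")
@click.option("--num_pieces", type=int, default=1000, help="Number of word pieces to learn.")
def cli(data_dir: Optional[str], output_prefix: str, text_file: str, num_pieces: int) -> None:
    """Train a sentence piece model and write the word piece files."""
    if data_dir is None:
        # Two parents up from text_recognizer/data/ is the repository root,
        # mirroring the default-path logic in the diff above.
        data_dir = Path(__file__).resolve().parents[2] / "data" / "processed" / "iam_lines"
    else:
        data_dir = Path(data_dir)
    # The actual training step would go here (see the SentencePiece sketch below).
    ...


if __name__ == "__main__":
    cli()
```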
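
The `parents[3]` → `parents[2]` change in `iam_preprocessor.py` reads most easily against the repository layout: the module lives at `text_recognizer/data/iam_preprocessor.py`, so two `parents` up is the project root, and the default data directory now points at the downloaded IAM database. A tiny illustration, with `/repo` standing in for the actual checkout path:
```
from pathlib import Path

# Pretend the module resolves to this location inside a checkout at /repo.
module = Path("/repo/text_recognizer/data/iam_preprocessor.py")

print(module.parents[0])  # /repo/text_recognizer/data
print(module.parents[1])  # /repo/text_recognizer
print(module.parents[2])  # /repo  <- the project root

data_dir = module.parents[2] / "data" / "downloaded" / "iam" / "iamdb"
print(data_dir)           # /repo/data/downloaded/iam/iamdb
```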
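
Since `make_wordpieces.py` imports `sentencepiece as spm`, the core of `poetry run make-wordpieces` presumably boils down to training a SentencePiece model on the extracted `train.txt`. A rough, self-contained sketch of that step; the file names, vocabulary size, and model type are assumptions drawn from the README commands, not the module's actual arguments:
```
# Rough sketch: learn a ~1k word piece vocabulary from the extracted IAM text.
import sentencepiece as spm

spm.SentencePieceTrainer.train(
    input="train.txt",          # produced by `poetry run extract-iam-text`
    model_prefix="iamdb_1kwp",  # writes iamdb_1kwp.model and iamdb_1kwp.vocab
    vocab_size=1000,            # "1kwp" = roughly one thousand word pieces
    model_type="unigram",       # SentencePiece's default subword algorithm
)

# Load the trained model and segment a line of text into word pieces.
sp = spm.SentencePieceProcessor(model_file="iamdb_1kwp.model")
print(sp.encode("he rose from his breakfast table", out_type=str))
```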