summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Gustaf Rydholm <gustaf.rydholm@gmail.com> 2021-03-29 21:39:58 +0200
committer Gustaf Rydholm <gustaf.rydholm@gmail.com> 2021-03-29 21:39:58 +0200
commit d21594211e29c40c135b753e33b248b0737cd76f (patch)
tree ea37e2701fdc45b27815d8831e6b60ff6888e168
parent 46a1472d33d3a4180798492e819f2ec02bc3b1a3 (diff)
Refactor word piece scripts
-rw-r--r-- README.md | 15
-rw-r--r-- pyproject.toml | 3
-rw-r--r-- text_recognizer/data/build_transitions.py (renamed from tasks/build_transitions.py) | 0
-rw-r--r-- text_recognizer/data/iam_preprocessor.py | 2
-rw-r--r-- text_recognizer/data/make_wordpieces.py (renamed from tasks/make_wordpieces.py) | 4
5 files changed, 21 insertions, 3 deletions
diff --git a/README.md b/README.md
index ac4acd8..cfe37ff 100644
--- a/README.md
+++ b/README.md
@@ -7,8 +7,23 @@ Implementing the text recognizer project from the course ["Full Stack Deep Learn
TBC
+### Build word piece dataset
+Extract text from the IAM dataset:
+```
+poetry run extract-iam-text --use_words --save_text train.txt --save_tokens letters.txt
+```
+Create word pieces from the extracted training text:
+```
+poetry run make-wordpieces --output_prefix iamdb_1kwp --text_file train.txt --num_pieces 1000
+```
+
+Optionally, build a transition graph for word pieces:
+```
+poetry run build-transitions --tokens iamdb_1kwp_tokens_1000.txt --lexicon iamdb_1kwp_lex_1000.txt --blank optional --self_loops --save_path 1kwp_prune_0_10_optblank.bin --prune 0 10
+```
+(TODO: Building the transition graph is not working at the moment; it is needed for the GTN loss function.)
## Todo
- [x] create wordpieces
diff --git a/pyproject.toml b/pyproject.toml
index ffe9c68..e791dd9 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -72,6 +72,9 @@ download-nltk = "text_recognizer.data.sentence_generator:load_nltk_brown_corpus"
generate-emnist-lines = "text_recognizer.data.emnist_lines:generate_emnist_lines"
download-iam = "text_recognizer.data.iam:download_iam"
generate-iam-lines = "text_recognizer.data.iam_lines:generate_iam_lines"
+extract-iam-text = "text_recognizer.data.iam_preprocessor:cli"
+make-wordpieces = "text_recognizer.data.make_wordpieces:cli"
+build-transitions = "text_recognizer.data.build_transitions:cli"
create-emnist-support-files = "text_recognizer.tests.support.create_emnist_support_files:create_emnist_support_files"
create-emnist-lines-datasets = "text_recognizer.datasets.emnist_lines_dataset:create_datasets"
diff --git a/tasks/build_transitions.py b/text_recognizer/data/build_transitions.py
index 91f8c1a..91f8c1a 100644
--- a/tasks/build_transitions.py
+++ b/text_recognizer/data/build_transitions.py
diff --git a/text_recognizer/data/iam_preprocessor.py b/text_recognizer/data/iam_preprocessor.py
index 5d0fad6..a47aeed 100644
--- a/text_recognizer/data/iam_preprocessor.py
+++ b/text_recognizer/data/iam_preprocessor.py
@@ -166,7 +166,7 @@ def cli(
"""CLI for extracting text data from the iam dataset."""
if data_dir is None:
data_dir = (
- Path(__file__).resolve().parents[3] / "data" / "raw" / "iam" / "iamdb"
+ Path(__file__).resolve().parents[2] / "data" / "downloaded" / "iam" / "iamdb"
)
logger.debug(f"Using data dir: {data_dir}")
if not data_dir.exists():
diff --git a/tasks/make_wordpieces.py b/text_recognizer/data/make_wordpieces.py
index 2ac0e2c..e062c4c 100644
--- a/tasks/make_wordpieces.py
+++ b/text_recognizer/data/make_wordpieces.py
@@ -13,7 +13,7 @@ import click
from loguru import logger
import sentencepiece as spm
-from text_recognizer.datasets.iam_preprocessor import load_metadata
+from text_recognizer.data.iam_preprocessor import load_metadata
def iamdb_pieces(
@@ -99,7 +99,7 @@ def cli(
"""CLI for training the sentence piece model."""
if data_dir is None:
data_dir = (
- Path(__file__).resolve().parents[2] / "data" / "processed" / "iam_lines"
+ Path(__file__).resolve().parents[2] / "data" / "processed" / "iam_lines"
)
logger.debug(f"Using data dir: {data_dir}")
if not data_dir.exists():