From 27ff7d113108e9cc51ddc5ff13b648b9c75fa865 Mon Sep 17 00:00:00 2001
From: Gustaf Rydholm
Date: Tue, 27 Sep 2022 00:08:04 +0200
Subject: Add metadata

---
NOTE(review): the four special tokens at the head of MAPPING in
text_recognizer/metadata/emnist.py were lost when this page was extracted
(angle-bracket tokens were stripped as if they were HTML tags, leaving three
empty strings and a broken string literal). They are restored below as
"<b>", "<s>", "<e>", "<p>". The fourth is implied by the paragraph break it
left behind; the identity and order of the first three are an assumption --
confirm against the repository before relying on them.

 text_recognizer/metadata/__init__.py               |   1 +
 text_recognizer/metadata/emnist.py                 | 106 +++++++++++++++++++++
 text_recognizer/metadata/emnist_lines.py           |  21 ++++
 text_recognizer/metadata/iam.py                    |   9 ++
 text_recognizer/metadata/iam_lines.py              |  15 +++
 text_recognizer/metadata/iam_paragraphs.py         |  18 ++++
 .../metadata/iam_synthetic_paragraphs.py           |   6 ++
 text_recognizer/metadata/shared.py                 |   4 +
 8 files changed, 180 insertions(+)
 create mode 100644 text_recognizer/metadata/__init__.py
 create mode 100644 text_recognizer/metadata/emnist.py
 create mode 100644 text_recognizer/metadata/emnist_lines.py
 create mode 100644 text_recognizer/metadata/iam.py
 create mode 100644 text_recognizer/metadata/iam_lines.py
 create mode 100644 text_recognizer/metadata/iam_paragraphs.py
 create mode 100644 text_recognizer/metadata/iam_synthetic_paragraphs.py
 create mode 100644 text_recognizer/metadata/shared.py

(limited to 'text_recognizer/metadata')

diff --git a/text_recognizer/metadata/__init__.py b/text_recognizer/metadata/__init__.py
new file mode 100644
index 0000000..6ca296a
--- /dev/null
+++ b/text_recognizer/metadata/__init__.py
@@ -0,0 +1 @@
+"""Metadata for datasets."""
diff --git a/text_recognizer/metadata/emnist.py b/text_recognizer/metadata/emnist.py
new file mode 100644
index 0000000..23ddcc7
--- /dev/null
+++ b/text_recognizer/metadata/emnist.py
@@ -0,0 +1,106 @@
+from pathlib import Path
+
+import text_recognizer.metadata.shared as shared
+
+RAW_DATA_DIRNAME = shared.DATA_DIRNAME / "raw" / "emnist"
+METADATA_FILENAME = RAW_DATA_DIRNAME / "metadata.toml"
+DL_DATA_DIRNAME = shared.DATA_DIRNAME / "downloaded" / "emnist"
+PROCESSED_DATA_DIRNAME = shared.DATA_DIRNAME / "processed" / "emnist"
+PROCESSED_DATA_FILENAME = PROCESSED_DATA_DIRNAME / "byclass.h5"
+ESSENTIALS_FILENAME = (
+    Path(__file__).parents[1].resolve() / "data" / "emnist_essentials.json"
+)
+
+SEED = 4711
+NUM_SPECIAL_TOKENS = 4
+SAMPLE_TO_BALANCE = True
+
+INPUT_SHAPE = (28, 28)
+DIMS = (1, *INPUT_SHAPE)  # Extra dimension added by ToTensor()
+OUTPUT_DIMS = (1,)
+
+MAPPING = [
+    "<b>",
+    "<s>",
+    "<e>",
+    "<p>",
+    "0",
+    "1",
+    "2",
+    "3",
+    "4",
+    "5",
+    "6",
+    "7",
+    "8",
+    "9",
+    "A",
+    "B",
+    "C",
+    "D",
+    "E",
+    "F",
+    "G",
+    "H",
+    "I",
+    "J",
+    "K",
+    "L",
+    "M",
+    "N",
+    "O",
+    "P",
+    "Q",
+    "R",
+    "S",
+    "T",
+    "U",
+    "V",
+    "W",
+    "X",
+    "Y",
+    "Z",
+    "a",
+    "b",
+    "c",
+    "d",
+    "e",
+    "f",
+    "g",
+    "h",
+    "i",
+    "j",
+    "k",
+    "l",
+    "m",
+    "n",
+    "o",
+    "p",
+    "q",
+    "r",
+    "s",
+    "t",
+    "u",
+    "v",
+    "w",
+    "x",
+    "y",
+    "z",
+    " ",
+    "!",
+    '"',
+    "#",
+    "&",
+    "'",
+    "(",
+    ")",
+    "*",
+    "+",
+    ",",
+    "-",
+    ".",
+    "/",
+    ":",
+    ";",
+    "?",
+]
diff --git a/text_recognizer/metadata/emnist_lines.py b/text_recognizer/metadata/emnist_lines.py
new file mode 100644
index 0000000..5a329bb
--- /dev/null
+++ b/text_recognizer/metadata/emnist_lines.py
@@ -0,0 +1,21 @@
+from pathlib import Path
+
+import text_recognizer.metadata.emnist as emnist
+import text_recognizer.metadata.shared as shared
+
+PROCESSED_DATA_DIRNAME = shared.DATA_DIRNAME / "processed" / "emnist_lines"
+ESSENTIALS_FILENAME = (
+    Path(__file__).parents[1].resolve()
+    / "data"
+    / "mappings"
+    / "emnist_lines_essentials.json"
+)
+
+CHAR_HEIGHT, CHAR_WIDTH = emnist.DIMS[1:3]
+DIMS = (
+    emnist.DIMS[0],
+    CHAR_HEIGHT,
+    None,
+)  # width variable, depends on maximum sequence length
+
+MAPPING = emnist.MAPPING
diff --git a/text_recognizer/metadata/iam.py b/text_recognizer/metadata/iam.py
new file mode 100644
index 0000000..6995f83
--- /dev/null
+++ b/text_recognizer/metadata/iam.py
@@ -0,0 +1,9 @@
+import text_recognizer.metadata.shared as shared
+
+RAW_DATA_DIRNAME = shared.DATA_DIRNAME / "raw" / "iam"
+METADATA_FILENAME = RAW_DATA_DIRNAME / "metadata.toml"
+DL_DATA_DIRNAME = shared.DATA_DIRNAME / "downloaded" / "iam"
+EXTRACTED_DATASET_DIRNAME = DL_DATA_DIRNAME / "iamdb"
+
+DOWNSAMPLE_FACTOR = 2  # if images were downsampled, the regions must also be
+LINE_REGION_PADDING = 8  # add this many pixels around the exact coordinates
diff --git a/text_recognizer/metadata/iam_lines.py
b/text_recognizer/metadata/iam_lines.py
new file mode 100644
index 0000000..73b2a28
--- /dev/null
+++ b/text_recognizer/metadata/iam_lines.py
@@ -0,0 +1,15 @@
+import text_recognizer.metadata.emnist as emnist
+import text_recognizer.metadata.shared as shared
+
+PROCESSED_DATA_DIRNAME = shared.DATA_DIRNAME / "processed" / "iam_lines"
+
+IMAGE_SCALE_FACTOR = 2
+
+CHAR_WIDTH = emnist.INPUT_SHAPE[0] // IMAGE_SCALE_FACTOR  # rough estimate
+IMAGE_HEIGHT = 112 // IMAGE_SCALE_FACTOR
+IMAGE_WIDTH = 3072 // IMAGE_SCALE_FACTOR  # rounding up IAMLines empirical maximum width
+
+DIMS = (1, IMAGE_HEIGHT, IMAGE_WIDTH)
+OUTPUT_DIMS = (89, 1)
+
+MAPPING = emnist.MAPPING
diff --git a/text_recognizer/metadata/iam_paragraphs.py b/text_recognizer/metadata/iam_paragraphs.py
new file mode 100644
index 0000000..70b802b
--- /dev/null
+++ b/text_recognizer/metadata/iam_paragraphs.py
@@ -0,0 +1,18 @@
+import text_recognizer.metadata.emnist as emnist
+import text_recognizer.metadata.shared as shared
+
+
+PROCESSED_DATA_DIRNAME = shared.DATA_DIRNAME / "processed" / "iam_paragraphs"
+
+NEW_LINE_TOKEN = "\n"
+MAPPING = [*emnist.MAPPING, NEW_LINE_TOKEN]
+
+# must match IMAGE_SCALE_FACTOR for IAMLines to be compatible with synthetic paragraphs
+IMAGE_SCALE_FACTOR = 2
+IMAGE_HEIGHT, IMAGE_WIDTH = 576, 640
+IMAGE_SHAPE = (IMAGE_HEIGHT, IMAGE_WIDTH)
+
+MAX_LABEL_LENGTH = 682
+
+DIMS = (1, IMAGE_HEIGHT, IMAGE_WIDTH)
+OUTPUT_DIMS = (MAX_LABEL_LENGTH, 1)
diff --git a/text_recognizer/metadata/iam_synthetic_paragraphs.py b/text_recognizer/metadata/iam_synthetic_paragraphs.py
new file mode 100644
index 0000000..15bcfc8
--- /dev/null
+++ b/text_recognizer/metadata/iam_synthetic_paragraphs.py
@@ -0,0 +1,6 @@
+import text_recognizer.metadata.iam_paragraphs as iam_paragraphs
+import text_recognizer.metadata.shared as shared
+
+NEW_LINE_TOKEN = iam_paragraphs.NEW_LINE_TOKEN
+
+PROCESSED_DATA_DIRNAME = shared.DATA_DIRNAME / "processed" / "iam_synthetic_paragraphs"
diff --git a/text_recognizer/metadata/shared.py
b/text_recognizer/metadata/shared.py
new file mode 100644
index 0000000..a4d1da0
--- /dev/null
+++ b/text_recognizer/metadata/shared.py
@@ -0,0 +1,4 @@
+from pathlib import Path
+
+DATA_DIRNAME = Path(__file__).resolve().parents[2] / "data"
+DOWNLOADED_DATA_DIRNAME = DATA_DIRNAME / "downloaded"  # dir used by emnist/iam
--
cgit v1.2.3-70-g09d2