diff options
-rw-r--r-- | .gitignore | 30 | ||||
-rw-r--r-- | data/raw/emnist/metadata.toml | 3 | ||||
-rw-r--r-- | data/raw/iam/metadata.toml | 4 | ||||
-rw-r--r-- | text_recognizer/__init__.py | 1 | ||||
-rw-r--r-- | text_recognizer/data/emnist.py | 40 |
5 files changed, 47 insertions, 31 deletions
@@ -125,27 +125,19 @@ venv.bak/ .dmypy.json dmypy.json -# Pyre type checker -.pyre/ !data data/* -!data/EMNIST -data/EMNIST/* -!data/EMNIST/raw -data/EMNIST/raw/* -!data/EMNIST/raw/metadata.toml - -src/training/experiments/* -!src/training/experiments/sample_experiment.yml -!src/training/experiments/iam_line_ctc_experiment.yml -!src/training/experiments/line_ctc_experiment.yml -!src/training/experiments/default_config_emnist.yml -!src/training/experiments/embedding_experiment.yml -<<<<<<< HEAD -src/text_recognizer/weights/*Transformer*.pt -======= - ->>>>>>> 6cb08a110620ee09fe9d8a5d008197a801d025df +!data/raw +data/raw/* + +!data/raw/emnist +data/raw/emnist/* +!data/raw/emnist/metadata.toml + +!data/raw/iam +data/raw/iam/* +!data/raw/iam/metadata.toml + src/wandb/* !src/wandb/settings diff --git a/data/raw/emnist/metadata.toml b/data/raw/emnist/metadata.toml new file mode 100644 index 0000000..6017ede --- /dev/null +++ b/data/raw/emnist/metadata.toml @@ -0,0 +1,3 @@ +filename = 'matlab.zip' +sha256 = 'e1fa805cdeae699a52da0b77c2db17f6feb77eed125f9b45c022e7990444df95' +url = 'https://s3-us-west-2.amazonaws.com/fsdl-public-assets/matlab.zip' diff --git a/data/raw/iam/metadata.toml b/data/raw/iam/metadata.toml new file mode 100644 index 0000000..746d90b --- /dev/null +++ b/data/raw/iam/metadata.toml @@ -0,0 +1,4 @@ +url = 'https://s3-us-west-2.amazonaws.com/fsdl-public-assets/iam/iamdb.zip' +filename = 'iamdb.zip' +sha256 = 'f3c9e87a88a313e557c6d3548ed8a2a1af2dc3c4a678c5f3fc6f972ba4a50c55' +test_ids = ["m01-049","m01-060","m01-079","m01-084","m01-090","m01-095","m01-104","m01-121","m01-110","m01-131","m01-115","m01-125","m01-136","m01-149","m01-160","m02-048","m02-052","m02-055","m02-059","m02-066","m02-069","m02-072","m02-075","m02-106","m02-080","m02-083","m02-087","m02-090","m02-095","m02-102","m02-109","m02-112","m03-006","m03-013","m03-020","m03-033","m03-062","m03-095","m03-110","m03-114","m03-118","m04-000","m04-007","m04-012","m04-019","m04-024","m04-030","m04-038","m04-043","m04-061","m04-072","m04-078","m04-081","m04-093","m04-100","m04-107","m04-113","m04-123","m04-131","m04-138","m04-145","m04-152","m04-164","m04-180","m04-251","m04-190","m04-200","m04-209","m04-216","m04-222","m04-231","m04-238","m04-246","n04-000","n04-009","m06-019","n06-148","n06-156","n06-163","n06-169","n06-175","n06-182","n06-186","n06-194","n06-201","m06-031","m06-042","m06-048","m06-056","m06-067","m06-076","m06-083","m06-091","m06-098","m06-106","n01-000","n01-009","n01-004","n01-020","n01-031","n01-045","n01-036","n01-052","n01-057","n02-000","n02-016","n02-004","n02-009","n02-028","n02-049","n02-033","n02-037","n02-040","n02-045","n02-054","n02-062","n02-082","n02-098","p03-057","p03-087","p03-096","p03-103","p03-112","n02-151","n02-154","n02-157","n03-038","n03-064","n03-066","n03-079","n03-082","n03-091","n03-097","n03-103","n03-106","n03-113","n03-120","n03-126","n04-015","n04-022","n04-031","n04-039","n04-044","n04-048","n04-052","n04-060","n04-068","n04-075","n04-084","n04-092","n04-100","n04-107","n04-114","n04-130","n04-139","n04-149","n04-156","n04-163","n04-171","n04-183","n04-190","n04-195","n04-202","n04-209","n04-213","n04-218","n06-074","n06-082","n06-092","n06-100","n06-111","n06-119","n06-123","n06-128","n06-133","n06-140","p01-147","p01-155","p01-168","p01-174","p02-000","p02-008","p02-017","p02-022","p02-027","p02-069","p02-090","p02-076","p02-081","p02-101","p02-105","p02-109","p02-115","p02-121","p02-127","p02-131","p02-135","p02-139","p02-144","p02-150","p02-155","p03-004","p03-009","p03-012","p03-023","p03-027","p03-029","p03-033","p03-040","p03-047","p03-069","p03-072","p03-080","p03-121","p03-135","p03-142","p03-151","p03-158","p03-163","p03-173","p03-181","p03-185","p03-189","p06-030","p06-042","p06-047","p06-052","p06-058","p06-069","p06-088","p06-096","p06-104"] diff --git a/text_recognizer/__init__.py b/text_recognizer/__init__.py index e69de29..20c123d 100644 --- a/text_recognizer/__init__.py +++ b/text_recognizer/__init__.py @@ -0,0 +1 @@ +"""Text recognizer project.""" diff --git a/text_recognizer/data/emnist.py b/text_recognizer/data/emnist.py index 12adaab..bf3faec 100644 --- a/text_recognizer/data/emnist.py +++ b/text_recognizer/data/emnist.py @@ -1,22 +1,22 @@ """EMNIST dataset: downloads it from FSDL aws url if not present.""" -from pathlib import Path -from typing import Dict, List, Optional, Sequence, Tuple import json import os +from pathlib import Path import shutil +from typing import Dict, List, Optional, Sequence, Tuple import zipfile import h5py -import numpy as np from loguru import logger +import numpy as np import toml from torchvision import transforms -from text_recognizer.data.base_dataset import BaseDataset, split_dataset from text_recognizer.data.base_data_module import ( BaseDataModule, load_and_print_info, ) +from text_recognizer.data.base_dataset import BaseDataset, split_dataset from text_recognizer.data.download_utils import download_dataset @@ -33,9 +33,11 @@ ESSENTIALS_FILENAME = Path(__file__).parents[0].resolve() / "emnist_essentials.j class EMNIST(BaseDataModule): - """ - "The EMNIST dataset is a set of handwritten character digits derived from the NIST Special Database 19 - and converted to a 28x28 pixel image format and dataset structure that directly matches the MNIST dataset." + """Lightning DataModule class for loading EMNIST dataset. + + 'The EMNIST dataset is a set of handwritten character digits derived from the NIST + Special Database 19 and converted to a 28x28 pixel image format and dataset structure + that directly matches the MNIST dataset.' From https://www.nist.gov/itl/iad/image-group/emnist-dataset The data split we will use is @@ -56,10 +58,12 @@ class EMNIST(BaseDataModule): self.output_dims = (1,) def prepare_data(self) -> None: + """Downloads dataset if not present.""" if not PROCESSED_DATA_FILENAME.exists(): download_and_process_emnist() def setup(self, stage: str = None) -> None: + """Loads the dataset specified by the stage.""" if stage == "fit" or stage is None: with h5py.File(PROCESSED_DATA_FILENAME, "r") as f: self.x_train = f["x_train"][:] @@ -81,22 +85,32 @@ class EMNIST(BaseDataModule): ) def __repr__(self) -> str: - basic = f"EMNIST Dataset\nNum classes: {len(self.mapping)}\nMapping: {self.mapping}\nDims: {self.dims}\n" + """Returns string with info about the dataset.""" + basic = ( + "EMNIST Dataset\n" + f"Num classes: {len(self.mapping)}\n" + f"Mapping: {self.mapping}\n" + f"Dims: {self.dims}\n" + ) if not any([self.data_train, self.data_val, self.data_test]): return basic datum, target = next(iter(self.train_dataloader())) data = ( - f"Train/val/test sizes: {len(self.data_train)}, {len(self.data_val)}, {len(self.data_test)}\n" - f"Batch x stats: {(datum.shape, datum.dtype, datum.min(), datum.mean(), datum.std(), datum.max())}\n" - f"Batch y stats: {(target.shape, target.dtype, target.min(), target.max())}\n" + "Train/val/test sizes: " + f"{len(self.data_train)}, {len(self.data_val)}, {len(self.data_test)}\n" + "Batch x stats: " + f"{(datum.shape, datum.dtype, datum.min())}" + f"{(datum.mean(), datum.std(), datum.max())}\n" + f"Batch y stats: " + f"{(target.shape, target.dtype, target.min(), target.max())}\n" ) return basic + data def emnist_mapping( - extra_symbols: Optional[Sequence[str]], + extra_symbols: Optional[Sequence[str]] = None, ) -> Tuple[List, Dict[str, int], List[int]]: """Return the EMNIST mapping.""" if not ESSENTIALS_FILENAME.exists(): @@ -112,12 +126,14 @@ def emnist_mapping( def download_and_process_emnist() -> None: + """Downloads and preprocesses EMNIST dataset.""" metadata = toml.load(METADATA_FILENAME) download_dataset(metadata, DL_DATA_DIRNAME) _process_raw_dataset(metadata["filename"], DL_DATA_DIRNAME) def _process_raw_dataset(filename: str, dirname: Path) -> None: + """Processes the raw EMNIST dataset.""" logger.info("Unzipping EMNIST...") curdir = os.getcwd() os.chdir(dirname) |