From 46a1472d33d3a4180798492e819f2ec02bc3b1a3 Mon Sep 17 00:00:00 2001
From: Gustaf Rydholm
Date: Sun, 28 Mar 2021 22:02:24 +0200
Subject: Add refactor of iam lines

---
 text_recognizer/data/iam_lines.py | 255 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 255 insertions(+)
 create mode 100644 text_recognizer/data/iam_lines.py
(limited to 'text_recognizer/data/iam_lines.py')

diff --git a/text_recognizer/data/iam_lines.py b/text_recognizer/data/iam_lines.py
new file mode 100644
index 0000000..391075a
--- /dev/null
+++ b/text_recognizer/data/iam_lines.py
@@ -0,0 +1,255 @@
"""Class for IAM Lines dataset.

If not created, will generate a handwritten lines dataset from the IAM paragraphs
dataset.
"""
import json
from pathlib import Path
import random
from typing import List, Optional, Sequence, Tuple

from loguru import logger
from PIL import Image, ImageFile, ImageOps
import numpy as np
from torchvision import transforms
from torchvision.transforms.functional import InterpolationMode

from text_recognizer.data.base_dataset import (
    BaseDataset,
    convert_strings_to_labels,
    split_dataset,
)
from text_recognizer.data.base_data_module import BaseDataModule, load_and_print_info
from text_recognizer.data.emnist import emnist_mapping
from text_recognizer.data.iam import IAM
from text_recognizer.data import image_utils


ImageFile.LOAD_TRUNCATED_IMAGES = True

SEED = 4711
PROCESSED_DATA_DIRNAME = BaseDataModule.data_dirname() / "processed" / "iam_lines"
IMAGE_HEIGHT = 56
IMAGE_WIDTH = 1024


class IAMLines(BaseDataModule):
    """IAM handwritten lines dataset."""

    def __init__(
        self,
        augment: bool = True,
        fraction: float = 0.8,
        batch_size: int = 128,
        num_workers: int = 0,
    ) -> None:
        # TODO: add transforms
        super().__init__(batch_size, num_workers)
        self.augment = augment
        self.fraction = fraction
        self.mapping, self.inverse_mapping, _ = emnist_mapping()
        self.dims = (1, IMAGE_HEIGHT, IMAGE_WIDTH)
        self.output_dims = (89, 1)
        self.data_train: Optional[BaseDataset] = None
        self.data_val: Optional[BaseDataset] = None
        self.data_test: Optional[BaseDataset] = None

    def prepare_data(self) -> None:
        """Creates the IAM lines dataset if it does not exist."""
        if PROCESSED_DATA_DIRNAME.exists():
            return

        logger.info("Cropping IAM line regions...")
        iam = IAM()
        iam.prepare_data()
        crops_train, labels_train = line_crops_and_labels(iam, "train")
        crops_test, labels_test = line_crops_and_labels(iam, "test")

        shapes = np.array([crop.size for crop in crops_train + crops_test])
        aspect_ratios = shapes[:, 0] / shapes[:, 1]

        logger.info("Saving images, labels, and statistics...")
        save_images_and_labels(
            crops_train, labels_train, "train", PROCESSED_DATA_DIRNAME
        )
        save_images_and_labels(crops_test, labels_test, "test", PROCESSED_DATA_DIRNAME)

        with (PROCESSED_DATA_DIRNAME / "_max_aspect_ratio.txt").open(mode="w") as f:
            f.write(str(aspect_ratios.max()))

    def setup(self, stage: Optional[str] = None) -> None:
        """Load data for training/testing."""
        with (PROCESSED_DATA_DIRNAME / "_max_aspect_ratio.txt").open(mode="r") as f:
            max_aspect_ratio = float(f.read())
        image_width = int(IMAGE_HEIGHT * max_aspect_ratio)
        if image_width >= IMAGE_WIDTH:
            raise ValueError("image_width must be less than IMAGE_WIDTH")

        if stage == "fit" or stage is None:
            x_train, labels_train = load_line_crops_and_labels(
                "train", PROCESSED_DATA_DIRNAME
            )
            if self.output_dims[0] < max([len(l) for l in labels_train]) + 2:
                raise ValueError("Target length longer than max output length.")
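            # The +2 above presumably leaves room for the start and end tokens
            # that convert_strings_to_labels adds around each target string.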
            y_train = convert_strings_to_labels(
                labels_train, self.inverse_mapping, length=self.output_dims[0]
            )
            data_train = BaseDataset(
                x_train, y_train, transform=get_transform(IMAGE_WIDTH, self.augment)
            )

            self.data_train, self.data_val = split_dataset(
                dataset=data_train, fraction=self.fraction, seed=SEED
            )

        if stage == "test" or stage is None:
            x_test, labels_test = load_line_crops_and_labels(
                "test", PROCESSED_DATA_DIRNAME
            )

            if self.output_dims[0] < max([len(l) for l in labels_test]) + 2:
                raise ValueError("Target length longer than max output length.")

            y_test = convert_strings_to_labels(
                labels_test, self.inverse_mapping, length=self.output_dims[0]
            )
            self.data_test = BaseDataset(
                x_test, y_test, transform=get_transform(IMAGE_WIDTH)
            )

        if stage is None:
            self._verify_output_dims(labels_train, labels_test)

    def _verify_output_dims(
        self, labels_train: List[str], labels_test: List[str]
    ) -> None:
        max_label_length = max([len(label) for label in labels_train + labels_test]) + 2
        output_dims = (max_label_length, 1)
        if output_dims != self.output_dims:
            raise ValueError("Output dims do not match expected output dims.")

    def __repr__(self) -> str:
        """Return information about the dataset."""
        basic = (
            "IAM Lines dataset\n"
            f"Num classes: {len(self.mapping)}\n"
            f"Input dims: {self.dims}\n"
            f"Output dims: {self.output_dims}\n"
        )

        if not any([self.data_train, self.data_val, self.data_test]):
            return basic

        x, y = next(iter(self.train_dataloader()))
        xt, yt = next(iter(self.test_dataloader()))
        data = (
            "Train/val/test sizes: "
            f"{len(self.data_train)}, "
            f"{len(self.data_val)}, "
            f"{len(self.data_test)}\n"
            f"Train Batch x stats: {(x.shape, x.dtype, x.min(), x.mean(), x.std(), x.max())}\n"
            f"Train Batch y stats: {(y.shape, y.dtype, y.min(), y.max())}\n"
            f"Test Batch x stats: {(xt.shape, xt.dtype, xt.min(), xt.mean(), xt.std(), xt.max())}\n"
            f"Test Batch y stats: {(yt.shape, yt.dtype, yt.min(), yt.max())}\n"
        )
        return basic + data


def line_crops_and_labels(iam: IAM, split: str) -> Tuple[List, List]:
    """Load IAM line labels and regions, and load image crops."""
    crops = []
    labels = []
    for filename in iam.form_filenames:
        if iam.split_by_id[filename.stem] != split:
            continue
        image = image_utils.read_image_pil(filename)
        image = ImageOps.grayscale(image)
        image = ImageOps.invert(image)
        labels += iam.line_strings_by_id[filename.stem]
        crops += [
            image.crop([region[box] for box in ["x1", "y1", "x2", "y2"]])
            for region in iam.line_regions_by_id[filename.stem]
        ]
    if len(crops) != len(labels):
        raise ValueError("Length of crops does not match length of labels")
    return crops, labels


def save_images_and_labels(
    crops: Sequence[Image.Image], labels: Sequence[str], split: str, data_dirname: Path
) -> None:
    """Save crops as PNG files and labels as a JSON file for the given split."""
    (data_dirname / split).mkdir(parents=True, exist_ok=True)

    with (data_dirname / split / "_labels.json").open(mode="w") as f:
        json.dump(labels, f)

    for index, crop in enumerate(crops):
        crop.save(data_dirname / split / f"{index}.png")


def load_line_crops_and_labels(split: str, data_dirname: Path) -> Tuple[List, List]:
    """Load line crops and labels for given split from the processed directory."""
    with (data_dirname / split / "_labels.json").open(mode="r") as f:
        labels = json.load(f)

    crop_filenames = sorted(
        (data_dirname / split).glob("*.png"),
        key=lambda filename: int(Path(filename).stem),
    )
    crops = [
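        # Crop files are saved as 0.png, 1.png, ... and sorted numerically above,
        # so the loaded crops stay aligned with the label list.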
        image_utils.read_image_pil(filename, grayscale=True)
        for filename in crop_filenames
    ]

    if len(crops) != len(labels):
        raise ValueError("Length of crops does not match length of labels")

    return crops, labels


def get_transform(image_width: int, augment: bool = False) -> transforms.Compose:
    """Augment with brightness jitter, slight rotation, shear (slant), and random stretching."""

    def embed_crop(
        crop: Image.Image, augment: bool = augment, image_width: int = image_width
    ) -> Image.Image:
        # Crop is a PIL.Image in mode "L" (so the value range is [0, 255]).
        image = Image.new("L", (image_width, IMAGE_HEIGHT))

        # Resize crop to IMAGE_HEIGHT, preserving its aspect ratio.
        crop_width, crop_height = crop.size
        new_crop_height = IMAGE_HEIGHT
        new_crop_width = int(new_crop_height * (crop_width / crop_height))

        if augment:
            # Add random horizontal stretching.
            new_crop_width = int(new_crop_width * random.uniform(0.9, 1.1))
            new_crop_width = min(new_crop_width, image_width)
        crop_resized = crop.resize(
            (new_crop_width, new_crop_height), resample=Image.BILINEAR
        )

        # Embed the crop in the fixed-size image.
        x = min(28, image_width - new_crop_width)
        y = IMAGE_HEIGHT - new_crop_height
        image.paste(crop_resized, (x, y))

        return image

    transforms_list = [transforms.Lambda(embed_crop)]

    if augment:
        transforms_list += [
            transforms.ColorJitter(brightness=(0.8, 1.6)),
            transforms.RandomAffine(
                degrees=1,
                shear=(-30, 20),
                interpolation=InterpolationMode.BILINEAR,
                fill=0,
            ),
        ]
    transforms_list.append(transforms.ToTensor())
    return transforms.Compose(transforms_list)


def generate_iam_lines() -> None:
    """Create the IAM lines dataset and print dataset statistics."""
    load_and_print_info(IAMLines)
-- cgit v1.2.3-70-g09d2
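
For context, not part of the patch: a minimal usage sketch. It assumes
load_and_print_info instantiates the data module, runs prepare_data() and
setup(), and prints it, as the module's generate_iam_lines entry point
suggests; train_dataloader() is inherited from BaseDataModule.

    from text_recognizer.data.iam_lines import IAMLines, generate_iam_lines

    # Build (on first run) and summarize the processed IAM lines dataset.
    generate_iam_lines()

    # Or drive the data module directly and pull one training batch.
    datamodule = IAMLines(augment=True, batch_size=64)
    datamodule.prepare_data()
    datamodule.setup()
    # Expected shapes given dims/output_dims: x (64, 1, 56, 1024), y (64, 89).
    x, y = next(iter(datamodule.train_dataloader()))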