import random
from typing import Any, Dict, Optional

import torchvision.transforms as T
from PIL import Image

import text_recognizer.metadata.iam_lines as metadata
from text_recognizer.data.transforms.image import ImageStem


class LineStem(ImageStem):
    """A stem for handling images containing a line of text.

    When ``augment`` is true, a PIL-level pipeline of ``T.ColorJitter``
    followed by ``T.RandomAffine`` is installed on the instance. When it is
    false, nothing is assigned and whatever ``ImageStem.__init__`` set up is
    left untouched.

    Args:
        augment: Whether to install the random augmentation pipeline.
        color_jitter_kwargs: Keyword arguments forwarded to
            ``T.ColorJitter``. Defaults to ``{"brightness": (0.5, 1)}``.
        random_affine_kwargs: Keyword arguments forwarded to
            ``T.RandomAffine``. Defaults to a small rotation/translation,
            a (0.4, 1.1) scale range and a (-40, 50) shear range with
            bilinear interpolation and zero fill.
    """

    def __init__(
        self,
        augment: bool = False,
        color_jitter_kwargs: Optional[Dict[str, Any]] = None,
        random_affine_kwargs: Optional[Dict[str, Any]] = None,
    ) -> None:
        super().__init__()
        # Defaults are built per call so no dict is shared between instances.
        if color_jitter_kwargs is None:
            color_jitter_kwargs = {"brightness": (0.5, 1)}
        if random_affine_kwargs is None:
            random_affine_kwargs = {
                "degrees": 3,
                "translate": (0, 0.05),
                "scale": (0.4, 1.1),
                "shear": (-40, 50),
                "interpolation": T.InterpolationMode.BILINEAR,
                "fill": 0,
            }

        if augment:
            augmentations = T.Compose(
                [
                    T.ColorJitter(**color_jitter_kwargs),
                    T.RandomAffine(**random_affine_kwargs),
                ]
            )
            # NOTE(review): this class originally assigned only
            # ``pil_transforms`` while IamLinesStem (same base class) assigns
            # ``pil_transform``. ImageStem is not visible from this file, so
            # the pipeline is exposed under both names: the singular one for
            # consistency with IamLinesStem, the plural one for backward
            # compatibility. Confirm which attribute ImageStem actually
            # applies and drop the other.
            self.pil_transform = augmentations
            self.pil_transforms = augmentations


class IamLinesStem(ImageStem):
    """A stem for handling images containing lines of text from the IAMLines dataset.

    The PIL-level pipeline always starts by embedding the incoming crop into
    a fixed-size canvas (see ``embed_crop`` below). When ``augment`` is true,
    ``T.ColorJitter`` and ``T.RandomAffine`` are appended after that step.

    Args:
        augment: Whether to add random stretching, color jitter and a random
            affine transform.
        color_jitter_kwargs: Keyword arguments forwarded to
            ``T.ColorJitter``. Defaults to ``{"brightness": (0.8, 1.6)}``.
        random_affine_kwargs: Keyword arguments forwarded to
            ``T.RandomAffine``. Defaults to 1 degree of rotation and a
            (-30, 20) shear range with bilinear interpolation and zero fill.
    """

    def __init__(
        self,
        augment: bool = False,
        color_jitter_kwargs: Optional[Dict[str, Any]] = None,
        random_affine_kwargs: Optional[Dict[str, Any]] = None,
    ) -> None:
        super().__init__()

        def embed_crop(crop: Image.Image, augment: bool = augment) -> Image.Image:
            """Scale ``crop`` to the target height and paste it on a blank canvas.

            The crop is resized to ``metadata.IMAGE_HEIGHT`` keeping its
            aspect ratio, optionally stretched horizontally by a random
            factor in [0.9, 1.1], capped at ``metadata.IMAGE_WIDTH``, and
            pasted onto a new mode-"L" image of size
            (``metadata.IMAGE_WIDTH``, ``metadata.IMAGE_HEIGHT``).
            """
            # crop is PIL.image of dtype="L" (so values range from 0 -> 255)
            image = Image.new("L", (metadata.IMAGE_WIDTH, metadata.IMAGE_HEIGHT))

            # Resize crop to the full canvas height, preserving aspect ratio.
            crop_width, crop_height = crop.size
            new_crop_height = metadata.IMAGE_HEIGHT
            new_crop_width = int(new_crop_height * (crop_width / crop_height))
            if augment:
                # Add random stretching
                new_crop_width = int(new_crop_width * random.uniform(0.9, 1.1))
            # Never let the resized crop exceed the canvas width.
            new_crop_width = min(new_crop_width, metadata.IMAGE_WIDTH)
            crop_resized = crop.resize(
                (new_crop_width, new_crop_height), resample=Image.BILINEAR
            )

            # Embed in the image: offset by CHAR_WIDTH from the left unless
            # the crop is too wide for that margin to fit.
            x = min(metadata.CHAR_WIDTH, metadata.IMAGE_WIDTH - new_crop_width)
            y = metadata.IMAGE_HEIGHT - new_crop_height
            image.paste(crop_resized, (x, y))

            return image

        # Defaults are built per call so no dict is shared between instances.
        if color_jitter_kwargs is None:
            color_jitter_kwargs = {"brightness": (0.8, 1.6)}
        if random_affine_kwargs is None:
            random_affine_kwargs = {
                "degrees": 1,
                "shear": (-30, 20),
                "interpolation": T.InterpolationMode.BILINEAR,
                "fill": 0,
            }

        # The embedding step always runs; augmentations are appended after it.
        pil_transform_list = [T.Lambda(embed_crop)]
        if augment:
            pil_transform_list += [
                T.ColorJitter(**color_jitter_kwargs),
                T.RandomAffine(**random_affine_kwargs),
            ]
        self.pil_transform = T.Compose(pil_transform_list)