"""Emnist Lines dataset: synthetic handwritten lines dataset made from Emnist characters."""

from collections import defaultdict
from pathlib import Path
from typing import Callable, Dict, List, Optional, Tuple, Union

import click
import h5py
from loguru import logger
import numpy as np
import torch
from torch import Tensor
import torch.nn.functional as F
from torchvision.transforms import ToTensor

from text_recognizer.datasets.dataset import Dataset
from text_recognizer.datasets.emnist_dataset import EmnistDataset, Transpose
from text_recognizer.datasets.sentence_generator import SentenceGenerator
from text_recognizer.datasets.util import (
    DATA_DIRNAME,
    EmnistMapper,
    ESSENTIALS_FILENAME,
)

# Directory that holds the generated Emnist Lines files. This rebinds the
# DATA_DIRNAME imported from text_recognizer.datasets.util to a subdirectory of it.
DATA_DIRNAME = DATA_DIRNAME / "processed" / "emnist_lines"

# Maximum width of a line image; construct_image_from_string resizes wider images
# down to this width. NOTE(review): 952 == 28 * 34, presumably the 28 pixel
# character width times the default max_length of 34 - confirm.
MAX_WIDTH = 952


class EmnistLinesDataset(Dataset):
    """Synthetic dataset of lines from the Brown corpus with Emnist characters."""

    def __init__(
        self,
        train: bool = False,
        transform: Optional[Callable] = None,
        target_transform: Optional[Callable] = None,
        subsample_fraction: Optional[float] = None,
        max_length: int = 34,
        min_overlap: float = 0,
        max_overlap: float = 0.33,
        num_samples: int = 10000,
        seed: int = 4711,
        init_token: Optional[str] = None,
        pad_token: Optional[str] = None,
        eos_token: Optional[str] = None,
        lower: bool = False,
    ) -> None:
        """Sets the attributes of the dataset.

        No data is read or generated here; call `load_or_generate_data` for that.

        Args:
            train (bool): Flag for the filename (train or test split). Defaults to False.
            transform (Optional[Callable]): The transform of the data. Defaults to None.
            target_transform (Optional[Callable]): The transform of the target. Defaults to None.
            subsample_fraction (Optional[float]): The fraction of the dataset to use for training. Defaults to
                None.
            max_length (int): The maximum number of characters. Defaults to 34.
            min_overlap (float): The minimum overlap between concatenated images. Defaults to 0.
            max_overlap (float): The maximum overlap between concatenated images. Defaults to 0.33.
            num_samples (int): Number of samples to generate. Defaults to 10000.
            seed (int): Seed number. Defaults to 4711.
            init_token (Optional[str]): String representing the start of sequence token. Defaults to None.
            pad_token (Optional[str]): String representing the pad token. "_" is used when None. Defaults to
                None.
            eos_token (Optional[str]): String representing the end of sequence token. Defaults to None.
            lower (bool): If True, convert uppercase letters to lowercase. Otherwise, use both upper and lowercase.

        """
        # A pad token is always used; fall back to "_" when the caller gives none.
        self.pad_token = "_" if pad_token is None else pad_token

        super().__init__(
            train=train,
            transform=transform,
            target_transform=target_transform,
            subsample_fraction=subsample_fraction,
            init_token=init_token,
            pad_token=self.pad_token,
            eos_token=eos_token,
            lower=lower,
        )

        # Extract dataset information.
        self._input_shape = self._mapper.input_shape
        self.num_classes = self._mapper.num_classes

        self.max_length = max_length
        self.min_overlap = min_overlap
        self.max_overlap = max_overlap
        self.num_samples = num_samples
        # Widen the single character shape to a full line of `max_length` characters.
        # NOTE(review): `self.input_shape` is a base class member that presumably
        # returns `self._input_shape` as set just above - confirm.
        self._input_shape = (
            self.input_shape[0],
            self.input_shape[1] * self.max_length,
        )
        self._output_shape = (self.max_length, self.num_classes)
        self.seed = seed

        # Placeholders for the dataset; these are the attributes `_load_data` fills.
        self._data = None
        self._targets = None

    def __getitem__(self, index: Union[int, Tensor]) -> Tuple[Tensor, Tensor]:
        """Fetches the data, target pair of the dataset for a given index or indices.

        Args:
            index (Union[int, Tensor]): Either an int index or a tensor of indices.

        Returns:
            Tuple[Tensor, Tensor]: Data target pair.

        """
        # Tensor indices are converted to plain Python ints/lists before indexing.
        if torch.is_tensor(index):
            index = index.tolist()

        data = self.data[index]
        targets = self.targets[index]

        if self.transform:
            data = self.transform(data)

        if self.target_transform:
            targets = self.target_transform(targets)

        return data, targets

    def __repr__(self) -> str:
        """Returns information about the dataset."""
        return (
            "EMNIST Lines Dataset\n"  # pylint: disable=no-member
            f"Max length: {self.max_length}\n"
            f"Min overlap: {self.min_overlap}\n"
            f"Max overlap: {self.max_overlap}\n"
            f"Num classes: {self.num_classes}\n"
            f"Input shape: {self.input_shape}\n"
            f"Data: {self.data.shape}\n"
            f"Targets: {self.targets.shape}\n"
        )

    @property
    def data_filename(self) -> Path:
        """Path to the file of the split; it is read and written as HDF5 via h5py."""
        filename = "train.pt" if self.train else "test.pt"
        return DATA_DIRNAME / filename

    def load_or_generate_data(self) -> None:
        """Loads the dataset, if it does not exist a new dataset is generated before loading it."""
        # Seeds NumPy's global random state so that generation is reproducible.
        np.random.seed(self.seed)

        if not self.data_filename.exists():
            self._generate_data()
        self._load_data()
        self._subsample()

    def _load_data(self) -> None:
        """Loads the data and targets arrays from the HDF5 file."""
        logger.debug("EmnistLinesDataset loading data from HDF5...")
        with h5py.File(self.data_filename, "r") as f:
            # `[()]` reads each whole HDF5 dataset into memory.
            self._data = f["data"][()]
            self._targets = f["targets"][()]

    def _generate_data(self) -> None:
        """Generates a dataset with the Brown corpus and Emnist characters and stores it on disk."""
        logger.debug("Generating data...")

        sentence_generator = SentenceGenerator(self.max_length)

        # Load emnist dataset.
        emnist = EmnistDataset(
            train=self.train, sample_to_balance=True, pad_token=self.pad_token
        )
        emnist.load_or_generate_data()

        samples_by_character = get_samples_by_character(
            emnist.data.numpy(), emnist.targets.numpy(), self.mapper.mapping,
        )

        # Build everything in memory before touching the file system: if the
        # generation fails, no empty file is left behind that would make
        # `load_or_generate_data` skip the generation on the next run.
        data, targets = create_dataset_of_images(
            self.num_samples,
            samples_by_character,
            sentence_generator,
            self.min_overlap,
            self.max_overlap,
        )

        targets = convert_strings_to_categorical_labels(
            targets, emnist.inverse_mapping
        )

        DATA_DIRNAME.mkdir(parents=True, exist_ok=True)
        with h5py.File(self.data_filename, "a") as f:
            f.create_dataset("data", data=data, dtype="u1", compression="lzf")
            f.create_dataset("targets", data=targets, dtype="u1", compression="lzf")


def get_samples_by_character(
    samples: np.ndarray, labels: np.ndarray, mapping: Dict
) -> defaultdict:
    """Groups the character images by the character they depict.

    Args:
        samples (np.ndarray): Dataset of images of characters.
        labels (np.ndarray): The labels for each image; flattened before use.
        mapping (Dict): The Emnist mapping dictionary, from label to character.

    Returns:
        defaultdict: A dictionary with characters as keys and list of images as values.

    """
    grouped = defaultdict(list)
    # Lazily translate each label to its character, in step with the images.
    characters = (mapping[label] for label in labels.flatten())
    for image, character in zip(samples, characters):
        grouped[character].append(image)
    return grouped


def select_letter_samples_for_string(
    string: str, samples_by_character: Dict
) -> List[np.ndarray]:
    """Picks one random Emnist image per distinct character of the string.

    Every occurrence of a character within the string reuses the same image. A
    character with no available samples is rendered as an all zero image.

    Args:
        string (str): The word or sentence.
        samples_by_character (Dict): The dictionary of emnist images of each character.

    Returns:
        List[np.ndarray]: A list of emnist images, one per character of the string.

    """
    blank = np.zeros((28, 28), np.uint8)
    chosen = {}
    for character in string:
        if character not in chosen:
            candidates = samples_by_character[character]
            if candidates:
                picked = candidates[np.random.choice(len(candidates))]
            else:
                picked = blank
            # Each image is brought to 28 x 28 and its two axes are swapped.
            chosen[character] = picked.reshape(28, 28).swapaxes(0, 1)
    return [chosen[character] for character in string]


def construct_image_from_string(
    string: str, samples_by_character: Dict, min_overlap: float, max_overlap: float
) -> np.ndarray:
    """Concatenates images of the characters in the string.

    The concatenation is made with a randomly selected overlap, drawn once per string, so that some portion
    of neighbouring characters will overlap. Pixels where characters overlap are summed and saturate at 255.

    Args:
        string (str): The word or sentence.
        samples_by_character (Dict): The dictionary of emnist images of each character.
        min_overlap (float): Minimum amount of overlap between Emnist images.
        max_overlap (float): Maximum amount of overlap between Emnist images.

    Returns:
        np.ndarray: The uint8 Emnist image of the string, at most MAX_WIDTH pixels wide.

    Raises:
        IndexError: If the string is empty, since there is no character image to take the shape from.

    """
    overlap = np.random.uniform(min_overlap, max_overlap)
    sampled_images = select_letter_samples_for_string(string, samples_by_character)
    length = len(sampled_images)
    # All character images are taken to share the shape of the first one.
    height, width = sampled_images[0].shape
    # Horizontal step between the left edges of two consecutive characters.
    next_overlap_width = width - int(overlap * width)
    # Accumulate in a wide integer type: adding overlapping strokes in uint8 would
    # wrap around past 255 and turn bright pixels dark instead of saturating.
    accumulated = np.zeros((height, width * length), np.int32)
    x = 0
    for image in sampled_images:
        accumulated[:, x : (x + width)] += image
        x += next_overlap_width

    # Saturate at the maximum pixel value before going back to uint8.
    concatenated_image = np.minimum(255, accumulated).astype(np.uint8)

    if concatenated_image.shape[-1] > MAX_WIDTH:
        # The image is handed to interpolate as a 3D tensor, so only its last
        # (width) dimension is resized.
        resized = F.interpolate(
            Tensor(concatenated_image).unsqueeze(0), size=MAX_WIDTH, mode="nearest"
        )
        # Nearest neighbour resizing only copies pixels, so the cast is lossless.
        concatenated_image = resized.squeeze(0).numpy().astype(np.uint8)

    return concatenated_image


def create_dataset_of_images(
    length: int,
    samples_by_character: Dict,
    sentence_generator: SentenceGenerator,
    min_overlap: float,
    max_overlap: float,
) -> Tuple[np.ndarray, List[str]]:
    """Creates a dataset with images and labels from strings generated from the SentenceGenerator.

    Args:
        length (int): The number of samples (image and label pairs) to create.
        samples_by_character (Dict): The dictionary of emnist images of each character.
        sentence_generator (SentenceGenerator): A SentenceGenerator object.
        min_overlap (float): Minimum amount of overlap between Emnist images.
        max_overlap (float): Maximum amount of overlap between Emnist images.

    Returns:
        Tuple[np.ndarray, List[str]]: A uint8 array of Emnist images with shape (length, height, width) and a
            list of the strings (labels).

    Raises:
        RuntimeError: If the sentence generator is not able to generate a string. The last error raised by
            the generator, if any, is attached as the cause.

    """
    # A probe sentence rendered without overlap determines the image shape that
    # is used to preallocate the whole array.
    sample_label = sentence_generator.generate()
    sample_image = construct_image_from_string(sample_label, samples_by_character, 0, 0)
    images = np.zeros((length, sample_image.shape[0], sample_image.shape[1]), np.uint8)
    labels = []
    for n in range(length):
        label = None
        last_error = None
        # Try several times to generate before actually throwing an error.
        for _ in range(10):
            try:
                label = sentence_generator.generate()
                break
            except Exception as error:  # pylint: disable=broad-except
                # Remember the failure so that it is not lost if all attempts fail.
                last_error = error
        if label is None:
            raise RuntimeError(
                "Was not able to generate a valid string."
            ) from last_error
        images[n] = construct_image_from_string(
            label, samples_by_character, min_overlap, max_overlap
        )
        labels.append(label)
    return images, labels


def convert_strings_to_categorical_labels(
    labels: List[str], mapping: Dict
) -> np.ndarray:
    """Encodes every string as the sequence of class indices of its characters.

    Args:
        labels (List[str]): The strings to encode.
        mapping (Dict): Dictionary from character to class index.

    Returns:
        np.ndarray: An array with one row of class indices per string.

    """
    encoded = []
    for label in labels:
        encoded.append([mapping[character] for character in label])
    return np.array(encoded)


@click.command()
@click.option(
    "--max_length", type=int, default=34, help="Number of characters in a sentence."
)
@click.option(
    "--min_overlap", type=float, default=0.0, help="Min overlap between characters."
)
@click.option(
    "--max_overlap", type=float, default=0.33, help="Max overlap between characters."
)
@click.option("--num_train", type=int, default=10_000, help="Number of train examples.")
@click.option("--num_test", type=int, default=1_000, help="Number of test examples.")
def create_datasets(
    max_length: int = 34,
    min_overlap: float = 0,
    max_overlap: float = 0.33,
    num_train: int = 10000,
    num_test: int = 1000,
) -> None:
    """Creates a training and a test dataset of Emnist lines."""
    # The docstring above is what click prints as the help text of the command,
    # so it is kept to a single user-facing sentence.
    #
    # One dataset per split: the train split first, then the test split.
    for num, train in ((num_train, True), (num_test, False)):
        emnist_lines = EmnistLinesDataset(
            train=train,
            max_length=max_length,
            min_overlap=min_overlap,
            max_overlap=max_overlap,
            num_samples=num,
        )
        # Generates the file of the split if it does not exist yet, then loads it.
        emnist_lines.load_or_generate_data()


# When executed as a script, invoke the click command defined above.
if __name__ == "__main__":
    create_datasets()