From 0ab820d3595e4f17d4f1f3c310e186692f65cc67 Mon Sep 17 00:00:00 2001
From: Gustaf Rydholm <gustaf.rydholm@gmail.com>
Date: Sun, 11 Apr 2021 21:48:34 +0200
Subject: Working on mapping

---
 notebooks/03-look-at-iam-paragraphs.ipynb | 90 +++++++++++++++++++++++++++++--
 1 file changed, 85 insertions(+), 5 deletions(-)

(limited to 'notebooks')

diff --git a/notebooks/03-look-at-iam-paragraphs.ipynb b/notebooks/03-look-at-iam-paragraphs.ipynb
index 4b82034..cfa0ba5 100644
--- a/notebooks/03-look-at-iam-paragraphs.ipynb
+++ b/notebooks/03-look-at-iam-paragraphs.ipynb
@@ -25,12 +25,13 @@
     "    sys.path.append('..')\n",
     "\n",
     "from text_recognizer.data.iam_paragraphs import IAMParagraphs\n",
-    "from text_recognizer.data.iam_synthetic_paragraphs import IAMSyntheticParagraphs"
+    "from text_recognizer.data.iam_synthetic_paragraphs import IAMSyntheticParagraphs\n",
+    "from text_recognizer.data.iam_extended_paragraphs import IAMExtendedParagraphs"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 162,
+   "execution_count": 2,
    "id": "726ac25b",
    "metadata": {},
    "outputs": [],
@@ -45,6 +46,65 @@
     "    return ''.join([mapping[i] for i in y if i != padding_index])"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "c6188bce",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2021-04-11 21:49:35.313 | INFO     | text_recognizer.data.iam_paragraphs:setup:106 - Loading IAM paragraph regions and lines for None...\n",
+      "2021-04-11 21:49:51.802 | INFO     | text_recognizer.data.iam_synthetic_paragraphs:setup:77 - IAM Synthetic dataset steup for stage None\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "IAM Original and Synthetic Paragraphs Dataset\n",
+      "Num classes: 84\n",
+      "Dims: (1, 576, 640)\n",
+      "Output dims: (682, 1)\n",
+      "Train/val/test sizes: 19942, 262, 231\n",
+      "Train Batch x stats: (torch.Size([128, 1, 576, 640]), torch.float32, tensor(0.), tensor(0.0099), tensor(0.0553), tensor(1.))\n",
+      "Train Batch y stats: (torch.Size([128, 682]), torch.int64, tensor(1), tensor(83))\n",
+      "Test Batch x stats: (torch.Size([128, 1, 576, 640]), torch.float32, tensor(0.), tensor(0.0284), tensor(0.0846), tensor(0.9373))\n",
+      "Test Batch y stats: (torch.Size([128, 682]), torch.int64, tensor(1), tensor(83))\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "dataset = IAMExtendedParagraphs()\n",
+    "dataset.prepare_data()\n",
+    "dataset.setup()\n",
+    "print(dataset)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "1b3c7bdd",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "1246.375"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "19942 / 16"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 4,
@@ -152,7 +212,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 165,
+   "execution_count": 5,
    "id": "45649194",
    "metadata": {},
    "outputs": [],
@@ -163,7 +223,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 166,
+   "execution_count": 6,
    "id": "0fc13f9f",
    "metadata": {},
    "outputs": [],
@@ -179,6 +239,27 @@
     ")"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "fb0afccf",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "1004"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "len(processor.tokens)"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 167,
@@ -435,7 +516,6 @@
     }
    ],
    "source": [
-    "\n",
     "# Testing\n",
     "\n",
     "for _ in range(5):\n",
-- 
cgit v1.2.3-70-g09d2