diff options
Diffstat (limited to 'notebooks')
| -rw-r--r-- | notebooks/03-look-at-iam-paragraphs.ipynb | 226 | 
1 files changed, 211 insertions, 15 deletions
| diff --git a/notebooks/03-look-at-iam-paragraphs.ipynb b/notebooks/03-look-at-iam-paragraphs.ipynb index df92f99..4b82034 100644 --- a/notebooks/03-look-at-iam-paragraphs.ipynb +++ b/notebooks/03-look-at-iam-paragraphs.ipynb @@ -2,19 +2,10 @@   "cells": [    {     "cell_type": "code", -   "execution_count": 7, +   "execution_count": 1,     "id": "6ce2519f",     "metadata": {}, -   "outputs": [ -    { -     "name": "stdout", -     "output_type": "stream", -     "text": [ -      "The autoreload extension is already loaded. To reload it, use:\n", -      "  %reload_ext autoreload\n" -     ] -    } -   ], +   "outputs": [],     "source": [      "import os\n",      "os.environ['CUDA_VISIBLE_DEVICE'] = ''\n", @@ -39,7 +30,7 @@    },    {     "cell_type": "code", -   "execution_count": 2, +   "execution_count": 162,     "id": "726ac25b",     "metadata": {},     "outputs": [], @@ -56,7 +47,7 @@    },    {     "cell_type": "code", -   "execution_count": 3, +   "execution_count": 4,     "id": "42501428",     "metadata": {},     "outputs": [ @@ -64,7 +55,7 @@       "name": "stderr",       "output_type": "stream",       "text": [ -      "2021-04-03 21:55:37.196 | INFO     | text_recognizer.data.iam_paragraphs:setup:104 - Loading IAM paragraph regions and lines for None...\n" +      "2021-04-08 21:48:18.431 | INFO     | text_recognizer.data.iam_paragraphs:setup:106 - Loading IAM paragraph regions and lines for None...\n"       ]      },      { @@ -76,7 +67,7 @@        "Input dims: (1, 576, 640)\n",        "Output dims: (682, 1)\n",        "Train/val/test sizes: 1046, 262, 231\n", -      "Train Batch x stats: (torch.Size([128, 1, 576, 640]), torch.float32, tensor(0.), tensor(0.0358), tensor(0.1021), tensor(1.))\n", +      "Train Batch x stats: (torch.Size([128, 1, 576, 640]), torch.float32, tensor(0.), tensor(0.0371), tensor(0.1049), tensor(1.))\n",        "Train Batch y stats: (torch.Size([128, 682]), torch.int64, tensor(1), tensor(83))\n",        "Test Batch x stats: (torch.Size([128, 1, 576, 640]), torch.float32, tensor(0.), tensor(0.0284), tensor(0.0846), tensor(0.9373))\n",        "Test Batch y stats: (torch.Size([128, 682]), torch.int64, tensor(1), tensor(83))\n", @@ -93,6 +84,211 @@    },    {     "cell_type": "code", +   "execution_count": 163, +   "id": "0cf22683", +   "metadata": {}, +   "outputs": [], +   "source": [ +    "x, y = dataset.data_train[1]" +   ] +  }, +  { +   "cell_type": "code", +   "execution_count": 164, +   "id": "98dd0ee6", +   "metadata": {}, +   "outputs": [ +    { +     "data": { +      "text/plain": [ +       "tensor([ 1, 33, 47, 44, 66, 51, 40, 59, 59, 44, 57, 66, 43, 54, 66, 53, 54, 59,\n", +       "        66, 57, 44, 46, 40, 57, 43, 66, 59, 47, 44, 52, 58, 44, 51, 61, 44, 58,\n", +       "        66, 40, 58, 66, 44, 63, 55, 44, 57, 59, 83, 40, 43, 61, 48, 58, 44, 57,\n", +       "        58, 76, 66, 41, 60, 59, 66, 40, 57, 44, 66, 55, 57, 44, 55, 40, 57, 44,\n", +       "        43, 66, 59, 54, 66, 58, 44, 44, 50, 66, 54, 60, 59, 66, 59, 47, 44, 83,\n", +       "        40, 55, 55, 57, 54, 55, 57, 48, 40, 59, 44, 66, 58, 54, 60, 57, 42, 44,\n", +       "        58, 66, 54, 45, 66, 48, 53, 45, 54, 57, 52, 40, 59, 48, 54, 53, 66, 54,\n", +       "        57, 66, 40, 43, 61, 48, 42, 44, 78, 83, 33, 54, 62, 40, 57, 43, 58, 66,\n", +       "        59, 47, 44, 66, 44, 53, 43, 66, 54, 45, 66,  5, 13,  9, 10, 76, 66, 26,\n", +       "        57, 78, 66, 17, 40, 53, 48, 44, 51, 66, 20, 57, 40, 53, 59, 76, 83, 40,\n", +       "        53, 66, 18, 52, 55, 51, 54, 64, 44, 44, 66, 31, 44, 51, 40, 59, 48, 54,\n", +       "        53, 58, 66, 28, 45, 45, 48, 42, 44, 57, 66, 54, 45, 83, 31, 54, 51, 51,\n", +       "        58, 77, 31, 54, 64, 42, 44, 66, 25, 59, 43, 78, 66, 40, 53, 43, 66, 40,\n", +       "        66, 52, 44, 52, 41, 44, 57, 66, 54, 45, 66, 59, 47, 44, 83, 36, 54, 57,\n", +       "        50, 44, 57, 58, 71, 66, 18, 43, 60, 42, 40, 59, 48, 54, 53, 40, 51, 66,\n", +       "        14, 58, 58, 54, 42, 48, 40, 59, 48, 54, 53, 76, 66, 58, 60, 41, 52, 48,\n", +       "        59, 59, 44, 43, 83, 59, 54, 66, 59, 47, 44, 66, 25, 54, 57, 43, 66, 29,\n", +       "        57, 54, 61, 54, 58, 59, 66, 54, 45, 66, 20, 51, 40, 58, 46, 54, 62, 76,\n", +       "        66, 17, 57, 78, 66, 14, 53, 43, 57, 44, 62, 83, 21, 54, 54, 43, 76, 66,\n", +       "        40, 66, 42, 54, 55, 64, 66, 54, 45, 66, 47, 48, 58, 66, 57, 44, 55, 54,\n", +       "        57, 59, 66, 54, 53, 66, 40, 53, 66, 44, 53, 56, 60, 48, 57, 64, 83, 47,\n", +       "        44, 66, 47, 40, 43, 66, 52, 40, 43, 44, 66, 48, 53, 59, 54, 66, 59, 47,\n", +       "        44, 66, 55, 57, 54, 41, 51, 44, 52, 58, 66, 59, 47, 40, 59, 66, 41, 44,\n", +       "        58, 44, 59, 83, 54, 51, 43, 44, 57, 66, 62, 54, 57, 50, 44, 57, 58, 66,\n", +       "        40, 53, 43, 66, 59, 47, 44, 66, 44, 45, 45, 44, 42, 59, 58, 66, 54, 45,\n", +       "        66, 57, 44, 59, 48, 57, 44, 52, 44, 53, 59, 83, 21, 60, 46, 44, 53, 59,\n", +       "        54, 41, 51, 44, 57, 66, 31, 54, 46, 44, 57,  2,  3,  3,  3,  3,  3,  3,\n", +       "         3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,\n", +       "         3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,\n", +       "         3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,\n", +       "         3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,\n", +       "         3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,\n", +       "         3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,\n", +       "         3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,\n", +       "         3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,\n", +       "         3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,\n", +       "         3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,\n", +       "         3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3])" +      ] +     }, +     "execution_count": 164, +     "metadata": {}, +     "output_type": "execute_result" +    } +   ], +   "source": [ +    "y" +   ] +  }, +  { +   "cell_type": "code", +   "execution_count": 165, +   "id": "45649194", +   "metadata": {}, +   "outputs": [], +   "source": [ +    "from text_recognizer.data.iam_preprocessor import Preprocessor\n", +    "from pathlib import Path" +   ] +  }, +  { +   "cell_type": "code", +   "execution_count": 166, +   "id": "0fc13f9f", +   "metadata": {}, +   "outputs": [], +   "source": [ +    "processor = Preprocessor(\n", +    "                            data_dir=Path(\"../data/downloaded/iam/iamdb\"),\n", +    "                            num_features=1000,\n", +    "                            lexicon_path=Path(\"../data/processed/iam_lines/iamdb_1kwp_lex_1000.txt\"),\n", +    "                            tokens_path=Path(\"../data/processed/iam_lines/iamdb_1kwp_tokens_1000.txt\"),\n", +    "                            use_words=True,\n", +    "                            prepend_wordsep=False,\n", +    "                            special_tokens=[\"<s>\", \"<e>\", \"<p>\", \"\\n\"]\n", +    ")" +   ] +  }, +  { +   "cell_type": "code", +   "execution_count": 167, +   "id": "d08a0259", +   "metadata": {}, +   "outputs": [], +   "source": [ +    "t = convert_y_label_to_string(y, dataset.mapping)" +   ] +  }, +  { +   "cell_type": "code", +   "execution_count": 168, +   "id": "a16a2cb7", +   "metadata": {}, +   "outputs": [ +    { +     "data": { +      "text/plain": [ +       "\"<s>The latter do not regard themselves as expert\\nadvisers, but are prepared to seek out the\\nappropriate sources of information or advice.\\nTowards the end of 1956, Mr. Daniel Grant,\\nan Employee Relations Officer of\\nRolls-Royce Ltd. and a member of the\\nWorkers' Educational Association, submitted\\nto the Lord Provost of Glasgow, Dr. Andrew\\nHood, a copy of his report on an enquiry\\nhe had made into the problems that beset\\nolder workers and the effects of retirement\\nHugentobler Roger<e>\"" +      ] +     }, +     "execution_count": 168, +     "metadata": {}, +     "output_type": "execute_result" +    } +   ], +   "source": [ +    "t" +   ] +  }, +  { +   "cell_type": "code", +   "execution_count": 170, +   "id": "c7a33b2d", +   "metadata": {}, +   "outputs": [], +   "source": [ +    "ii = processor.to_index(t.replace(\" \", \"▁\").lower())" +   ] +  }, +  { +   "cell_type": "code", +   "execution_count": 171, +   "id": "4e0a22f4", +   "metadata": { +    "scrolled": true +   }, +   "outputs": [ +    { +     "data": { +      "text/plain": [ +       "torch.Size([247])" +      ] +     }, +     "execution_count": 171, +     "metadata": {}, +     "output_type": "execute_result" +    } +   ], +   "source": [ +    "ii.shape" +   ] +  }, +  { +   "cell_type": "code", +   "execution_count": 172, +   "id": "bc1c5ffb", +   "metadata": { +    "scrolled": true +   }, +   "outputs": [ +    { +     "data": { +      "text/plain": [ +       "torch.Size([247])" +      ] +     }, +     "execution_count": 172, +     "metadata": {}, +     "output_type": "execute_result" +    } +   ], +   "source": [ +    "ii.shape" +   ] +  }, +  { +   "cell_type": "code", +   "execution_count": 176, +   "id": "8b7b0373", +   "metadata": {}, +   "outputs": [ +    { +     "data": { +      "text/plain": [ +       "\"<s>▁the▁latter▁do▁not▁regard▁themselves▁as▁expert\\n▁advisers,▁but▁are▁prepared▁to▁seek▁out▁the\\n▁appropriate▁sources▁of▁information▁or▁advice.\\n▁towards▁the▁end▁of▁1956,▁mr.▁daniel▁grant,\\n▁an▁employee▁relations▁officer▁of\\n▁rolls-royce▁ltd.▁and▁a▁member▁of▁the\\n▁workers'▁educational▁association,▁submitted\\n▁to▁the▁lord▁provost▁of▁glasgow,▁dr.▁andrew\\n▁hood,▁a▁copy▁of▁his▁report▁on▁an▁enquiry\\n▁he▁had▁made▁into▁the▁problems▁that▁beset\\n▁older▁workers▁and▁the▁effects▁of▁retirement\\n▁hugentobler▁roger<e>\"" +      ] +     }, +     "execution_count": 176, +     "metadata": {}, +     "output_type": "execute_result" +    } +   ], +   "source": [ +    "processor.to_text(ii)" +   ] +  }, +  { +   "cell_type": "code",     "execution_count": 4,     "id": "e7778ae2",     "metadata": { |