diff options
author | Gustaf Rydholm <gustaf.rydholm@gmail.com> | 2021-04-08 23:38:03 +0200 |
---|---|---|
committer | Gustaf Rydholm <gustaf.rydholm@gmail.com> | 2021-04-08 23:38:03 +0200 |
commit | e388cd95c77d37a51324cff9d84a809421bf97d3 (patch) | |
tree | d585545f85d03ea8a6907daba254821fddeb1589 /notebooks | |
parent | f4629a0d4149d5870c9fd8ce83ff5d391bd7ddd3 (diff) |
Bug fixes word pieces
Diffstat (limited to 'notebooks')
-rw-r--r-- | notebooks/03-look-at-iam-paragraphs.ipynb | 226 |
1 files changed, 211 insertions, 15 deletions
diff --git a/notebooks/03-look-at-iam-paragraphs.ipynb b/notebooks/03-look-at-iam-paragraphs.ipynb index df92f99..4b82034 100644 --- a/notebooks/03-look-at-iam-paragraphs.ipynb +++ b/notebooks/03-look-at-iam-paragraphs.ipynb @@ -2,19 +2,10 @@ "cells": [ { "cell_type": "code", - "execution_count": 7, + "execution_count": 1, "id": "6ce2519f", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The autoreload extension is already loaded. To reload it, use:\n", - " %reload_ext autoreload\n" - ] - } - ], + "outputs": [], "source": [ "import os\n", "os.environ['CUDA_VISIBLE_DEVICE'] = ''\n", @@ -39,7 +30,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 162, "id": "726ac25b", "metadata": {}, "outputs": [], @@ -56,7 +47,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "id": "42501428", "metadata": {}, "outputs": [ @@ -64,7 +55,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "2021-04-03 21:55:37.196 | INFO | text_recognizer.data.iam_paragraphs:setup:104 - Loading IAM paragraph regions and lines for None...\n" + "2021-04-08 21:48:18.431 | INFO | text_recognizer.data.iam_paragraphs:setup:106 - Loading IAM paragraph regions and lines for None...\n" ] }, { @@ -76,7 +67,7 @@ "Input dims: (1, 576, 640)\n", "Output dims: (682, 1)\n", "Train/val/test sizes: 1046, 262, 231\n", - "Train Batch x stats: (torch.Size([128, 1, 576, 640]), torch.float32, tensor(0.), tensor(0.0358), tensor(0.1021), tensor(1.))\n", + "Train Batch x stats: (torch.Size([128, 1, 576, 640]), torch.float32, tensor(0.), tensor(0.0371), tensor(0.1049), tensor(1.))\n", "Train Batch y stats: (torch.Size([128, 682]), torch.int64, tensor(1), tensor(83))\n", "Test Batch x stats: (torch.Size([128, 1, 576, 640]), torch.float32, tensor(0.), tensor(0.0284), tensor(0.0846), tensor(0.9373))\n", "Test Batch y stats: (torch.Size([128, 682]), torch.int64, tensor(1), tensor(83))\n", @@ -93,6 +84,211 @@ }, { "cell_type": "code", + "execution_count": 163, + "id": "0cf22683", + "metadata": {}, + "outputs": [], + "source": [ + "x, y = dataset.data_train[1]" + ] + }, + { + "cell_type": "code", + "execution_count": 164, + "id": "98dd0ee6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "tensor([ 1, 33, 47, 44, 66, 51, 40, 59, 59, 44, 57, 66, 43, 54, 66, 53, 54, 59,\n", + " 66, 57, 44, 46, 40, 57, 43, 66, 59, 47, 44, 52, 58, 44, 51, 61, 44, 58,\n", + " 66, 40, 58, 66, 44, 63, 55, 44, 57, 59, 83, 40, 43, 61, 48, 58, 44, 57,\n", + " 58, 76, 66, 41, 60, 59, 66, 40, 57, 44, 66, 55, 57, 44, 55, 40, 57, 44,\n", + " 43, 66, 59, 54, 66, 58, 44, 44, 50, 66, 54, 60, 59, 66, 59, 47, 44, 83,\n", + " 40, 55, 55, 57, 54, 55, 57, 48, 40, 59, 44, 66, 58, 54, 60, 57, 42, 44,\n", + " 58, 66, 54, 45, 66, 48, 53, 45, 54, 57, 52, 40, 59, 48, 54, 53, 66, 54,\n", + " 57, 66, 40, 43, 61, 48, 42, 44, 78, 83, 33, 54, 62, 40, 57, 43, 58, 66,\n", + " 59, 47, 44, 66, 44, 53, 43, 66, 54, 45, 66, 5, 13, 9, 10, 76, 66, 26,\n", + " 57, 78, 66, 17, 40, 53, 48, 44, 51, 66, 20, 57, 40, 53, 59, 76, 83, 40,\n", + " 53, 66, 18, 52, 55, 51, 54, 64, 44, 44, 66, 31, 44, 51, 40, 59, 48, 54,\n", + " 53, 58, 66, 28, 45, 45, 48, 42, 44, 57, 66, 54, 45, 83, 31, 54, 51, 51,\n", + " 58, 77, 31, 54, 64, 42, 44, 66, 25, 59, 43, 78, 66, 40, 53, 43, 66, 40,\n", + " 66, 52, 44, 52, 41, 44, 57, 66, 54, 45, 66, 59, 47, 44, 83, 36, 54, 57,\n", + " 50, 44, 57, 58, 71, 66, 18, 43, 60, 42, 40, 59, 48, 54, 53, 40, 51, 66,\n", + " 14, 58, 58, 54, 42, 48, 40, 59, 48, 54, 53, 76, 66, 58, 60, 41, 52, 48,\n", + " 59, 59, 44, 43, 83, 59, 54, 66, 59, 47, 44, 66, 25, 54, 57, 43, 66, 29,\n", + " 57, 54, 61, 54, 58, 59, 66, 54, 45, 66, 20, 51, 40, 58, 46, 54, 62, 76,\n", + " 66, 17, 57, 78, 66, 14, 53, 43, 57, 44, 62, 83, 21, 54, 54, 43, 76, 66,\n", + " 40, 66, 42, 54, 55, 64, 66, 54, 45, 66, 47, 48, 58, 66, 57, 44, 55, 54,\n", + " 57, 59, 66, 54, 53, 66, 40, 53, 66, 44, 53, 56, 60, 48, 57, 64, 83, 47,\n", + " 44, 66, 47, 40, 43, 66, 52, 40, 43, 44, 66, 48, 53, 59, 54, 66, 59, 47,\n", + " 44, 66, 55, 57, 54, 41, 51, 44, 52, 58, 66, 59, 47, 40, 59, 66, 41, 44,\n", + " 58, 44, 59, 83, 54, 51, 43, 44, 57, 66, 62, 54, 57, 50, 44, 57, 58, 66,\n", + " 40, 53, 43, 66, 59, 47, 44, 66, 44, 45, 45, 44, 42, 59, 58, 66, 54, 45,\n", + " 66, 57, 44, 59, 48, 57, 44, 52, 44, 53, 59, 83, 21, 60, 46, 44, 53, 59,\n", + " 54, 41, 51, 44, 57, 66, 31, 54, 46, 44, 57, 2, 3, 3, 3, 3, 3, 3,\n", + " 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,\n", + " 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,\n", + " 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,\n", + " 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,\n", + " 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,\n", + " 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,\n", + " 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,\n", + " 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,\n", + " 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,\n", + " 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,\n", + " 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3])" + ] + }, + "execution_count": 164, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "y" + ] + }, + { + "cell_type": "code", + "execution_count": 165, + "id": "45649194", + "metadata": {}, + "outputs": [], + "source": [ + "from text_recognizer.data.iam_preprocessor import Preprocessor\n", + "from pathlib import Path" + ] + }, + { + "cell_type": "code", + "execution_count": 166, + "id": "0fc13f9f", + "metadata": {}, + "outputs": [], + "source": [ + "processor = Preprocessor(\n", + " data_dir=Path(\"../data/downloaded/iam/iamdb\"),\n", + " num_features=1000,\n", + " lexicon_path=Path(\"../data/processed/iam_lines/iamdb_1kwp_lex_1000.txt\"),\n", + " tokens_path=Path(\"../data/processed/iam_lines/iamdb_1kwp_tokens_1000.txt\"),\n", + " use_words=True,\n", + " prepend_wordsep=False,\n", + " special_tokens=[\"<s>\", \"<e>\", \"<p>\", \"\\n\"]\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 167, + "id": "d08a0259", + "metadata": {}, + "outputs": [], + "source": [ + "t = convert_y_label_to_string(y, dataset.mapping)" + ] + }, + { + "cell_type": "code", + "execution_count": 168, + "id": "a16a2cb7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "\"<s>The latter do not regard themselves as expert\\nadvisers, but are prepared to seek out the\\nappropriate sources of information or advice.\\nTowards the end of 1956, Mr. Daniel Grant,\\nan Employee Relations Officer of\\nRolls-Royce Ltd. and a member of the\\nWorkers' Educational Association, submitted\\nto the Lord Provost of Glasgow, Dr. Andrew\\nHood, a copy of his report on an enquiry\\nhe had made into the problems that beset\\nolder workers and the effects of retirement\\nHugentobler Roger<e>\"" + ] + }, + "execution_count": 168, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "t" + ] + }, + { + "cell_type": "code", + "execution_count": 170, + "id": "c7a33b2d", + "metadata": {}, + "outputs": [], + "source": [ + "ii = processor.to_index(t.replace(\" \", \"▁\").lower())" + ] + }, + { + "cell_type": "code", + "execution_count": 171, + "id": "4e0a22f4", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "torch.Size([247])" + ] + }, + "execution_count": 171, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ii.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 172, + "id": "bc1c5ffb", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "torch.Size([247])" + ] + }, + "execution_count": 172, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ii.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 176, + "id": "8b7b0373", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "\"<s>▁the▁latter▁do▁not▁regard▁themselves▁as▁expert\\n▁advisers,▁but▁are▁prepared▁to▁seek▁out▁the\\n▁appropriate▁sources▁of▁information▁or▁advice.\\n▁towards▁the▁end▁of▁1956,▁mr.▁daniel▁grant,\\n▁an▁employee▁relations▁officer▁of\\n▁rolls-royce▁ltd.▁and▁a▁member▁of▁the\\n▁workers'▁educational▁association,▁submitted\\n▁to▁the▁lord▁provost▁of▁glasgow,▁dr.▁andrew\\n▁hood,▁a▁copy▁of▁his▁report▁on▁an▁enquiry\\n▁he▁had▁made▁into▁the▁problems▁that▁beset\\n▁older▁workers▁and▁the▁effects▁of▁retirement\\n▁hugentobler▁roger<e>\"" + ] + }, + "execution_count": 176, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "processor.to_text(ii)" + ] + }, + { + "cell_type": "code", "execution_count": 4, "id": "e7778ae2", "metadata": { |