summaryrefslogtreecommitdiff
path: root/notebooks
diff options
context:
space:
mode:
authorGustaf Rydholm <gustaf.rydholm@gmail.com>2021-04-08 23:38:03 +0200
committerGustaf Rydholm <gustaf.rydholm@gmail.com>2021-04-08 23:38:03 +0200
commite388cd95c77d37a51324cff9d84a809421bf97d3 (patch)
treed585545f85d03ea8a6907daba254821fddeb1589 /notebooks
parentf4629a0d4149d5870c9fd8ce83ff5d391bd7ddd3 (diff)
Bug fixes word pieces
Diffstat (limited to 'notebooks')
-rw-r--r--notebooks/03-look-at-iam-paragraphs.ipynb226
1 files changed, 211 insertions, 15 deletions
diff --git a/notebooks/03-look-at-iam-paragraphs.ipynb b/notebooks/03-look-at-iam-paragraphs.ipynb
index df92f99..4b82034 100644
--- a/notebooks/03-look-at-iam-paragraphs.ipynb
+++ b/notebooks/03-look-at-iam-paragraphs.ipynb
@@ -2,19 +2,10 @@
"cells": [
{
"cell_type": "code",
- "execution_count": 7,
+ "execution_count": 1,
"id": "6ce2519f",
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "The autoreload extension is already loaded. To reload it, use:\n",
- " %reload_ext autoreload\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"import os\n",
"os.environ['CUDA_VISIBLE_DEVICE'] = ''\n",
@@ -39,7 +30,7 @@
},
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": 162,
"id": "726ac25b",
"metadata": {},
"outputs": [],
@@ -56,7 +47,7 @@
},
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": 4,
"id": "42501428",
"metadata": {},
"outputs": [
@@ -64,7 +55,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
- "2021-04-03 21:55:37.196 | INFO | text_recognizer.data.iam_paragraphs:setup:104 - Loading IAM paragraph regions and lines for None...\n"
+ "2021-04-08 21:48:18.431 | INFO | text_recognizer.data.iam_paragraphs:setup:106 - Loading IAM paragraph regions and lines for None...\n"
]
},
{
@@ -76,7 +67,7 @@
"Input dims: (1, 576, 640)\n",
"Output dims: (682, 1)\n",
"Train/val/test sizes: 1046, 262, 231\n",
- "Train Batch x stats: (torch.Size([128, 1, 576, 640]), torch.float32, tensor(0.), tensor(0.0358), tensor(0.1021), tensor(1.))\n",
+ "Train Batch x stats: (torch.Size([128, 1, 576, 640]), torch.float32, tensor(0.), tensor(0.0371), tensor(0.1049), tensor(1.))\n",
"Train Batch y stats: (torch.Size([128, 682]), torch.int64, tensor(1), tensor(83))\n",
"Test Batch x stats: (torch.Size([128, 1, 576, 640]), torch.float32, tensor(0.), tensor(0.0284), tensor(0.0846), tensor(0.9373))\n",
"Test Batch y stats: (torch.Size([128, 682]), torch.int64, tensor(1), tensor(83))\n",
@@ -93,6 +84,211 @@
},
{
"cell_type": "code",
+ "execution_count": 163,
+ "id": "0cf22683",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "x, y = dataset.data_train[1]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 164,
+ "id": "98dd0ee6",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "tensor([ 1, 33, 47, 44, 66, 51, 40, 59, 59, 44, 57, 66, 43, 54, 66, 53, 54, 59,\n",
+ " 66, 57, 44, 46, 40, 57, 43, 66, 59, 47, 44, 52, 58, 44, 51, 61, 44, 58,\n",
+ " 66, 40, 58, 66, 44, 63, 55, 44, 57, 59, 83, 40, 43, 61, 48, 58, 44, 57,\n",
+ " 58, 76, 66, 41, 60, 59, 66, 40, 57, 44, 66, 55, 57, 44, 55, 40, 57, 44,\n",
+ " 43, 66, 59, 54, 66, 58, 44, 44, 50, 66, 54, 60, 59, 66, 59, 47, 44, 83,\n",
+ " 40, 55, 55, 57, 54, 55, 57, 48, 40, 59, 44, 66, 58, 54, 60, 57, 42, 44,\n",
+ " 58, 66, 54, 45, 66, 48, 53, 45, 54, 57, 52, 40, 59, 48, 54, 53, 66, 54,\n",
+ " 57, 66, 40, 43, 61, 48, 42, 44, 78, 83, 33, 54, 62, 40, 57, 43, 58, 66,\n",
+ " 59, 47, 44, 66, 44, 53, 43, 66, 54, 45, 66, 5, 13, 9, 10, 76, 66, 26,\n",
+ " 57, 78, 66, 17, 40, 53, 48, 44, 51, 66, 20, 57, 40, 53, 59, 76, 83, 40,\n",
+ " 53, 66, 18, 52, 55, 51, 54, 64, 44, 44, 66, 31, 44, 51, 40, 59, 48, 54,\n",
+ " 53, 58, 66, 28, 45, 45, 48, 42, 44, 57, 66, 54, 45, 83, 31, 54, 51, 51,\n",
+ " 58, 77, 31, 54, 64, 42, 44, 66, 25, 59, 43, 78, 66, 40, 53, 43, 66, 40,\n",
+ " 66, 52, 44, 52, 41, 44, 57, 66, 54, 45, 66, 59, 47, 44, 83, 36, 54, 57,\n",
+ " 50, 44, 57, 58, 71, 66, 18, 43, 60, 42, 40, 59, 48, 54, 53, 40, 51, 66,\n",
+ " 14, 58, 58, 54, 42, 48, 40, 59, 48, 54, 53, 76, 66, 58, 60, 41, 52, 48,\n",
+ " 59, 59, 44, 43, 83, 59, 54, 66, 59, 47, 44, 66, 25, 54, 57, 43, 66, 29,\n",
+ " 57, 54, 61, 54, 58, 59, 66, 54, 45, 66, 20, 51, 40, 58, 46, 54, 62, 76,\n",
+ " 66, 17, 57, 78, 66, 14, 53, 43, 57, 44, 62, 83, 21, 54, 54, 43, 76, 66,\n",
+ " 40, 66, 42, 54, 55, 64, 66, 54, 45, 66, 47, 48, 58, 66, 57, 44, 55, 54,\n",
+ " 57, 59, 66, 54, 53, 66, 40, 53, 66, 44, 53, 56, 60, 48, 57, 64, 83, 47,\n",
+ " 44, 66, 47, 40, 43, 66, 52, 40, 43, 44, 66, 48, 53, 59, 54, 66, 59, 47,\n",
+ " 44, 66, 55, 57, 54, 41, 51, 44, 52, 58, 66, 59, 47, 40, 59, 66, 41, 44,\n",
+ " 58, 44, 59, 83, 54, 51, 43, 44, 57, 66, 62, 54, 57, 50, 44, 57, 58, 66,\n",
+ " 40, 53, 43, 66, 59, 47, 44, 66, 44, 45, 45, 44, 42, 59, 58, 66, 54, 45,\n",
+ " 66, 57, 44, 59, 48, 57, 44, 52, 44, 53, 59, 83, 21, 60, 46, 44, 53, 59,\n",
+ " 54, 41, 51, 44, 57, 66, 31, 54, 46, 44, 57, 2, 3, 3, 3, 3, 3, 3,\n",
+ " 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,\n",
+ " 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,\n",
+ " 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,\n",
+ " 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,\n",
+ " 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,\n",
+ " 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,\n",
+ " 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,\n",
+ " 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,\n",
+ " 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,\n",
+ " 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,\n",
+ " 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3])"
+ ]
+ },
+ "execution_count": 164,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "y"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 165,
+ "id": "45649194",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from text_recognizer.data.iam_preprocessor import Preprocessor\n",
+ "from pathlib import Path"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 166,
+ "id": "0fc13f9f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "processor = Preprocessor(\n",
+ " data_dir=Path(\"../data/downloaded/iam/iamdb\"),\n",
+ " num_features=1000,\n",
+ " lexicon_path=Path(\"../data/processed/iam_lines/iamdb_1kwp_lex_1000.txt\"),\n",
+ " tokens_path=Path(\"../data/processed/iam_lines/iamdb_1kwp_tokens_1000.txt\"),\n",
+ " use_words=True,\n",
+ " prepend_wordsep=False,\n",
+ " special_tokens=[\"<s>\", \"<e>\", \"<p>\", \"\\n\"]\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 167,
+ "id": "d08a0259",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "t = convert_y_label_to_string(y, dataset.mapping)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 168,
+ "id": "a16a2cb7",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "\"<s>The latter do not regard themselves as expert\\nadvisers, but are prepared to seek out the\\nappropriate sources of information or advice.\\nTowards the end of 1956, Mr. Daniel Grant,\\nan Employee Relations Officer of\\nRolls-Royce Ltd. and a member of the\\nWorkers' Educational Association, submitted\\nto the Lord Provost of Glasgow, Dr. Andrew\\nHood, a copy of his report on an enquiry\\nhe had made into the problems that beset\\nolder workers and the effects of retirement\\nHugentobler Roger<e>\""
+ ]
+ },
+ "execution_count": 168,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "t"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 170,
+ "id": "c7a33b2d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ii = processor.to_index(t.replace(\" \", \"▁\").lower())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 171,
+ "id": "4e0a22f4",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "torch.Size([247])"
+ ]
+ },
+ "execution_count": 171,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "ii.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 172,
+ "id": "bc1c5ffb",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "torch.Size([247])"
+ ]
+ },
+ "execution_count": 172,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "ii.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 176,
+ "id": "8b7b0373",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "\"<s>▁the▁latter▁do▁not▁regard▁themselves▁as▁expert\\n▁advisers,▁but▁are▁prepared▁to▁seek▁out▁the\\n▁appropriate▁sources▁of▁information▁or▁advice.\\n▁towards▁the▁end▁of▁1956,▁mr.▁daniel▁grant,\\n▁an▁employee▁relations▁officer▁of\\n▁rolls-royce▁ltd.▁and▁a▁member▁of▁the\\n▁workers'▁educational▁association,▁submitted\\n▁to▁the▁lord▁provost▁of▁glasgow,▁dr.▁andrew\\n▁hood,▁a▁copy▁of▁his▁report▁on▁an▁enquiry\\n▁he▁had▁made▁into▁the▁problems▁that▁beset\\n▁older▁workers▁and▁the▁effects▁of▁retirement\\n▁hugentobler▁roger<e>\""
+ ]
+ },
+ "execution_count": 176,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "processor.to_text(ii)"
+ ]
+ },
+ {
+ "cell_type": "code",
"execution_count": 4,
"id": "e7778ae2",
"metadata": {