From ffa4be4bf4e3758e01d52a9c1f354a05a90b93de Mon Sep 17 00:00:00 2001
From: Gustaf Rydholm <gustaf.rydholm@gmail.com>
Date: Thu, 15 Apr 2021 22:05:24 +0200
Subject: Created mappings

---
 notebooks/03-look-at-iam-paragraphs.ipynb     | 2126 ++++++++++++++++++++++++-
 notebooks/04b-look-at-iam-paragraphs.ipynb    |    2 +-
 text_recognizer/data/iam_preprocessor.py      |    1 -
 text_recognizer/data/mapping.py               |    8 -
 text_recognizer/data/mappings.py              |  143 ++
 text_recognizer/data/transforms.py            |  111 +-
 text_recognizer/models/base.py                |    6 +-
 text_recognizer/networks/image_transformer.py |    4 +-
 training/configs/image_transformer.yaml       |    1 +
 training/run_experiment.py                    |    5 +-
 10 files changed, 2293 insertions(+), 114 deletions(-)
 delete mode 100644 text_recognizer/data/mapping.py
 create mode 100644 text_recognizer/data/mappings.py
diff --git a/notebooks/03-look-at-iam-paragraphs.ipynb b/notebooks/03-look-at-iam-paragraphs.ipynb
index cfa0ba5..7599e11 100644
--- a/notebooks/03-look-at-iam-paragraphs.ipynb
+++ b/notebooks/03-look-at-iam-paragraphs.ipynb
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 2,
    "id": "6ce2519f",
    "metadata": {},
    "outputs": [],
@@ -212,7 +212,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 3,
    "id": "45649194",
    "metadata": {},
    "outputs": [],
@@ -223,7 +223,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 11,
    "id": "0fc13f9f",
    "metadata": {},
    "outputs": [],
@@ -239,6 +239,1105 @@
     ")"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "c70e8873",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['!',\n",
+       " '\"',\n",
+       " '&',\n",
+       " \"'\",\n",
+       " '(',\n",
+       " ')',\n",
+       " '*',\n",
+       " '+',\n",
+       " ',',\n",
+       " '-',\n",
+       " '.',\n",
+       " '/',\n",
+       " '0',\n",
+       " '1',\n",
+       " '2',\n",
+       " '3',\n",
+       " '4',\n",
+       " '5',\n",
+       " '6',\n",
+       " '7',\n",
+       " '8',\n",
+       " '9',\n",
+       " ':',\n",
+       " ';',\n",
+       " '?',\n",
+       " 'a',\n",
+       " 'b',\n",
+       " 'c',\n",
+       " 'd',\n",
+       " 'e',\n",
+       " 'f',\n",
+       " 'g',\n",
+       " 'h',\n",
+       " 'i',\n",
+       " 'j',\n",
+       " 'k',\n",
+       " 'l',\n",
+       " 'm',\n",
+       " 'n',\n",
+       " 'o',\n",
+       " 'p',\n",
+       " 'q',\n",
+       " 'r',\n",
+       " 's',\n",
+       " 't',\n",
+       " 'u',\n",
+       " 'v',\n",
+       " 'w',\n",
+       " 'x',\n",
+       " 'y',\n",
+       " 'z',\n",
+       " '▁',\n",
+       " '<s>',\n",
+       " '<e>',\n",
+       " '<p>',\n",
+       " '\\n']"
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "processor.graphemes"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "6def52f6",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['/',\n",
+       " '▁',\n",
+       " 's',\n",
+       " '▁the',\n",
+       " ',',\n",
+       " '▁.',\n",
+       " '▁a',\n",
+       " 'e',\n",
+       " '▁to',\n",
+       " 'ed',\n",
+       " '▁of',\n",
+       " '▁in',\n",
+       " 'ing',\n",
+       " 'y',\n",
+       " '▁and',\n",
+       " 't',\n",
+       " 'm',\n",
+       " 'd',\n",
+       " '▁be',\n",
+       " 'at',\n",
+       " 'a',\n",
+       " 'n',\n",
+       " '▁th',\n",
+       " 'i',\n",
+       " '▁he',\n",
+       " 'p',\n",
+       " 'o',\n",
+       " 'en',\n",
+       " 'al',\n",
+       " '▁is',\n",
+       " '-',\n",
+       " 'ly',\n",
+       " '.',\n",
+       " '▁w',\n",
+       " 'c',\n",
+       " '▁for',\n",
+       " 'l',\n",
+       " '\"',\n",
+       " '▁it',\n",
+       " 'er',\n",
+       " '▁was',\n",
+       " 'u',\n",
+       " '▁on',\n",
+       " '▁with',\n",
+       " \"'\",\n",
+       " '▁his',\n",
+       " 'g',\n",
+       " 'h',\n",
+       " '▁had',\n",
+       " '▁mr',\n",
+       " 'in',\n",
+       " '▁i',\n",
+       " 'or',\n",
+       " '▁f',\n",
+       " 'r',\n",
+       " \"▁'\",\n",
+       " '▁but',\n",
+       " '▁by',\n",
+       " 'ar',\n",
+       " '▁as',\n",
+       " '▁not',\n",
+       " 'b',\n",
+       " 'it',\n",
+       " 'ir',\n",
+       " 'v',\n",
+       " '▁re',\n",
+       " 'st',\n",
+       " '▁an',\n",
+       " '▁has',\n",
+       " '▁are',\n",
+       " 'this',\n",
+       " 'f',\n",
+       " 'on',\n",
+       " 'hich',\n",
+       " 'ce',\n",
+       " '▁who',\n",
+       " '▁have',\n",
+       " 'ion',\n",
+       " 'ch',\n",
+       " '▁p',\n",
+       " 're',\n",
+       " '▁at',\n",
+       " '▁de',\n",
+       " '▁m',\n",
+       " 'w',\n",
+       " 'le',\n",
+       " 'ri',\n",
+       " '▁so',\n",
+       " '▁fro',\n",
+       " 'ation',\n",
+       " 'ic',\n",
+       " '▁do',\n",
+       " 'ould',\n",
+       " '▁will',\n",
+       " '▁b',\n",
+       " 'k',\n",
+       " 'is',\n",
+       " 'ment',\n",
+       " 'il',\n",
+       " '▁st',\n",
+       " '▁no',\n",
+       " 'an',\n",
+       " 'ter',\n",
+       " 'se',\n",
+       " '▁we',\n",
+       " 'ur',\n",
+       " 'ies',\n",
+       " '▁un',\n",
+       " '▁her',\n",
+       " 'th',\n",
+       " 'es',\n",
+       " 'ent',\n",
+       " 'et',\n",
+       " '▁-',\n",
+       " '▁all',\n",
+       " '▁c',\n",
+       " '▁there',\n",
+       " 'un',\n",
+       " 've',\n",
+       " '▁can',\n",
+       " 'ne',\n",
+       " '▁up',\n",
+       " '▁one',\n",
+       " '▁were',\n",
+       " '▁me',\n",
+       " '▁br',\n",
+       " '▁com',\n",
+       " '▁more',\n",
+       " '▁se',\n",
+       " 'ro',\n",
+       " '▁per',\n",
+       " 'ld',\n",
+       " 'ge',\n",
+       " '▁man',\n",
+       " 'est',\n",
+       " '▁pa',\n",
+       " 'ul',\n",
+       " 'el',\n",
+       " 'if',\n",
+       " '▁government',\n",
+       " '▁or',\n",
+       " '▁she',\n",
+       " 'ill',\n",
+       " '▁co',\n",
+       " '▁year',\n",
+       " 'ad',\n",
+       " '▁bo',\n",
+       " 'te',\n",
+       " '▁than',\n",
+       " '▁r',\n",
+       " 'll',\n",
+       " 'able',\n",
+       " 'ous',\n",
+       " '▁what',\n",
+       " '▁out',\n",
+       " 'ck',\n",
+       " '!',\n",
+       " '▁new',\n",
+       " '▁dis',\n",
+       " 'lo',\n",
+       " 'ra',\n",
+       " 'uch',\n",
+       " '▁him',\n",
+       " 'age',\n",
+       " 'ct',\n",
+       " '▁go',\n",
+       " '▁con',\n",
+       " 'day',\n",
+       " 'z',\n",
+       " 'op',\n",
+       " 'ther',\n",
+       " '▁you',\n",
+       " '▁dr',\n",
+       " 'us',\n",
+       " '▁1',\n",
+       " '▁may',\n",
+       " 'ers',\n",
+       " ':',\n",
+       " '▁first',\n",
+       " 'li',\n",
+       " '0',\n",
+       " 'end',\n",
+       " 'ng',\n",
+       " '▁people',\n",
+       " '▁some',\n",
+       " 'our',\n",
+       " '▁anglesey',\n",
+       " '▁two',\n",
+       " '▁wh',\n",
+       " 'tion',\n",
+       " '▁lo',\n",
+       " '▁sir',\n",
+       " 'ig',\n",
+       " 'ain',\n",
+       " 'tic',\n",
+       " '▁now',\n",
+       " 'ive',\n",
+       " 'and',\n",
+       " '▁whe',\n",
+       " ';',\n",
+       " 'ry',\n",
+       " 'um',\n",
+       " '▁other',\n",
+       " '▁su',\n",
+       " '▁ho',\n",
+       " '▁ha',\n",
+       " '▁ra',\n",
+       " '▁even',\n",
+       " 'ish',\n",
+       " 'ard',\n",
+       " 'wn',\n",
+       " '▁li',\n",
+       " 'ten',\n",
+       " 'ok',\n",
+       " '▁k',\n",
+       " 'ver',\n",
+       " 'into',\n",
+       " '▁na',\n",
+       " '▁vi',\n",
+       " '▁po',\n",
+       " '▁common',\n",
+       " 'ight',\n",
+       " '▁over',\n",
+       " '▁like',\n",
+       " '▁mad',\n",
+       " 'ud',\n",
+       " 'ow',\n",
+       " '▁any',\n",
+       " '▁again',\n",
+       " '▁great',\n",
+       " '▁last',\n",
+       " 'ough',\n",
+       " 'ose',\n",
+       " 'ant',\n",
+       " '▁under',\n",
+       " 'ful',\n",
+       " 'self',\n",
+       " '▁how',\n",
+       " '▁sai',\n",
+       " 'x',\n",
+       " '▁most',\n",
+       " 'ine',\n",
+       " '▁give',\n",
+       " '▁about',\n",
+       " 'ance',\n",
+       " 'me',\n",
+       " 'fore',\n",
+       " 'cu',\n",
+       " '(',\n",
+       " '▁minister',\n",
+       " '▁labour',\n",
+       " '▁talk',\n",
+       " '▁work',\n",
+       " '▁take',\n",
+       " '▁writ',\n",
+       " '▁whil',\n",
+       " '▁well',\n",
+       " '▁plan',\n",
+       " '▁la',\n",
+       " '▁pri',\n",
+       " '▁play',\n",
+       " '▁house',\n",
+       " 'mber',\n",
+       " '▁ex',\n",
+       " '▁good',\n",
+       " 'ern',\n",
+       " 'ple',\n",
+       " 'ble',\n",
+       " '▁german',\n",
+       " 'ak',\n",
+       " 'ever',\n",
+       " '▁time',\n",
+       " '▁pro',\n",
+       " '▁ga',\n",
+       " '▁germany',\n",
+       " '▁britain',\n",
+       " '▁should',\n",
+       " '▁polic',\n",
+       " '▁found',\n",
+       " '▁part',\n",
+       " 'ood',\n",
+       " '▁g',\n",
+       " '▁west',\n",
+       " 'thing',\n",
+       " '▁cur',\n",
+       " '▁know',\n",
+       " '▁back',\n",
+       " '▁ma',\n",
+       " '▁show',\n",
+       " ')',\n",
+       " 'ure',\n",
+       " 'ian',\n",
+       " 'app',\n",
+       " '▁ever',\n",
+       " '▁use',\n",
+       " 'ally',\n",
+       " 'led',\n",
+       " '▁miss',\n",
+       " '▁car',\n",
+       " '▁nation',\n",
+       " 'ttle',\n",
+       " '▁end',\n",
+       " '▁gr',\n",
+       " '▁must',\n",
+       " '▁tr',\n",
+       " '▁du',\n",
+       " '▁get',\n",
+       " '▁unit',\n",
+       " 'less',\n",
+       " '▁agree',\n",
+       " 'ie',\n",
+       " 'tom',\n",
+       " '..',\n",
+       " '▁di',\n",
+       " '▁men',\n",
+       " '4',\n",
+       " '▁my',\n",
+       " '▁today',\n",
+       " 'ves',\n",
+       " 'hap',\n",
+       " 'tory',\n",
+       " 'ence',\n",
+       " 'out',\n",
+       " '▁its',\n",
+       " '8',\n",
+       " '▁believe',\n",
+       " '▁say',\n",
+       " '▁make',\n",
+       " 'for',\n",
+       " '▁them',\n",
+       " '▁president',\n",
+       " '▁return',\n",
+       " '▁british',\n",
+       " '▁after',\n",
+       " '▁earl',\n",
+       " '▁here',\n",
+       " 'ton',\n",
+       " 'ext',\n",
+       " '▁life',\n",
+       " '▁put',\n",
+       " '▁war',\n",
+       " '▁2',\n",
+       " '▁cont',\n",
+       " '▁every',\n",
+       " '▁god',\n",
+       " '▁strong',\n",
+       " 'ren',\n",
+       " 'ty',\n",
+       " '▁mon',\n",
+       " 'man',\n",
+       " 'very',\n",
+       " '5',\n",
+       " '6',\n",
+       " '▁yester',\n",
+       " '▁week',\n",
+       " '▁film',\n",
+       " 'ual',\n",
+       " '▁need',\n",
+       " 'lso',\n",
+       " 'aid',\n",
+       " 'law',\n",
+       " 'ward',\n",
+       " '▁ask',\n",
+       " 'bo',\n",
+       " '▁pre',\n",
+       " '▁conference',\n",
+       " '▁real',\n",
+       " '▁roy',\n",
+       " 'ince',\n",
+       " '▁us',\n",
+       " 'ke',\n",
+       " '▁franc',\n",
+       " '▁direct',\n",
+       " '▁child',\n",
+       " 'cause',\n",
+       " '▁long',\n",
+       " '▁action',\n",
+       " '▁hope',\n",
+       " '▁live',\n",
+       " '▁see',\n",
+       " '▁point',\n",
+       " '▁char',\n",
+       " '▁way',\n",
+       " 'val',\n",
+       " '▁add',\n",
+       " '▁ba',\n",
+       " 'tt',\n",
+       " 'qui',\n",
+       " 'ism',\n",
+       " 'other',\n",
+       " '▁character',\n",
+       " '&',\n",
+       " '▁kennedy',\n",
+       " '▁macmillan',\n",
+       " '▁service',\n",
+       " '▁open',\n",
+       " '▁tri',\n",
+       " '▁country',\n",
+       " 'orn',\n",
+       " 'abl',\n",
+       " '▁off',\n",
+       " '▁fac',\n",
+       " '▁effective',\n",
+       " 'rote',\n",
+       " '▁let',\n",
+       " 'each',\n",
+       " '▁cent',\n",
+       " '▁party',\n",
+       " '▁act',\n",
+       " '▁prim',\n",
+       " '▁str',\n",
+       " 'amp',\n",
+       " '▁3',\n",
+       " '▁describe',\n",
+       " '▁europe',\n",
+       " '▁through',\n",
+       " '▁medic',\n",
+       " '▁home',\n",
+       " 'ug',\n",
+       " '▁african',\n",
+       " 'ind',\n",
+       " '▁prop',\n",
+       " 'by',\n",
+       " '▁thre',\n",
+       " 'ndon',\n",
+       " 'ative',\n",
+       " '▁sit',\n",
+       " ',000',\n",
+       " '▁high',\n",
+       " '▁secret',\n",
+       " '▁night',\n",
+       " '▁fr',\n",
+       " 'health',\n",
+       " '▁call',\n",
+       " '▁north',\n",
+       " '▁mean',\n",
+       " '▁seem',\n",
+       " '▁lea',\n",
+       " 'tire',\n",
+       " 'lway',\n",
+       " 'ness',\n",
+       " '▁rhodesia',\n",
+       " '▁195',\n",
+       " '▁hand',\n",
+       " '▁federa',\n",
+       " '▁place',\n",
+       " 'round',\n",
+       " '▁far',\n",
+       " 'ship',\n",
+       " 'low',\n",
+       " '▁exa',\n",
+       " 'ip',\n",
+       " '▁change',\n",
+       " '▁patient',\n",
+       " '▁lord',\n",
+       " '▁arriv',\n",
+       " '▁trem',\n",
+       " '▁pear',\n",
+       " 'ring',\n",
+       " '▁near',\n",
+       " 'land',\n",
+       " 'ress',\n",
+       " '▁clo',\n",
+       " '▁did',\n",
+       " 'later',\n",
+       " 'ius',\n",
+       " 'tho',\n",
+       " 'well',\n",
+       " '7',\n",
+       " '▁enjoy',\n",
+       " '▁success',\n",
+       " '▁appear',\n",
+       " '▁personal',\n",
+       " '▁move',\n",
+       " '▁treat',\n",
+       " '▁better',\n",
+       " 'case',\n",
+       " '▁never',\n",
+       " '▁look',\n",
+       " 'ture',\n",
+       " 'way',\n",
+       " 'mark',\n",
+       " 'ought',\n",
+       " 'tu',\n",
+       " '▁hu',\n",
+       " 'twe',\n",
+       " '▁foreign',\n",
+       " '▁number',\n",
+       " '▁fall',\n",
+       " '▁month',\n",
+       " '▁young',\n",
+       " 'ound',\n",
+       " '▁find',\n",
+       " '▁american',\n",
+       " '▁thought',\n",
+       " '▁state',\n",
+       " 'ilit',\n",
+       " '▁cit',\n",
+       " '▁side',\n",
+       " '▁centr',\n",
+       " 'lace',\n",
+       " 'ash',\n",
+       " '▁old',\n",
+       " 'rome',\n",
+       " '▁subject',\n",
+       " '▁support',\n",
+       " '▁world',\n",
+       " '▁word',\n",
+       " 'different',\n",
+       " '▁report',\n",
+       " '1',\n",
+       " '▁tru',\n",
+       " '▁sent',\n",
+       " '▁best',\n",
+       " '▁symp',\n",
+       " '▁just',\n",
+       " '▁richards',\n",
+       " '▁produc',\n",
+       " '▁left',\n",
+       " '▁clear',\n",
+       " '▁power',\n",
+       " 'ber',\n",
+       " '▁few',\n",
+       " 'solut',\n",
+       " 'tack',\n",
+       " '▁right',\n",
+       " '▁mar',\n",
+       " '▁might',\n",
+       " '3',\n",
+       " 'not',\n",
+       " '▁full',\n",
+       " '▁union',\n",
+       " '▁base',\n",
+       " '▁head',\n",
+       " 'ealth',\n",
+       " 'many',\n",
+       " '▁happen',\n",
+       " '▁mauro',\n",
+       " 'vision',\n",
+       " '▁inter',\n",
+       " '▁ye',\n",
+       " 'where',\n",
+       " '▁sail',\n",
+       " '▁past',\n",
+       " '▁your',\n",
+       " '▁want',\n",
+       " '▁stor',\n",
+       " 'went',\n",
+       " '▁eas',\n",
+       " '▁rem',\n",
+       " '▁nuclear',\n",
+       " '▁small',\n",
+       " '▁discuss',\n",
+       " 'spect',\n",
+       " '▁girl',\n",
+       " '▁view',\n",
+       " '▁south',\n",
+       " '▁mod',\n",
+       " 'elect',\n",
+       " '▁usual',\n",
+       " 'evi',\n",
+       " 'peration',\n",
+       " '▁engl',\n",
+       " 'train',\n",
+       " '▁mind',\n",
+       " 'acle',\n",
+       " 'riv',\n",
+       " 'side',\n",
+       " 'aris',\n",
+       " '▁event',\n",
+       " '▁lei',\n",
+       " '▁favour',\n",
+       " '▁problem',\n",
+       " '▁large',\n",
+       " 'ordina',\n",
+       " '▁feel',\n",
+       " 'search',\n",
+       " '▁wife',\n",
+       " '▁remain',\n",
+       " 'ready',\n",
+       " 'stone',\n",
+       " '▁cla',\n",
+       " '▁stand',\n",
+       " '▁big',\n",
+       " '▁bus',\n",
+       " 'ox',\n",
+       " '▁four',\n",
+       " 'port',\n",
+       " '▁main',\n",
+       " '▁nor',\n",
+       " '▁va',\n",
+       " 'mann',\n",
+       " '▁pot',\n",
+       " 'rry',\n",
+       " '▁fan',\n",
+       " '▁fig',\n",
+       " '▁attempt',\n",
+       " '▁follow',\n",
+       " '▁particular',\n",
+       " '▁church',\n",
+       " '▁wonder',\n",
+       " '▁doctor',\n",
+       " '▁control',\n",
+       " '▁half',\n",
+       " '▁politic',\n",
+       " 'ciple',\n",
+       " '▁trans',\n",
+       " '▁extra',\n",
+       " '▁start',\n",
+       " '▁death',\n",
+       " '▁tell',\n",
+       " '▁eye',\n",
+       " '▁own',\n",
+       " '▁allow',\n",
+       " '▁offer',\n",
+       " '▁quit',\n",
+       " '▁easi',\n",
+       " 'ular',\n",
+       " '▁che',\n",
+       " '▁cri',\n",
+       " 'bit',\n",
+       " '▁bar',\n",
+       " 'ption',\n",
+       " 'rate',\n",
+       " '▁system',\n",
+       " 'crochet',\n",
+       " '▁certain',\n",
+       " '▁taste',\n",
+       " '▁method',\n",
+       " '▁course',\n",
+       " '▁leader',\n",
+       " 'million',\n",
+       " '▁19',\n",
+       " '▁fu',\n",
+       " '▁help',\n",
+       " 'lady',\n",
+       " 'super',\n",
+       " 'date',\n",
+       " '▁fact',\n",
+       " '▁amo',\n",
+       " '▁sho',\n",
+       " 'nson',\n",
+       " 'lin',\n",
+       " '2',\n",
+       " '▁advi',\n",
+       " 'mitt',\n",
+       " '▁line',\n",
+       " 'effect',\n",
+       " '▁school',\n",
+       " '▁doubt',\n",
+       " '▁consult',\n",
+       " '▁answer',\n",
+       " 'special',\n",
+       " '▁burde',\n",
+       " '▁room',\n",
+       " '▁probabl',\n",
+       " '▁delight',\n",
+       " '▁bank',\n",
+       " '▁six',\n",
+       " '▁brown',\n",
+       " '▁studie',\n",
+       " '▁human',\n",
+       " '▁committee',\n",
+       " 'cial',\n",
+       " '▁creat',\n",
+       " '▁distin',\n",
+       " '▁cut',\n",
+       " '▁mother',\n",
+       " 'morr',\n",
+       " 'ground',\n",
+       " '▁free',\n",
+       " '▁five',\n",
+       " '▁arm',\n",
+       " '▁pol',\n",
+       " 'done',\n",
+       " '▁late',\n",
+       " '▁basi',\n",
+       " 'nato',\n",
+       " '9',\n",
+       " '▁econom',\n",
+       " '▁opposit',\n",
+       " '▁public',\n",
+       " '▁chief',\n",
+       " '▁become',\n",
+       " '▁record',\n",
+       " '▁peace',\n",
+       " '▁consider',\n",
+       " '▁understand',\n",
+       " '▁numer',\n",
+       " '▁slight',\n",
+       " '▁turn',\n",
+       " '▁duke',\n",
+       " '▁homo',\n",
+       " '▁grow',\n",
+       " '▁grant',\n",
+       " '▁office',\n",
+       " 'msel',\n",
+       " 'ainful',\n",
+       " '▁famil',\n",
+       " '▁ob',\n",
+       " '▁advise',\n",
+       " '▁cr',\n",
+       " '▁love',\n",
+       " 'ches',\n",
+       " '▁tea',\n",
+       " 'tist',\n",
+       " 'ivate',\n",
+       " '▁america',\n",
+       " 'aff',\n",
+       " 'author',\n",
+       " '▁gaitskell',\n",
+       " '▁industry',\n",
+       " '▁john',\n",
+       " '▁mistake',\n",
+       " '▁pleasant',\n",
+       " '▁temperature',\n",
+       " '▁difficult',\n",
+       " '▁brief',\n",
+       " '▁defence',\n",
+       " '▁gentle',\n",
+       " '▁piece',\n",
+       " '▁argu',\n",
+       " '▁irrita',\n",
+       " '▁berlin',\n",
+       " '▁father',\n",
+       " '▁cult',\n",
+       " '▁despa',\n",
+       " '▁market',\n",
+       " '▁white',\n",
+       " '▁negotiation',\n",
+       " '▁energ',\n",
+       " '▁fear',\n",
+       " '▁english',\n",
+       " '▁tele',\n",
+       " '▁interest',\n",
+       " 'same',\n",
+       " 'body',\n",
+       " '▁chap',\n",
+       " '▁table',\n",
+       " '▁mass',\n",
+       " '▁press',\n",
+       " '▁spi',\n",
+       " 'lay',\n",
+       " 'hall',\n",
+       " 'most',\n",
+       " '▁journ',\n",
+       " '▁diploma',\n",
+       " '▁expect',\n",
+       " '▁1960',\n",
+       " '▁join',\n",
+       " '▁provide',\n",
+       " '▁russell',\n",
+       " '▁speak',\n",
+       " '▁decide',\n",
+       " '▁complete',\n",
+       " '▁senator',\n",
+       " '▁tax',\n",
+       " '▁earth',\n",
+       " '▁colour',\n",
+       " 'armament',\n",
+       " '▁sevent',\n",
+       " '▁learn',\n",
+       " '▁honey',\n",
+       " '▁militar',\n",
+       " '▁claim',\n",
+       " '▁chamb',\n",
+       " '▁master',\n",
+       " '▁peel',\n",
+       " '▁firm',\n",
+       " 'eason',\n",
+       " '▁trade',\n",
+       " '▁washing',\n",
+       " '▁statement',\n",
+       " 'range',\n",
+       " '▁rol',\n",
+       " 'guis',\n",
+       " 'hei',\n",
+       " 'ago',\n",
+       " '▁pla',\n",
+       " '▁cam',\n",
+       " 'eopa',\n",
+       " '▁africa',\n",
+       " '▁convention',\n",
+       " '▁council',\n",
+       " '▁decision',\n",
+       " '▁experience',\n",
+       " '▁increasing',\n",
+       " '▁majorit',\n",
+       " '▁suffer',\n",
+       " '▁suggest',\n",
+       " '▁position',\n",
+       " '▁resource',\n",
+       " '▁soldier',\n",
+       " 'temper',\n",
+       " '▁general',\n",
+       " '▁financi',\n",
+       " '▁express',\n",
+       " '▁short',\n",
+       " '▁compan',\n",
+       " 'field',\n",
+       " '▁think',\n",
+       " 'cross',\n",
+       " '▁odd',\n",
+       " '▁1830',\n",
+       " '▁reason',\n",
+       " '▁hard',\n",
+       " '▁fright',\n",
+       " '▁final',\n",
+       " '▁thread',\n",
+       " 'heart',\n",
+       " '▁nea',\n",
+       " '▁assu',\n",
+       " '▁step',\n",
+       " '▁countr',\n",
+       " 'did',\n",
+       " '▁infa',\n",
+       " 'struct',\n",
+       " '▁question',\n",
+       " 'advantage',\n",
+       " '▁atmospher',\n",
+       " '▁screw',\n",
+       " '▁square',\n",
+       " '▁welensky',\n",
+       " '▁cinema',\n",
+       " '▁weapon',\n",
+       " '▁importan',\n",
+       " '▁apart',\n",
+       " '▁prepar',\n",
+       " '▁payment',\n",
+       " '▁present',\n",
+       " '▁persever',\n",
+       " 'simpl',\n",
+       " '▁idea',\n",
+       " '▁expl',\n",
+       " '▁chance',\n",
+       " '▁fast',\n",
+       " '▁notice',\n",
+       " 'rade',\n",
+       " '▁giv',\n",
+       " '▁leave',\n",
+       " 'finger',\n",
+       " 'eppe',\n",
+       " '▁gla',\n",
+       " '▁psycholog',\n",
+       " '▁robertson',\n",
+       " '▁accept',\n",
+       " '▁improv',\n",
+       " '▁philip',\n",
+       " '▁technical',\n",
+       " '▁trouble',\n",
+       " '▁premier',\n",
+       " '▁progress',\n",
+       " 'minute',\n",
+       " '▁scot',\n",
+       " '▁couple',\n",
+       " '▁concern',\n",
+       " '▁schem',\n",
+       " '▁weaver',\n",
+       " '▁inform',\n",
+       " '▁cap',\n",
+       " 'stead',\n",
+       " '▁heav',\n",
+       " '▁propos',\n",
+       " '▁order',\n",
+       " 'ensure',\n",
+       " 'worth',\n",
+       " '▁former',\n",
+       " '▁sense',\n",
+       " '▁clu',\n",
+       " 'nder',\n",
+       " 'easing',\n",
+       " '▁possib',\n",
+       " 'ulate',\n",
+       " 'polit',\n",
+       " 'equa',\n",
+       " 'communist',\n",
+       " '▁adenauer',\n",
+       " '▁delaney',\n",
+       " '▁demonstrat',\n",
+       " '▁independen',\n",
+       " '▁welcome',\n",
+       " '▁speech',\n",
+       " '▁includ',\n",
+       " '▁oppose',\n",
+       " 'timate',\n",
+       " '▁picture',\n",
+       " '▁share',\n",
+       " '▁result',\n",
+       " '▁receiv',\n",
+       " '▁garden',\n",
+       " '▁appeal',\n",
+       " '▁chair',\n",
+       " '▁keep',\n",
+       " '▁regard',\n",
+       " '▁chines',\n",
+       " '▁sister',\n",
+       " '▁prince',\n",
+       " '▁scene',\n",
+       " '▁spend',\n",
+       " '▁congo',\n",
+       " '▁issue',\n",
+       " '▁authorit',\n",
+       " '▁memor',\n",
+       " '▁scar',\n",
+       " '▁moral',\n",
+       " '▁matter',\n",
+       " 'aught',\n",
+       " '▁wait',\n",
+       " 'street',\n",
+       " '▁concept',\n",
+       " '▁destructi',\n",
+       " 'sol',\n",
+       " 'committe',\n",
+       " 'house',\n",
+       " 'older',\n",
+       " 'ference',\n",
+       " '▁adjust',\n",
+       " '▁associat',\n",
+       " '▁campaign',\n",
+       " '▁conservative',\n",
+       " '▁immediate',\n",
+       " '▁industri',\n",
+       " '▁james',\n",
+       " '▁lucas',\n",
+       " '▁practic',\n",
+       " '▁publish',\n",
+       " '▁purpose',\n",
+       " '▁queer',\n",
+       " '▁representative',\n",
+       " '▁relief',\n",
+       " '▁detail',\n",
+       " '▁official',\n",
+       " '▁faith',\n",
+       " '▁design',\n",
+       " '▁sacr',\n",
+       " '▁household',\n",
+       " '▁variet',\n",
+       " '▁village',\n",
+       " '▁foot',\n",
+       " '▁russia',\n",
+       " '▁fifty',\n",
+       " '▁block',\n",
+       " '▁inevitab',\n",
+       " '▁prevent',\n",
+       " '▁belgi',\n",
+       " '▁type',\n",
+       " '▁cheap',\n",
+       " '▁protest',\n",
+       " '▁along',\n",
+       " '▁demand',\n",
+       " '▁earn',\n",
+       " '▁sick',\n",
+       " '▁eviden',\n",
+       " '▁someone',\n",
+       " '▁peers',\n",
+       " '▁suitabl',\n",
+       " '▁sympath',\n",
+       " 'brid',\n",
+       " '▁discr',\n",
+       " 'flu',\n",
+       " 'bomb',\n",
+       " 'rator',\n",
+       " 'coming',\n",
+       " 'famili',\n",
+       " '▁183',\n",
+       " 'employ',\n",
+       " '▁catherine',\n",
+       " '▁correspond',\n",
+       " '▁material',\n",
+       " '▁rubb',\n",
+       " '▁measure',\n",
+       " '▁sharp',\n",
+       " '▁wicki',\n",
+       " '▁marriage',\n",
+       " '▁size',\n",
+       " '▁liberal',\n",
+       " '▁level',\n",
+       " '▁surround',\n",
+       " '▁delegat',\n",
+       " 'script',\n",
+       " '▁suppl',\n",
+       " '▁reserve',\n",
+       " '▁refus',\n",
+       " '▁gift',\n",
+       " '▁promis',\n",
+       " '▁safe',\n",
+       " '▁exist',\n",
+       " '▁alto',\n",
+       " '▁wide',\n",
+       " '▁arrang',\n",
+       " '▁relations',\n",
+       " 'shab',\n",
+       " '▁intelligen',\n",
+       " 'emphasi',\n",
+       " '▁advertisement',\n",
+       " '▁beautiful',\n",
+       " '+',\n",
+       " '?',\n",
+       " 'q',\n",
+       " 'j',\n",
+       " ...]"
+      ]
+     },
+     "execution_count": 13,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "processor.tokens"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 8,
@@ -260,6 +1359,1027 @@
     "len(processor.tokens)"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "fb600959",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'!': ['▁', '!'],\n",
+       " '\"': ['▁', '\"'],\n",
+       " '&': ['▁', '&'],\n",
+       " \"'\": [\"▁'\"],\n",
+       " \"'30s\": [\"▁'\", '3', '0', 's'],\n",
+       " \"'61\": [\"▁'\", '6', '1'],\n",
+       " \"'d\": [\"▁'\", 'd'],\n",
+       " \"'ll\": [\"▁'\", 'll'],\n",
+       " \"'m\": [\"▁'\", 'm'],\n",
+       " \"'re\": [\"▁'\", 're'],\n",
+       " \"'s\": [\"▁'\", 's'],\n",
+       " \"'ve\": [\"▁'\", 've'],\n",
+       " '(': ['▁', '('],\n",
+       " ')': ['▁', ')'],\n",
+       " '+2.8': ['▁', '+', '2', '.', '8'],\n",
+       " '+3.6': ['▁', '+', '3', '.', '6'],\n",
+       " ',': ['▁', ','],\n",
+       " '-': ['▁-'],\n",
+       " '-2.6': ['▁-', '2', '.', '6'],\n",
+       " '-5.4': ['▁-', '5', '.', '4'],\n",
+       " '.': ['▁.'],\n",
+       " '...': ['▁.', '..'],\n",
+       " '0m': ['▁', '0', 'm'],\n",
+       " '1': ['▁1'],\n",
+       " '1,157': ['▁1', ',', '1', '5', '7'],\n",
+       " '1.8': ['▁1', '.', '8'],\n",
+       " '1/2': ['▁1', '/', '2'],\n",
+       " '1/2-in.-long': ['▁1', '/', '2', '-', 'in', '.', '-', 'lo', 'ng'],\n",
+       " '1/4': ['▁1', '/', '4'],\n",
+       " '10': ['▁1', '0'],\n",
+       " '10,000': ['▁1', '0', ',000'],\n",
+       " '100': ['▁1', '0', '0'],\n",
+       " '100,000,000': ['▁1', '0', '0', ',000', ',000'],\n",
+       " '11': ['▁1', '1'],\n",
+       " '12': ['▁1', '2'],\n",
+       " '125': ['▁1', '2', '5'],\n",
+       " '13': ['▁1', '3'],\n",
+       " '13,000': ['▁1', '3', ',000'],\n",
+       " '14': ['▁1', '4'],\n",
+       " '15': ['▁1', '5'],\n",
+       " '15,000,000': ['▁1', '5', ',000', ',000'],\n",
+       " '15-17': ['▁1', '5', '-', '1', '7'],\n",
+       " '15-year-olds': ['▁1', '5', '-', 'y', 'e', 'ar', '-', 'o', 'ld', 's'],\n",
+       " '150,000,000': ['▁1', '5', '0', ',000', ',000'],\n",
+       " '16': ['▁1', '6'],\n",
+       " '160': ['▁1', '6', '0'],\n",
+       " '163,000,000': ['▁1', '6', '3', ',000', ',000'],\n",
+       " '167': ['▁1', '6', '7'],\n",
+       " '17': ['▁1', '7'],\n",
+       " '18.1': ['▁1', '8', '.', '1'],\n",
+       " '1830': ['▁1830'],\n",
+       " \"1830's\": ['▁1830', \"'\", 's'],\n",
+       " '1834': ['▁183', '4'],\n",
+       " '1897': ['▁1', '8', '9', '7'],\n",
+       " '19': ['▁19'],\n",
+       " '19.5': ['▁19', '.', '5'],\n",
+       " '1910': ['▁19', '1', '0'],\n",
+       " '1913': ['▁19', '1', '3'],\n",
+       " '1914': ['▁19', '1', '4'],\n",
+       " '1914-18': ['▁19', '1', '4', '-', '1', '8'],\n",
+       " '1920': ['▁19', '2', '0'],\n",
+       " '1931': ['▁19', '3', '1'],\n",
+       " '1932': ['▁19', '3', '2'],\n",
+       " '1934': ['▁19', '3', '4'],\n",
+       " '1939': ['▁19', '3', '9'],\n",
+       " '1943': ['▁19', '4', '3'],\n",
+       " '1944': ['▁19', '4', '4'],\n",
+       " '1950': ['▁195', '0'],\n",
+       " '1951': ['▁195', '1'],\n",
+       " '1954': ['▁195', '4'],\n",
+       " '1956': ['▁195', '6'],\n",
+       " '1957': ['▁195', '7'],\n",
+       " '1958': ['▁195', '8'],\n",
+       " '1959': ['▁195', '9'],\n",
+       " '1960': ['▁1960'],\n",
+       " '1960s': ['▁1960', 's'],\n",
+       " '1961': ['▁19', '6', '1'],\n",
+       " '1963': ['▁19', '6', '3'],\n",
+       " '19th': ['▁19', 'th'],\n",
+       " '1superceded': ['▁1', 'super', 'c', 'ed', 'ed'],\n",
+       " \"1tho'\": ['▁1', 'tho', \"'\"],\n",
+       " '2': ['▁2'],\n",
+       " '2,000': ['▁2', ',000'],\n",
+       " '2,415,000,000': ['▁2', ',', '4', '1', '5', ',000', ',000'],\n",
+       " '20': ['▁2', '0'],\n",
+       " '20-month-old': ['▁2', '0', '-', 'm', 'on', 'th', '-', 'o', 'ld'],\n",
+       " '200': ['▁2', '0', '0'],\n",
+       " '20th-century': ['▁2', '0', 'th', '-', 'c', 'ent', 'ur', 'y'],\n",
+       " '210million': ['▁2', '1', '0', 'million'],\n",
+       " '22': ['▁2', '2'],\n",
+       " '23.1': ['▁2', '3', '.', '1'],\n",
+       " '24-strong': ['▁2', '4', '-', 'st', 'ro', 'ng'],\n",
+       " '25': ['▁2', '5'],\n",
+       " '27': ['▁2', '7'],\n",
+       " '28.5': ['▁2', '8', '.', '5'],\n",
+       " '280,000': ['▁2', '8', '0', ',000'],\n",
+       " '287': ['▁2', '8', '7'],\n",
+       " '288': ['▁2', '8', '8'],\n",
+       " '2bhoys': ['▁2', 'b', 'h', 'o', 'y', 's'],\n",
+       " '2ole': ['▁2', 'o', 'le'],\n",
+       " '2skint': ['▁2', 's', 'k', 'in', 't'],\n",
+       " '3': ['▁3'],\n",
+       " '3,000': ['▁3', ',000'],\n",
+       " '3.6': ['▁3', '.', '6'],\n",
+       " '3/0': ['▁3', '/', '0'],\n",
+       " '30': ['▁3', '0'],\n",
+       " '30-minute': ['▁3', '0', '-', 'minute'],\n",
+       " '300,000': ['▁3', '0', '0', ',000'],\n",
+       " '32': ['▁3', '2'],\n",
+       " '34': ['▁3', '4'],\n",
+       " '35': ['▁3', '5'],\n",
+       " '357million': ['▁3', '5', '7', 'million'],\n",
+       " '36': ['▁3', '6'],\n",
+       " '37,000,000': ['▁3', '7', ',000', ',000'],\n",
+       " '37.2': ['▁3', '7', '.', '2'],\n",
+       " '38': ['▁3', '8'],\n",
+       " '4': ['▁', '4'],\n",
+       " '4.8': ['▁', '4', '.', '8'],\n",
+       " '40': ['▁', '4', '0'],\n",
+       " '400': ['▁', '4', '0', '0'],\n",
+       " '400,000': ['▁', '4', '0', '0', ',000'],\n",
+       " '43': ['▁', '4', '3'],\n",
+       " '450': ['▁', '4', '5', '0'],\n",
+       " '5': ['▁', '5'],\n",
+       " '5,000': ['▁', '5', ',000'],\n",
+       " '5.30': ['▁', '5', '.', '3', '0'],\n",
+       " '5/8': ['▁', '5', '/', '8'],\n",
+       " '50,000': ['▁', '5', '0', ',000'],\n",
+       " '500': ['▁', '5', '0', '0'],\n",
+       " '550,000': ['▁', '5', '5', '0', ',000'],\n",
+       " '58': ['▁', '5', '8'],\n",
+       " '6': ['▁', '6'],\n",
+       " '6,000': ['▁', '6', ',000'],\n",
+       " '60': ['▁', '6', '0'],\n",
+       " '600': ['▁', '6', '0', '0'],\n",
+       " '61-year-old': ['▁', '6', '1', '-', 'y', 'e', 'ar', '-', 'o', 'ld'],\n",
+       " '6al': ['▁', '6', 'al'],\n",
+       " '6tic': ['▁', '6', 'tic'],\n",
+       " '7.42': ['▁', '7', '.', '4', '2'],\n",
+       " '70': ['▁', '7', '0'],\n",
+       " '70,000,000': ['▁', '7', '0', ',000', ',000'],\n",
+       " '707': ['▁', '7', '0', '7'],\n",
+       " '750': ['▁', '7', '5', '0'],\n",
+       " '8': ['▁', '8'],\n",
+       " '80': ['▁', '8', '0'],\n",
+       " '800': ['▁', '8', '0', '0'],\n",
+       " '800,000': ['▁', '8', '0', '0', ',000'],\n",
+       " '86': ['▁', '8', '6'],\n",
+       " '88': ['▁', '8', '8'],\n",
+       " '88-year-old': ['▁', '8', '8', '-', 'y', 'e', 'ar', '-', 'o', 'ld'],\n",
+       " '89': ['▁', '8', '9'],\n",
+       " '89-year-old': ['▁', '8', '9', '-', 'y', 'e', 'ar', '-', 'o', 'ld'],\n",
+       " '90-day': ['▁', '9', '0', '-', 'day'],\n",
+       " '90-minute': ['▁', '9', '0', '-', 'minute'],\n",
+       " '91': ['▁', '9', '1'],\n",
+       " '950': ['▁', '9', '5', '0'],\n",
+       " '97.5': ['▁', '9', '7', '.', '5'],\n",
+       " ':': ['▁', ':'],\n",
+       " ';': ['▁', ';'],\n",
+       " '?': ['▁', '?'],\n",
+       " 'a': ['▁a'],\n",
+       " 'abandon': ['▁a', 'b', 'a', 'ndon'],\n",
+       " 'abandoned': ['▁a', 'b', 'a', 'ndon', 'ed'],\n",
+       " 'abashed': ['▁a', 'b', 'ash', 'ed'],\n",
+       " 'ability': ['▁a', 'b', 'ilit', 'y'],\n",
+       " 'able': ['▁', 'able'],\n",
+       " 'able-bodied': ['▁', 'able', '-', 'bo', 'd', 'i', 'ed'],\n",
+       " 'abolish': ['▁a', 'bo', 'l', 'ish'],\n",
+       " 'abolished': ['▁a', 'bo', 'l', 'ish', 'ed'],\n",
+       " 'abolition': ['▁a', 'bo', 'li', 'tion'],\n",
+       " 'abou': ['▁a', 'bo', 'u'],\n",
+       " 'about': ['▁about'],\n",
+       " 'about-': ['▁about', '-'],\n",
+       " 'above': ['▁a', 'bo', 've'],\n",
+       " 'abreast': ['▁a', 'b', 're', 'a', 'st'],\n",
+       " 'abroad': ['▁a', 'b', 'ro', 'ad'],\n",
+       " 'absence': ['▁a', 'b', 's', 'ence'],\n",
+       " 'absent': ['▁a', 'b', 's', 'ent'],\n",
+       " 'absolutely': ['▁a', 'b', 'solut', 'e', 'ly'],\n",
+       " 'abstraction': ['▁a', 'b', 'st', 'ra', 'c', 'tion'],\n",
+       " 'abundance': ['▁a', 'b', 'un', 'd', 'ance'],\n",
+       " 'ac-': ['▁a', 'c', '-'],\n",
+       " 'academic': ['▁a', 'c', 'ad', 'e', 'm', 'ic'],\n",
+       " 'accent': ['▁a', 'c', 'c', 'ent'],\n",
+       " 'accents': ['▁a', 'c', 'c', 'ent', 's'],\n",
+       " 'accept': ['▁accept'],\n",
+       " 'acceptable': ['▁accept', 'able'],\n",
+       " 'accepted': ['▁accept', 'ed'],\n",
+       " 'accepting': ['▁accept', 'ing'],\n",
+       " 'accessories': ['▁a', 'c', 'ce', 's', 's', 'or', 'ies'],\n",
+       " 'accident': ['▁a', 'c', 'c', 'i', 'd', 'ent'],\n",
+       " 'accidental': ['▁a', 'c', 'c', 'i', 'd', 'ent', 'al'],\n",
+       " 'accommodate': ['▁a', 'c', 'c', 'o', 'm', 'm', 'o', 'date'],\n",
+       " 'accommodation': ['▁a', 'c', 'c', 'o', 'm', 'm', 'o', 'd', 'ation'],\n",
+       " 'accompanied': ['▁a', 'c', 'c', 'o', 'm', 'p', 'an', 'i', 'ed'],\n",
+       " 'accompanist': ['▁a', 'c', 'c', 'o', 'm', 'p', 'an', 'is', 't'],\n",
+       " 'accompany': ['▁a', 'c', 'c', 'o', 'm', 'p', 'an', 'y'],\n",
+       " 'accomplished': ['▁a', 'c', 'c', 'o', 'm', 'p', 'l', 'ish', 'ed'],\n",
+       " 'accomplishments': ['▁a', 'c', 'c', 'o', 'm', 'p', 'l', 'ish', 'ment', 's'],\n",
+       " 'according': ['▁a', 'c', 'c', 'or', 'd', 'ing'],\n",
+       " 'account': ['▁a', 'c', 'c', 'o', 'un', 't'],\n",
+       " 'accountancy': ['▁a', 'c', 'c', 'o', 'un', 't', 'an', 'c', 'y'],\n",
+       " 'accra': ['▁a', 'c', 'c', 'ra'],\n",
+       " \"accra's\": ['▁a', 'c', 'c', 'ra', \"'\", 's'],\n",
+       " 'accuracy': ['▁a', 'c', 'cu', 'ra', 'c', 'y'],\n",
+       " 'accurate': ['▁a', 'c', 'cu', 'rate'],\n",
+       " 'accurately': ['▁a', 'c', 'cu', 'rate', 'ly'],\n",
+       " 'accused': ['▁a', 'c', 'cu', 's', 'ed'],\n",
+       " 'achieved': ['▁a', 'ch', 'ie', 'v', 'ed'],\n",
+       " 'achievement': ['▁a', 'ch', 'ie', 've', 'ment'],\n",
+       " 'acquaintance': ['▁a', 'c', 'q', 'u', 'ain', 't', 'ance'],\n",
+       " 'acquaintances': ['▁a', 'c', 'q', 'u', 'ain', 't', 'ance', 's'],\n",
+       " 'acres': ['▁a', 'c', 're', 's'],\n",
+       " 'across': ['▁a', 'cross'],\n",
+       " 'act': ['▁act'],\n",
+       " 'acting': ['▁act', 'ing'],\n",
+       " 'action': ['▁action'],\n",
+       " 'actions': ['▁action', 's'],\n",
+       " 'active': ['▁act', 'ive'],\n",
+       " 'activists': ['▁act', 'i', 'v', 'is', 't', 's'],\n",
+       " 'activities': ['▁act', 'i', 'v', 'it', 'ies'],\n",
+       " 'activity': ['▁act', 'i', 'v', 'it', 'y'],\n",
+       " 'acton': ['▁act', 'on'],\n",
+       " 'actor': ['▁act', 'or'],\n",
+       " 'actress': ['▁act', 'ress'],\n",
+       " 'acts': ['▁act', 's'],\n",
+       " 'actual': ['▁act', 'ual'],\n",
+       " 'actually': ['▁act', 'ual', 'ly'],\n",
+       " 'adamafio': ['▁a', 'd', 'a', 'm', 'a', 'f', 'i', 'o'],\n",
+       " 'adaptation': ['▁a', 'd', 'a', 'p', 't', 'ation'],\n",
+       " 'adapted': ['▁a', 'd', 'a', 'p', 't', 'ed'],\n",
+       " 'adapting': ['▁a', 'd', 'a', 'p', 't', 'ing'],\n",
+       " 'add': ['▁add'],\n",
+       " 'added': ['▁add', 'ed'],\n",
+       " 'adding': ['▁add', 'ing'],\n",
+       " 'addition': ['▁add', 'it', 'ion'],\n",
+       " 'additions': ['▁add', 'it', 'ion', 's'],\n",
+       " 'address': ['▁add', 'ress'],\n",
+       " 'addressed': ['▁add', 'ress', 'ed'],\n",
+       " 'addresses': ['▁add', 'ress', 'es'],\n",
+       " 'addressing': ['▁add', 'ress', 'ing'],\n",
+       " 'adenauer': ['▁adenauer'],\n",
+       " \"adenauer's\": ['▁adenauer', \"'\", 's'],\n",
+       " 'adequate': ['▁a', 'd', 'equa', 'te'],\n",
+       " 'adhem': ['▁a', 'd', 'h', 'e', 'm'],\n",
+       " 'adjust': ['▁adjust'],\n",
+       " 'adjustment': ['▁adjust', 'ment'],\n",
+       " 'administration': ['▁a', 'd', 'm', 'in', 'is', 't', 'r', 'ation'],\n",
+       " \"administration's\": ['▁a', 'd', 'm', 'in', 'is', 't', 'r', 'ation', \"'\", 's'],\n",
+       " 'administrative': ['▁a', 'd', 'm', 'in', 'is', 't', 'r', 'ative'],\n",
+       " 'admiralty': ['▁a', 'd', 'm', 'ir', 'al', 'ty'],\n",
+       " 'admire': ['▁a', 'd', 'm', 'ir', 'e'],\n",
+       " 'admit': ['▁a', 'd', 'm', 'it'],\n",
+       " 'admitted': ['▁a', 'd', 'mitt', 'ed'],\n",
+       " 'admitting': ['▁a', 'd', 'mitt', 'ing'],\n",
+       " 'adopted': ['▁a', 'd', 'op', 't', 'ed'],\n",
+       " 'adoption': ['▁a', 'd', 'o', 'ption'],\n",
+       " 'adult': ['▁a', 'd', 'ul', 't'],\n",
+       " 'advance': ['▁a', 'd', 'v', 'ance'],\n",
+       " 'advanced': ['▁a', 'd', 'v', 'ance', 'd'],\n",
+       " 'advancing': ['▁a', 'd', 'v', 'an', 'c', 'ing'],\n",
+       " 'advantage': ['▁', 'advantage'],\n",
+       " 'advantages': ['▁', 'advantage', 's'],\n",
+       " 'advertisement': ['▁advertisement'],\n",
+       " 'advertisements': ['▁advertisement', 's'],\n",
+       " 'advice': ['▁advi', 'ce'],\n",
+       " 'advisability': ['▁advi', 's', 'a', 'b', 'ilit', 'y'],\n",
+       " 'advise': ['▁advise'],\n",
+       " 'advised': ['▁advise', 'd'],\n",
+       " 'advisers': ['▁advise', 'r', 's'],\n",
+       " 'advocate': ['▁a', 'd', 'v', 'o', 'c', 'at', 'e'],\n",
+       " 'af-': ['▁a', 'f', '-'],\n",
+       " 'affairs': ['▁', 'aff', 'a', 'ir', 's'],\n",
+       " 'affected': ['▁', 'aff', 'e', 'ct', 'ed'],\n",
+       " 'affection': ['▁', 'aff', 'e', 'c', 'tion'],\n",
+       " 'affilia-': ['▁', 'aff', 'il', 'i', 'a', '-'],\n",
+       " 'affiliations': ['▁', 'aff', 'il', 'i', 'ation', 's'],\n",
+       " 'affluence': ['▁a', 'f', 'flu', 'ence'],\n",
+       " 'affluent': ['▁a', 'f', 'flu', 'ent'],\n",
+       " 'afraid': ['▁a', 'f', 'r', 'aid'],\n",
+       " 'africa': ['▁africa'],\n",
+       " \"africa's\": ['▁africa', \"'\", 's'],\n",
+       " 'african': ['▁african'],\n",
+       " 'africans': ['▁african', 's'],\n",
+       " 'after': ['▁after'],\n",
+       " 'afternoon': ['▁after', 'n', 'o', 'on'],\n",
+       " 'afterwards': ['▁after', 'ward', 's'],\n",
+       " 'again': ['▁again'],\n",
+       " 'against': ['▁again', 'st'],\n",
+       " 'age': ['▁', 'age'],\n",
+       " 'age-structure': ['▁', 'age', '-', 'struct', 'ure'],\n",
+       " 'aged': ['▁a', 'g', 'ed'],\n",
+       " 'ageing': ['▁', 'age', 'ing'],\n",
+       " 'agent': ['▁a', 'g', 'ent'],\n",
+       " 'agents': ['▁a', 'g', 'ent', 's'],\n",
+       " 'ages': ['▁', 'age', 's'],\n",
+       " 'agitation': ['▁a', 'g', 'it', 'ation'],\n",
+       " 'ago': ['▁', 'ago'],\n",
+       " 'agree': ['▁agree'],\n",
+       " 'agreed': ['▁agree', 'd'],\n",
+       " 'agreement': ['▁agree', 'ment'],\n",
+       " 'agreements': ['▁agree', 'ment', 's'],\n",
+       " 'agriculture': ['▁a', 'g', 'ri', 'c', 'ul', 'ture'],\n",
+       " 'ahead': ['▁a', 'h', 'e', 'ad'],\n",
+       " 'aid': ['▁', 'aid'],\n",
+       " 'aide': ['▁', 'aid', 'e'],\n",
+       " 'aided': ['▁', 'aid', 'ed'],\n",
+       " 'aides': ['▁', 'aid', 'es'],\n",
+       " 'aim': ['▁a', 'i', 'm'],\n",
+       " 'aimed': ['▁a', 'i', 'm', 'ed'],\n",
+       " 'aiming': ['▁a', 'i', 'm', 'ing'],\n",
+       " 'air': ['▁a', 'ir'],\n",
+       " 'aircraft': ['▁a', 'ir', 'c', 'ra', 'f', 't'],\n",
+       " 'aired': ['▁a', 'ir', 'ed'],\n",
+       " \"airliner's\": ['▁a', 'ir', 'lin', 'er', \"'\", 's'],\n",
+       " 'airmen': ['▁a', 'ir', 'm', 'en'],\n",
+       " 'airport': ['▁a', 'ir', 'port'],\n",
+       " 'akin': ['▁', 'ak', 'in'],\n",
+       " \"aladdin's\": ['▁', 'al', 'ad', 'd', 'in', \"'\", 's'],\n",
+       " 'alan': ['▁', 'al', 'an'],\n",
+       " 'alarmed': ['▁', 'al', 'ar', 'm', 'ed'],\n",
+       " 'alas': ['▁', 'al', 'a', 's'],\n",
+       " 'alcoholic': ['▁', 'al', 'c', 'o', 'h', 'o', 'l', 'ic'],\n",
+       " 'algeria': ['▁', 'al', 'ge', 'ri', 'a'],\n",
+       " 'alike': ['▁a', 'li', 'ke'],\n",
+       " 'alive': ['▁', 'al', 'ive'],\n",
+       " 'all': ['▁all'],\n",
+       " 'all-regular': ['▁all', '-', 're', 'g', 'ular'],\n",
+       " 'alleged': ['▁all', 'e', 'g', 'ed'],\n",
+       " 'allen': ['▁all', 'en'],\n",
+       " 'alleviation': ['▁all', 'evi', 'ation'],\n",
+       " 'alley': ['▁all', 'e', 'y'],\n",
+       " 'alliance': ['▁all', 'i', 'ance'],\n",
+       " 'alliances': ['▁all', 'i', 'ance', 's'],\n",
+       " 'allied': ['▁all', 'i', 'ed'],\n",
+       " 'allies': ['▁all', 'ies'],\n",
+       " 'allow': ['▁allow'],\n",
+       " 'allowance': ['▁allow', 'ance'],\n",
+       " 'allowances': ['▁allow', 'ance', 's'],\n",
+       " 'allowed': ['▁allow', 'ed'],\n",
+       " 'allowing': ['▁allow', 'ing'],\n",
+       " 'ally': ['▁', 'ally'],\n",
+       " 'almost': ['▁', 'al', 'most'],\n",
+       " 'alone': ['▁a', 'lo', 'ne'],\n",
+       " 'along': ['▁along'],\n",
+       " 'alongside': ['▁along', 'side'],\n",
+       " 'already': ['▁', 'al', 'ready'],\n",
+       " 'also': ['▁a', 'lso'],\n",
+       " 'alter': ['▁', 'al', 'ter'],\n",
+       " 'alternative': ['▁', 'al', 'ter', 'n', 'ative'],\n",
+       " 'alternatively': ['▁', 'al', 'ter', 'n', 'ative', 'ly'],\n",
+       " 'alternatives': ['▁', 'al', 'ter', 'n', 'ative', 's'],\n",
+       " 'although': ['▁', 'al', 'th', 'ough'],\n",
+       " 'altogether': ['▁alto', 'ge', 'ther'],\n",
+       " 'altos': ['▁alto', 's'],\n",
+       " 'always': ['▁a', 'lway', 's'],\n",
+       " 'am': ['▁a', 'm'],\n",
+       " 'amateur': ['▁a', 'm', 'at', 'e', 'ur'],\n",
+       " 'amazed': ['▁a', 'm', 'a', 'z', 'ed'],\n",
+       " 'amazing': ['▁a', 'm', 'a', 'z', 'ing'],\n",
+       " 'ambassador': ['▁a', 'm', 'b', 'a', 's', 's', 'ad', 'or'],\n",
+       " 'amber': ['▁a', 'mber'],\n",
+       " 'ambition': ['▁a', 'm', 'bit', 'ion'],\n",
+       " 'ambitious': ['▁a', 'm', 'bit', 'i', 'ous'],\n",
+       " 'ambulance': ['▁a', 'm', 'b', 'ul', 'ance'],\n",
+       " 'ambulances': ['▁a', 'm', 'b', 'ul', 'ance', 's'],\n",
+       " 'america': ['▁america'],\n",
+       " \"america's\": ['▁america', \"'\", 's'],\n",
+       " 'american': ['▁american'],\n",
+       " 'american-born': ['▁american', '-', 'b', 'orn'],\n",
+       " 'americans': ['▁american', 's'],\n",
+       " 'ammunition': ['▁a', 'm', 'm', 'un', 'it', 'ion'],\n",
+       " 'among': ['▁amo', 'ng'],\n",
+       " 'amount': ['▁amo', 'un', 't'],\n",
+       " 'ample': ['▁a', 'm', 'ple'],\n",
+       " 'amusement': ['▁a', 'm', 'us', 'e', 'ment'],\n",
+       " 'amusing': ['▁a', 'm', 'us', 'ing'],\n",
+       " 'an': ['▁an'],\n",
+       " 'analogy': ['▁an', 'al', 'o', 'g', 'y'],\n",
+       " 'analysed': ['▁an', 'al', 'y', 's', 'ed'],\n",
+       " 'anchor': ['▁an', 'ch', 'or'],\n",
+       " 'ancient': ['▁an', 'c', 'i', 'ent'],\n",
+       " 'and': ['▁and'],\n",
+       " 'andrei': ['▁and', 're', 'i'],\n",
+       " 'andrew': ['▁and', 're', 'w'],\n",
+       " 'anecdotal': ['▁an', 'e', 'c', 'd', 'o', 't', 'al'],\n",
+       " 'angel': ['▁a', 'ng', 'el'],\n",
+       " 'angeles': ['▁an', 'ge', 'le', 's'],\n",
+       " 'angelo': ['▁an', 'ge', 'lo'],\n",
+       " 'anger': ['▁a', 'ng', 'er'],\n",
+       " 'anglais': ['▁a', 'ng', 'l', 'a', 'is'],\n",
+       " 'angle': ['▁a', 'ng', 'le'],\n",
+       " 'anglesey': ['▁anglesey'],\n",
+       " \"anglesey's\": ['▁anglesey', \"'\", 's'],\n",
+       " 'anglesey-road': ['▁anglesey', '-', 'ro', 'ad'],\n",
+       " 'angola': ['▁a', 'ng', 'o', 'l', 'a'],\n",
+       " 'angry': ['▁a', 'ng', 'ry'],\n",
+       " 'ann': ['▁an', 'n'],\n",
+       " 'anna': ['▁an', 'n', 'a'],\n",
+       " 'announced': ['▁an', 'n', 'o', 'un', 'c', 'ed'],\n",
+       " 'announcement': ['▁an', 'n', 'o', 'un', 'ce', 'ment'],\n",
+       " 'announcing': ['▁an', 'n', 'o', 'un', 'c', 'ing'],\n",
+       " 'annual': ['▁an', 'n', 'ual'],\n",
+       " 'another': ['▁an', 'other'],\n",
+       " 'answer': ['▁answer'],\n",
+       " 'answered': ['▁answer', 'ed'],\n",
+       " 'answering': ['▁answer', 'ing'],\n",
+       " 'antagonism': ['▁', 'ant', 'ago', 'n', 'ism'],\n",
+       " 'anthony': ['▁an', 'th', 'on', 'y'],\n",
+       " 'anti-apartheid': ['▁', 'ant', 'i', '-', 'a', 'p', 'ar', 't', 'hei', 'd'],\n",
+       " 'anti-bomb': ['▁', 'ant', 'i', '-', 'bomb'],\n",
+       " 'anti-german': ['▁', 'ant', 'i', '-', 'g', 'er', 'man'],\n",
+       " 'anti-nato': ['▁', 'ant', 'i', '-', 'nato'],\n",
+       " 'anti-negro': ['▁', 'ant', 'i', '-', 'ne', 'g', 'ro'],\n",
+       " 'anti-nuclear': ['▁', 'ant', 'i', '-', 'n', 'u', 'c', 'le', 'ar'],\n",
+       " 'anti-soviet': ['▁', 'ant', 'i', '-', 's', 'o', 'v', 'i', 'et'],\n",
+       " 'anti-tory': ['▁', 'ant', 'i', '-', 'tory'],\n",
+       " 'anticipation': ['▁an', 'tic', 'ip', 'ation'],\n",
+       " 'antonioni': ['▁an', 'ton', 'ion', 'i'],\n",
+       " \"antonioni's\": ['▁an', 'ton', 'ion', 'i', \"'\", 's'],\n",
+       " 'any': ['▁any'],\n",
+       " 'any-': ['▁any', '-'],\n",
+       " 'anybody': ['▁any', 'body'],\n",
+       " \"anybody's\": ['▁any', 'body', \"'\", 's'],\n",
+       " 'anyone': ['▁any', 'on', 'e'],\n",
+       " 'anything': ['▁any', 'thing'],\n",
+       " 'apart': ['▁apart'],\n",
+       " 'apartheid': ['▁apart', 'hei', 'd'],\n",
+       " 'apathetic': ['▁a', 'p', 'at', 'h', 'e', 'tic'],\n",
+       " 'apathy': ['▁a', 'p', 'at', 'h', 'y'],\n",
+       " 'apex': ['▁a', 'p', 'e', 'x'],\n",
+       " 'apocalypse': ['▁a', 'p', 'o', 'c', 'al', 'y', 'p', 'se'],\n",
+       " 'apologising': ['▁a', 'p', 'o', 'lo', 'g', 'is', 'ing'],\n",
+       " 'appalled': ['▁', 'app', 'al', 'led'],\n",
+       " 'appalling': ['▁', 'app', 'al', 'l', 'ing'],\n",
+       " 'apparatus': ['▁', 'app', 'ar', 'at', 'us'],\n",
+       " 'apparent': ['▁', 'app', 'ar', 'ent'],\n",
+       " 'apparently': ['▁', 'app', 'ar', 'ent', 'ly'],\n",
+       " 'appeal': ['▁appeal'],\n",
+       " 'appealing': ['▁appeal', 'ing'],\n",
+       " 'appeals': ['▁appeal', 's'],\n",
+       " 'appear': ['▁appear'],\n",
+       " 'appearance': ['▁appear', 'ance'],\n",
+       " 'appeared': ['▁appear', 'ed'],\n",
+       " 'appears': ['▁appear', 's'],\n",
+       " 'appeasement': ['▁', 'app', 'e', 'a', 'se', 'ment'],\n",
+       " 'applauding': ['▁', 'app', 'l', 'a', 'ud', 'ing'],\n",
+       " 'appliances': ['▁', 'app', 'li', 'ance', 's'],\n",
+       " 'application': ['▁', 'app', 'l', 'ic', 'ation'],\n",
+       " 'applications': ['▁', 'app', 'l', 'ic', 'ation', 's'],\n",
+       " 'applied': ['▁', 'app', 'li', 'ed'],\n",
+       " 'apply': ['▁', 'app', 'ly'],\n",
+       " 'appointed': ['▁', 'app', 'o', 'in', 't', 'ed'],\n",
+       " 'appointment': ['▁', 'app', 'o', 'in', 't', 'ment'],\n",
+       " 'appreciable': ['▁', 'app', 're', 'c', 'i', 'able'],\n",
+       " 'appreciably': ['▁', 'app', 're', 'c', 'i', 'abl', 'y'],\n",
+       " 'appreciated': ['▁', 'app', 're', 'c', 'i', 'at', 'ed'],\n",
+       " 'apprenticeships': ['▁', 'app', 'ren', 'tic', 'e', 'ship', 's'],\n",
+       " 'approach': ['▁', 'app', 'ro', 'a', 'ch'],\n",
+       " 'approached': ['▁', 'app', 'ro', 'a', 'ch', 'ed'],\n",
+       " 'approaches': ['▁', 'app', 'ro', 'a', 'ches'],\n",
+       " 'appropriate': ['▁', 'app', 'ro', 'p', 'ri', 'at', 'e'],\n",
+       " 'appropriated': ['▁', 'app', 'ro', 'p', 'ri', 'at', 'ed'],\n",
+       " 'approval': ['▁', 'app', 'ro', 'val'],\n",
+       " 'approximately': ['▁', 'app', 'ro', 'x', 'i', 'm', 'at', 'e', 'ly'],\n",
+       " 'april': ['▁a', 'p', 'ri', 'l'],\n",
+       " 'archbishop': ['▁', 'ar', 'ch', 'b', 'ish', 'op'],\n",
+       " 'arches': ['▁', 'ar', 'ches'],\n",
+       " 'archipelago': ['▁', 'ar', 'ch', 'ip', 'el', 'ago'],\n",
+       " 'architect': ['▁', 'ar', 'ch', 'it', 'e', 'ct'],\n",
+       " 'architecture': ['▁', 'ar', 'ch', 'it', 'e', 'c', 'ture'],\n",
+       " 'are': ['▁are'],\n",
+       " 'area': ['▁are', 'a'],\n",
+       " 'areas': ['▁are', 'a', 's'],\n",
+       " 'arguably': ['▁argu', 'abl', 'y'],\n",
+       " 'argued': ['▁argu', 'ed'],\n",
+       " 'argues': ['▁argu', 'es'],\n",
+       " 'arguing': ['▁argu', 'ing'],\n",
+       " 'argument': ['▁argu', 'ment'],\n",
+       " 'arguments': ['▁argu', 'ment', 's'],\n",
+       " 'arise': ['▁', 'aris', 'e'],\n",
+       " 'arises': ['▁', 'aris', 'es'],\n",
+       " 'arm': ['▁arm'],\n",
+       " 'armament': ['▁', 'armament'],\n",
+       " 'armaments': ['▁', 'armament', 's'],\n",
+       " 'armed': ['▁arm', 'ed'],\n",
+       " 'armoured': ['▁arm', 'our', 'ed'],\n",
+       " 'arms': ['▁arm', 's'],\n",
+       " \"arms'\": ['▁arm', 's', \"'\"],\n",
+       " 'army': ['▁arm', 'y'],\n",
+       " 'arose': ['▁', 'ar', 'ose'],\n",
+       " 'around': ['▁a', 'round'],\n",
+       " 'aroused': ['▁', 'ar', 'ous', 'ed'],\n",
+       " 'arrange': ['▁arrang', 'e'],\n",
+       " 'arranged': ['▁arrang', 'ed'],\n",
+       " 'arrangement': ['▁arrang', 'e', 'ment'],\n",
+       " 'arrangements': ['▁arrang', 'e', 'ment', 's'],\n",
+       " 'arranging': ['▁arrang', 'ing'],\n",
+       " 'arrears': ['▁', 'ar', 're', 'ar', 's'],\n",
+       " 'arrested': ['▁', 'ar', 're', 'st', 'ed'],\n",
+       " 'arrival': ['▁arriv', 'al'],\n",
+       " 'arrive': ['▁arriv', 'e'],\n",
+       " 'arrived': ['▁arriv', 'ed'],\n",
+       " 'arrives': ['▁arriv', 'es'],\n",
+       " 'arrogant': ['▁', 'ar', 'ro', 'g', 'ant'],\n",
+       " 'art': ['▁', 'ar', 't'],\n",
+       " 'arthur': ['▁', 'ar', 'th', 'ur'],\n",
+       " 'article': ['▁', 'ar', 'tic', 'le'],\n",
+       " 'articles': ['▁', 'ar', 'tic', 'le', 's'],\n",
+       " 'articulation': ['▁', 'ar', 'tic', 'ul', 'ation'],\n",
+       " 'artistic': ['▁', 'ar', 'tist', 'ic'],\n",
+       " 'artists': ['▁', 'ar', 'tist', 's'],\n",
+       " 'as': ['▁as'],\n",
+       " 'ascents': ['▁as', 'c', 'ent', 's'],\n",
+       " 'ash': ['▁', 'ash'],\n",
+       " 'ashen': ['▁', 'ash', 'en'],\n",
+       " 'ask': ['▁ask'],\n",
+       " 'asked': ['▁ask', 'ed'],\n",
+       " 'asking': ['▁ask', 'ing'],\n",
+       " 'aspect': ['▁a', 'spect'],\n",
+       " 'aspects': ['▁a', 'spect', 's'],\n",
+       " 'assault': ['▁as', 's', 'a', 'ul', 't'],\n",
+       " 'assembler': ['▁as', 'se', 'm', 'ble', 'r'],\n",
+       " 'assembly': ['▁as', 'se', 'm', 'b', 'ly'],\n",
+       " 'assess': ['▁as', 'se', 's', 's'],\n",
+       " 'assessment': ['▁as', 'se', 's', 's', 'ment'],\n",
+       " 'assistance': ['▁as', 's', 'is', 't', 'ance'],\n",
+       " 'assistant': ['▁as', 's', 'is', 't', 'ant'],\n",
+       " 'assistants': ['▁as', 's', 'is', 't', 'ant', 's'],\n",
+       " 'associate': ['▁associat', 'e'],\n",
+       " 'associated': ['▁associat', 'ed'],\n",
+       " 'associates': ['▁associat', 'es'],\n",
+       " 'association': ['▁associat', 'ion'],\n",
+       " 'assortment': ['▁as', 's', 'or', 't', 'ment'],\n",
+       " 'assumption': ['▁assu', 'm', 'ption'],\n",
+       " 'assurance': ['▁assu', 'r', 'ance'],\n",
+       " 'astute': ['▁as', 'tu', 'te'],\n",
+       " 'at': ['▁at'],\n",
+       " 'ately': ['▁at', 'e', 'ly'],\n",
+       " 'atlantic': ['▁at', 'l', 'an', 'tic'],\n",
+       " 'atmosphere': ['▁atmospher', 'e'],\n",
+       " 'atmospheric': ['▁atmospher', 'ic'],\n",
+       " 'atomic': ['▁a', 'tom', 'ic'],\n",
+       " 'atoms': ['▁a', 'tom', 's'],\n",
+       " 'attach': ['▁at', 't', 'a', 'ch'],\n",
+       " 'attached': ['▁at', 't', 'a', 'ch', 'ed'],\n",
+       " 'attack': ['▁at', 'tack'],\n",
+       " 'attacked': ['▁at', 'tack', 'ed'],\n",
+       " 'attacks': ['▁at', 'tack', 's'],\n",
+       " 'attainable': ['▁at', 't', 'ain', 'able'],\n",
+       " 'attempt': ['▁attempt'],\n",
+       " 'attempted': ['▁attempt', 'ed'],\n",
+       " 'attempting': ['▁attempt', 'ing'],\n",
+       " 'attempts': ['▁attempt', 's'],\n",
+       " 'atten-': ['▁at', 'ten', '-'],\n",
+       " 'attend': ['▁at', 't', 'end'],\n",
+       " 'attendance': ['▁at', 't', 'end', 'ance'],\n",
+       " 'attended': ['▁at', 't', 'end', 'ed'],\n",
+       " 'attending': ['▁at', 't', 'end', 'ing'],\n",
+       " 'attention': ['▁at', 'ten', 'tion'],\n",
+       " 'attitude': ['▁at', 't', 'it', 'ud', 'e'],\n",
+       " 'attitudes': ['▁at', 't', 'it', 'ud', 'es'],\n",
+       " 'attracted': ['▁at', 't', 'ra', 'ct', 'ed'],\n",
+       " 'attractive': ['▁at', 't', 'ra', 'ct', 'ive'],\n",
+       " 'aubrey': ['▁a', 'u', 'b', 're', 'y'],\n",
+       " 'audacity': ['▁a', 'ud', 'a', 'c', 'it', 'y'],\n",
+       " 'auden': ['▁a', 'ud', 'en'],\n",
+       " 'audience': ['▁a', 'ud', 'i', 'ence'],\n",
+       " 'audited': ['▁a', 'ud', 'it', 'ed'],\n",
+       " 'august': ['▁a', 'ug', 'u', 'st'],\n",
+       " 'auntie': ['▁a', 'un', 't', 'ie'],\n",
+       " 'austerity': ['▁a', 'us', 'ter', 'it', 'y'],\n",
+       " 'australia': ['▁a', 'u', 'st', 'ra', 'li', 'a'],\n",
+       " 'austria': ['▁a', 'u', 'st', 'ri', 'a'],\n",
+       " 'austrian': ['▁a', 'u', 'st', 'ri', 'an'],\n",
+       " 'authentic': ['▁a', 'u', 'th', 'en', 'tic'],\n",
+       " 'author': ['▁', 'author'],\n",
+       " 'authorised': ['▁', 'author', 'is', 'ed'],\n",
+       " 'authorities': ['▁authorit', 'ies'],\n",
+       " 'authority': ['▁authorit', 'y'],\n",
+       " 'automatically': ['▁a', 'u', 'tom', 'at', 'ic', 'ally'],\n",
+       " 'automation': ['▁a', 'u', 'tom', 'ation'],\n",
+       " 'autumn': ['▁a', 'u', 't', 'um', 'n'],\n",
+       " 'available': ['▁a', 'v', 'a', 'il', 'able'],\n",
+       " 'avenue': ['▁a', 'v', 'en', 'u', 'e'],\n",
+       " 'average': ['▁a', 'ver', 'age'],\n",
+       " 'averages': ['▁a', 'ver', 'age', 's'],\n",
+       " 'avert': ['▁a', 'ver', 't'],\n",
+       " 'aviation': ['▁a', 'v', 'i', 'ation'],\n",
+       " 'avoid': ['▁a', 'v', 'o', 'i', 'd'],\n",
+       " 'avoided': ['▁a', 'v', 'o', 'i', 'd', 'ed'],\n",
+       " 'avon': ['▁a', 'v', 'on'],\n",
+       " 'awake': ['▁a', 'w', 'ak', 'e'],\n",
+       " 'awarded': ['▁a', 'ward', 'ed'],\n",
+       " 'awards': ['▁a', 'ward', 's'],\n",
+       " 'aware': ['▁a', 'w', 'ar', 'e'],\n",
+       " 'awareness': ['▁a', 'w', 'ar', 'e', 'ness'],\n",
+       " 'away': ['▁a', 'way'],\n",
+       " 'awful': ['▁a', 'w', 'ful'],\n",
+       " 'awfully': ['▁a', 'w', 'ful', 'ly'],\n",
+       " 'b': ['▁b'],\n",
+       " 'b.': ['▁b', '.'],\n",
+       " 'b.b.c.': ['▁b', '.', 'b', '.', 'c', '.'],\n",
+       " 'babe': ['▁ba', 'b', 'e'],\n",
+       " 'babel': ['▁ba', 'b', 'el'],\n",
+       " 'bably': ['▁b', 'abl', 'y'],\n",
+       " 'baby': ['▁ba', 'by'],\n",
+       " 'back': ['▁back'],\n",
+       " 'backbone': ['▁back', 'bo', 'ne'],\n",
+       " 'backed': ['▁back', 'ed'],\n",
+       " 'backers': ['▁back', 'ers'],\n",
+       " 'background': ['▁back', 'ground'],\n",
+       " 'backing': ['▁back', 'ing'],\n",
+       " 'backstage': ['▁back', 'st', 'age'],\n",
+       " 'backward': ['▁back', 'ward'],\n",
+       " 'bad': ['▁ba', 'd'],\n",
+       " 'badly': ['▁ba', 'd', 'ly'],\n",
+       " 'baffled': ['▁b', 'aff', 'led'],\n",
+       " 'bag': ['▁ba', 'g'],\n",
+       " 'bagaya': ['▁ba', 'g', 'a', 'y', 'a'],\n",
+       " 'baker': ['▁b', 'ak', 'er'],\n",
+       " 'balance': ['▁b', 'al', 'ance'],\n",
+       " 'balance-sheet': ['▁b', 'al', 'ance', '-', 's', 'h', 'e', 'et'],\n",
+       " 'balances': ['▁b', 'al', 'ance', 's'],\n",
+       " 'bald': ['▁ba', 'ld'],\n",
+       " 'ball': ['▁ba', 'll'],\n",
+       " 'balloon': ['▁b', 'al', 'lo', 'on'],\n",
+       " 'ballyhoo': ['▁b', 'ally', 'h', 'o', 'o'],\n",
+       " 'baltic': ['▁b', 'al', 'tic'],\n",
+       " 'ban': ['▁ba', 'n'],\n",
+       " 'ban-': ['▁ba', 'n', '-'],\n",
+       " 'ban-the-': ['▁ba', 'n', '-', 'th', 'e', '-'],\n",
+       " 'bank': ['▁bank'],\n",
+       " \"bank's\": ['▁bank', \"'\", 's'],\n",
+       " 'banking': ['▁bank', 'ing'],\n",
+       " 'bankrupt': ['▁bank', 'r', 'u', 'p', 't'],\n",
+       " 'banks': ['▁bank', 's'],\n",
+       " \"banks'\": ['▁bank', 's', \"'\"],\n",
+       " 'banned': ['▁ba', 'n', 'n', 'ed'],\n",
+       " 'banzie': ['▁ba', 'n', 'z', 'ie'],\n",
+       " 'bar': ['▁bar'],\n",
+       " 'barb': ['▁bar', 'b'],\n",
+       " 'barbarously': ['▁bar', 'b', 'ar', 'ous', 'ly'],\n",
+       " 'barclay': ['▁bar', 'c', 'lay'],\n",
+       " 'bare': ['▁bar', 'e'],\n",
+       " 'bargain': ['▁bar', 'g', 'ain'],\n",
+       " 'bargaining': ['▁bar', 'g', 'ain', 'ing'],\n",
+       " 'bark': ['▁bar', 'k'],\n",
+       " 'barrier': ['▁bar', 'ri', 'er'],\n",
+       " 'barriers': ['▁bar', 'ri', 'ers'],\n",
+       " 'barry': ['▁bar', 'ry'],\n",
+       " 'base': ['▁base'],\n",
+       " 'based': ['▁base', 'd'],\n",
+       " 'bases': ['▁base', 's'],\n",
+       " 'basic': ['▁basi', 'c'],\n",
+       " 'basin': ['▁basi', 'n'],\n",
+       " 'basing': ['▁ba', 's', 'ing'],\n",
+       " 'basis': ['▁basi', 's'],\n",
+       " 'baskerville': ['▁ba', 's', 'k', 'er', 'v', 'ill', 'e'],\n",
+       " 'basses': ['▁ba', 's', 'se', 's'],\n",
+       " 'basting': ['▁ba', 'st', 'ing'],\n",
+       " 'bats': ['▁b', 'at', 's'],\n",
+       " 'batsman': ['▁b', 'at', 's', 'man'],\n",
+       " 'battalions': ['▁b', 'at', 't', 'al', 'ion', 's'],\n",
+       " 'batting': ['▁b', 'at', 't', 'ing'],\n",
+       " 'battle': ['▁ba', 'ttle'],\n",
+       " 'bavaria': ['▁ba', 'v', 'ar', 'i', 'a'],\n",
+       " 'bavarian': ['▁ba', 'v', 'ar', 'ian'],\n",
+       " 'bavarians': ['▁ba', 'v', 'ar', 'ian', 's'],\n",
+       " 'bay': ['▁ba', 'y'],\n",
+       " 'be': ['▁be'],\n",
+       " 'beach': ['▁b', 'each'],\n",
+       " 'beaches': ['▁be', 'a', 'ches'],\n",
+       " 'beacon': ['▁be', 'a', 'c', 'on'],\n",
+       " 'beaks': ['▁be', 'ak', 's'],\n",
+       " 'bean': ['▁be', 'an'],\n",
+       " 'bear': ['▁be', 'ar'],\n",
+       " 'bearer': ['▁be', 'ar', 'er'],\n",
+       " 'bears': ['▁be', 'ar', 's'],\n",
+       " 'beastly': ['▁be', 'a', 'st', 'ly'],\n",
+       " 'beaten': ['▁be', 'at', 'en'],\n",
+       " 'beautiful': ['▁beautiful'],\n",
+       " 'beautifully': ['▁beautiful', 'ly'],\n",
+       " 'beauty': ['▁be', 'a', 'u', 'ty'],\n",
+       " 'became': ['▁be', 'c', 'a', 'me'],\n",
+       " 'because': ['▁be', 'cause'],\n",
+       " 'beckoning': ['▁be', 'ck', 'on', 'ing'],\n",
+       " 'become': ['▁become'],\n",
+       " 'becomes': ['▁become', 's'],\n",
+       " 'becoming': ['▁be', 'coming'],\n",
+       " 'bedlam': ['▁be', 'd', 'l', 'a', 'm'],\n",
+       " 'beds': ['▁be', 'd', 's'],\n",
+       " 'bedspreads': ['▁be', 'd', 's', 'p', 're', 'ad', 's'],\n",
+       " 'beech': ['▁be', 'e', 'ch'],\n",
+       " 'been': ['▁be', 'en'],\n",
+       " 'before': ['▁be', 'fore'],\n",
+       " 'befriended': ['▁be', 'f', 'ri', 'end', 'ed'],\n",
+       " 'began': ['▁be', 'g', 'an'],\n",
+       " 'begin': ['▁be', 'g', 'in'],\n",
+       " 'beginner': ['▁be', 'g', 'in', 'n', 'er'],\n",
+       " 'beginning': ['▁be', 'g', 'in', 'n', 'ing'],\n",
+       " 'begun': ['▁be', 'g', 'un'],\n",
+       " 'behan': ['▁be', 'h', 'an'],\n",
+       " 'behaviour': ['▁be', 'h', 'a', 'v', 'i', 'our'],\n",
+       " 'behind': ['▁be', 'h', 'ind'],\n",
+       " 'beier': ['▁be', 'i', 'er'],\n",
+       " 'being': ['▁be', 'ing'],\n",
+       " 'belgian': ['▁belgi', 'an'],\n",
+       " 'belgium': ['▁belgi', 'um'],\n",
+       " 'belgrade': ['▁be', 'l', 'g', 'rade'],\n",
+       " 'belief': ['▁be', 'li', 'e', 'f'],\n",
+       " 'believe': ['▁believe'],\n",
+       " 'believed': ['▁believe', 'd'],\n",
+       " 'believes': ['▁believe', 's'],\n",
+       " 'bell': ['▁be', 'll'],\n",
+       " \"bell's\": ['▁be', 'll', \"'\", 's'],\n",
+       " 'belonged': ['▁be', 'lo', 'ng', 'ed'],\n",
+       " 'belongs': ['▁be', 'lo', 'ng', 's'],\n",
+       " 'below': ['▁be', 'low'],\n",
+       " 'belt': ['▁be', 'l', 't'],\n",
+       " 'ben': ['▁be', 'n'],\n",
+       " 'bench': ['▁be', 'n', 'ch'],\n",
+       " 'benches': ['▁be', 'n', 'ches'],\n",
+       " 'bend': ['▁b', 'end'],\n",
+       " 'bending': ['▁b', 'end', 'ing'],\n",
+       " 'benefits': ['▁be', 'ne', 'f', 'it', 's'],\n",
+       " 'bent': ['▁b', 'ent'],\n",
+       " 'ber': ['▁be', 'r'],\n",
+       " 'berlin': ['▁berlin'],\n",
+       " \"berlin's\": ['▁berlin', \"'\", 's'],\n",
+       " 'bernhard': ['▁b', 'ern', 'h', 'ard'],\n",
+       " 'berry': ['▁be', 'rry'],\n",
+       " 'bertrand': ['▁be', 'r', 't', 'r', 'and'],\n",
+       " 'beset': ['▁be', 's', 'et'],\n",
+       " 'beside': ['▁be', 'side'],\n",
+       " 'best': ['▁best'],\n",
+       " 'best-seller': ['▁best', '-', 'se', 'll', 'er'],\n",
+       " 'bet': ['▁be', 't'],\n",
+       " 'betjeman': ['▁be', 't', 'j', 'e', 'man'],\n",
+       " 'betrayal': ['▁be', 't', 'ra', 'y', 'al'],\n",
+       " 'betrayed': ['▁be', 't', 'ra', 'y', 'ed'],\n",
+       " 'better': ['▁better'],\n",
+       " 'better-': ['▁better', '-'],\n",
+       " \"betti's\": ['▁be', 'tt', 'i', \"'\", 's'],\n",
+       " 'between': ['▁be', 'twe', 'en'],\n",
+       " 'bevel': ['▁be', 've', 'l'],\n",
+       " 'bevelled': ['▁be', 've', 'll', 'ed'],\n",
+       " 'beware': ['▁be', 'w', 'ar', 'e'],\n",
+       " 'bewildered': ['▁be', 'w', 'il', 'd', 'er', 'ed'],\n",
+       " 'beyond': ['▁be', 'y', 'on', 'd'],\n",
+       " 'bidet': ['▁b', 'i', 'd', 'et'],\n",
+       " 'big': ['▁big'],\n",
+       " 'bigger': ['▁big', 'g', 'er'],\n",
+       " 'biggest': ['▁big', 'g', 'est'],\n",
+       " 'bill': ['▁b', 'ill'],\n",
+       " 'bills': ['▁b', 'ill', 's'],\n",
+       " 'binding': ['▁b', 'ind', 'ing'],\n",
+       " 'biological': ['▁b', 'i', 'o', 'lo', 'g', 'ic', 'al'],\n",
+       " 'bird': ['▁b', 'ir', 'd'],\n",
+       " 'birds': ['▁b', 'ir', 'd', 's'],\n",
+       " 'bishop': ['▁b', 'ish', 'op'],\n",
+       " 'bit': ['▁', 'bit'],\n",
+       " 'bite': ['▁', 'bit', 'e'],\n",
+       " 'bits': ['▁', 'bit', 's'],\n",
+       " 'bitter-sweet': ['▁', 'bit', 'ter', '-', 's', 'w', 'e', 'et'],\n",
+       " 'bitterest': ['▁', 'bit', 'ter', 'est'],\n",
+       " 'bitterly': ['▁', 'bit', 'ter', 'ly'],\n",
+       " 'bituminized': ['▁', 'bit', 'um', 'in', 'i', 'z', 'ed'],\n",
+       " 'black': ['▁b', 'l', 'a', 'ck'],\n",
+       " 'black-': ['▁b', 'l', 'a', 'ck', '-'],\n",
+       " 'blackbird': ['▁b', 'l', 'a', 'ck', 'b', 'ir', 'd'],\n",
+       " 'blacks': ['▁b', 'l', 'a', 'ck', 's'],\n",
+       " 'blame': ['▁b', 'l', 'a', 'me'],\n",
+       " 'blamed': ['▁b', 'l', 'a', 'm', 'ed'],\n",
+       " 'blander': ['▁b', 'land', 'er'],\n",
+       " 'blank': ['▁b', 'l', 'an', 'k'],\n",
+       " 'blight': ['▁b', 'l', 'ight'],\n",
+       " 'blind': ['▁b', 'lin', 'd'],\n",
+       " 'blinked': ['▁b', 'lin', 'k', 'ed'],\n",
+       " 'block': ['▁block'],\n",
+       " 'blocks': ['▁block', 's'],\n",
+       " 'bloem-': ['▁b', 'lo', 'e', 'm', '-'],\n",
+       " 'blond': ['▁b', 'l', 'on', 'd'],\n",
+       " 'blood': ['▁b', 'l', 'ood'],\n",
+       " 'bloodstained': ['▁b', 'l', 'ood', 'st', 'ain', 'ed'],\n",
+       " 'bloody': ['▁b', 'l', 'ood', 'y'],\n",
+       " 'blouse': ['▁b', 'l', 'ous', 'e'],\n",
+       " 'blouses': ['▁b', 'l', 'ous', 'es'],\n",
+       " 'blow': ['▁b', 'low'],\n",
+       " 'blowflies': ['▁b', 'low', 'f', 'l', 'ies'],\n",
+       " 'blown': ['▁b', 'low', 'n'],\n",
+       " 'blue': ['▁b', 'l', 'u', 'e'],\n",
+       " 'blunt': ['▁b', 'l', 'un', 't'],\n",
+       " 'bluntly': ['▁b', 'l', 'un', 't', 'ly'],\n",
+       " 'bluster': ['▁b', 'l', 'us', 'ter'],\n",
+       " 'board': ['▁bo', 'ard'],\n",
+       " 'boat': ['▁bo', 'at'],\n",
+       " 'boat-train': ['▁bo', 'at', '-', 'train'],\n",
+       " 'bobby': ['▁bo', 'b', 'by'],\n",
+       " 'bodies': ['▁bo', 'd', 'ies'],\n",
+       " 'body': ['▁', 'body'],\n",
+       " 'boeing': ['▁bo', 'e', 'ing'],\n",
+       " 'bogy': ['▁bo', 'g', 'y'],\n",
+       " 'boiled': ['▁bo', 'il', 'ed'],\n",
+       " 'boils': ['▁bo', 'il', 's'],\n",
+       " 'bold': ['▁bo', 'ld'],\n",
+       " 'boldly': ['▁bo', 'ld', 'ly'],\n",
+       " 'bolted': ['▁bo', 'l', 't', 'ed'],\n",
+       " 'bomb': ['▁', 'bomb'],\n",
+       " 'bombay': ['▁', 'bomb', 'a', 'y'],\n",
+       " 'bombed': ['▁', 'bomb', 'ed'],\n",
+       " 'bombers': ['▁bo', 'mber', 's'],\n",
+       " 'bonded': ['▁bo', 'n', 'd', 'ed'],\n",
+       " 'bone': ['▁bo', 'ne'],\n",
+       " 'bones': ['▁bo', 'ne', 's'],\n",
+       " 'bonn': ['▁bo', 'n', 'n'],\n",
+       " 'book': ['▁bo', 'ok'],\n",
+       " 'booklet': ['▁bo', 'ok', 'le', 't'],\n",
+       " 'books': ['▁bo', 'ok', 's'],\n",
+       " 'booming': ['▁bo', 'o', 'm', 'ing'],\n",
+       " 'border': ['▁b', 'or', 'd', 'er'],\n",
+       " 'bore': ['▁bo', 're'],\n",
+       " 'bored': ['▁b', 'or', 'ed'],\n",
+       " 'boredom': ['▁b', 'or', 'ed', 'o', 'm'],\n",
+       " 'bores': ['▁bo', 're', 's'],\n",
+       " 'borough': ['▁b', 'or', 'ough'],\n",
+       " 'borrow': ['▁b', 'or', 'ro', 'w'],\n",
+       " 'borstal': ['▁b', 'or', 'st', 'al'],\n",
+       " 'bosoms': ['▁bo', 's', 'o', 'm', 's'],\n",
+       " 'bossed': ['▁bo', 's', 's', 'ed'],\n",
+       " 'bosses': ['▁bo', 's', 'se', 's'],\n",
+       " 'both': ['▁bo', 'th'],\n",
+       " 'bottle': ['▁bo', 'ttle'],\n",
+       " 'bottom': ['▁bo', 't', 'tom'],\n",
+       " 'bought': ['▁b', 'ought'],\n",
+       " 'boun': ['▁bo', 'un'],\n",
+       " 'bound': ['▁b', 'ound'],\n",
+       " 'boutiques': ['▁b', 'out', 'i', 'q', 'u', 'es'],\n",
+       " 'bow': ['▁bo', 'w'],\n",
+       " 'bow-street': ['▁bo', 'w', '-', 'street'],\n",
+       " 'bowed': ['▁bo', 'w', 'ed'],\n",
+       " 'bowing': ['▁bo', 'w', 'ing'],\n",
+       " 'bows': ['▁bo', 'w', 's'],\n",
+       " 'box': ['▁bo', 'x'],\n",
+       " 'boxes': ['▁bo', 'x', 'es'],\n",
+       " 'boxing': ['▁bo', 'x', 'ing'],\n",
+       " 'boy': ['▁bo', 'y'],\n",
+       " 'boycotted': ['▁bo', 'y', 'c', 'o', 'tt', 'ed'],\n",
+       " 'boycotting': ['▁bo', 'y', 'c', 'o', 'tt', 'ing'],\n",
+       " 'boyle': ['▁bo', 'y', 'le'],\n",
+       " 'boys': ['▁bo', 'y', 's'],\n",
+       " 'braces': ['▁br', 'a', 'ce', 's'],\n",
+       " 'brain': ['▁br', 'ain'],\n",
+       " 'brain-activity': ['▁br', 'ain', '-', 'a', 'ct', 'i', 'v', 'it', 'y'],\n",
+       " 'brain-children': ['▁br', 'ain', '-', 'ch', 'il', 'd', 'ren'],\n",
+       " 'brains': ['▁br', 'ain', 's'],\n",
+       " 'brandy': ['▁br', 'and', 'y'],\n",
+       " 'brash': ['▁br', 'ash'],\n",
+       " 'brass': ['▁br', 'a', 's', 's'],\n",
+       " 'brauchitsch': ['▁br', 'a', 'uch', 'it', 's', 'ch'],\n",
+       " 'breach': ['▁br', 'each'],\n",
+       " 'bread-and-butter': ['▁br', 'e', 'ad', '-', 'and', '-', 'b', 'u', 't', 'ter'],\n",
+       " 'break': ['▁br', 'e', 'ak'],\n",
+       " 'breaking': ['▁br', 'e', 'ak', 'ing'],\n",
+       " 'breaks': ['▁br', 'e', 'ak', 's'],\n",
+       " 'breath': ['▁br', 'e', 'at', 'h'],\n",
+       " 'breathing': ['▁br', 'e', 'a', 'thing'],\n",
+       " 'breathless': ['▁br', 'e', 'at', 'h', 'less'],\n",
+       " 'breeding': ['▁br', 'e', 'ed', 'ing'],\n",
+       " 'breezily': ['▁br', 'e', 'e', 'z', 'i', 'ly'],\n",
+       " 'brenda': ['▁br', 'end', 'a'],\n",
+       " 'brendan': ['▁br', 'end', 'an'],\n",
+       " \"brendan's\": ['▁br', 'end', 'an', \"'\", 's'],\n",
+       " 'brentano': ['▁br', 'ent', 'an', 'o'],\n",
+       " 'brezhnev': ['▁br', 'e', 'z', 'h', 'ne', 'v'],\n",
+       " 'brian': ['▁br', 'ian'],\n",
+       " 'bridal': ['▁', 'brid', 'al'],\n",
+       " 'bride': ['▁', 'brid', 'e'],\n",
+       " 'brief': ['▁brief'],\n",
+       " 'briefcase': ['▁brief', 'case'],\n",
+       " 'briefing': ['▁brief', 'ing'],\n",
+       " 'brigadiers': ['▁br', 'ig', 'ad', 'i', 'ers'],\n",
+       " 'bright': ['▁br', 'ight'],\n",
+       " 'brighter': ['▁br', 'ight', 'er'],\n",
+       " 'brightly': ['▁br', 'ight', 'ly'],\n",
+       " \"brighton's\": ['▁br', 'ight', 'on', \"'\", 's'],\n",
+       " 'brilliant': ['▁br', 'ill', 'ian', 't'],\n",
+       " 'brilliantly': ['▁br', 'ill', 'ian', 't', 'ly'],\n",
+       " 'bring': ['▁br', 'ing'],\n",
+       " 'brings': ['▁br', 'ing', 's'],\n",
+       " 'bristled': ['▁br', 'is', 't', 'led'],\n",
+       " 'britain': ['▁britain'],\n",
+       " \"britain's\": ['▁britain', \"'\", 's'],\n",
+       " 'british': ['▁british'],\n",
+       " 'british-owned': ['▁british', '-', 'ow', 'n', 'ed'],\n",
+       " 'britishers': ['▁british', 'ers'],\n",
+       " 'brittle': ['▁br', 'i', 'ttle'],\n",
+       " 'broad': ['▁br', 'o', 'ad'],\n",
+       " 'broadcast': ['▁br', 'o', 'ad', 'c', 'a', 'st'],\n",
+       " 'broadcasting': ['▁br', 'o', 'ad', 'c', 'a', 'st', 'ing'],\n",
+       " 'broke': ['▁br', 'ok', 'e'],\n",
+       " 'broken': ['▁br', 'ok', 'en'],\n",
+       " \"brook's\": ['▁br', 'o', 'ok', \"'\", 's'],\n",
+       " 'brother': ['▁br', 'other'],\n",
+       " 'brother-': ['▁br', 'other', '-'],\n",
+       " 'brother-in-law': ['▁br', 'other', '-', 'in', '-', 'law'],\n",
+       " 'brought': ['▁br', 'ought'],\n",
+       " 'brown': ['▁brown'],\n",
+       " \"brown's\": ['▁brown', \"'\", 's'],\n",
+       " 'bru\"cke': ['▁br', 'u', '\"', 'ck', 'e'],\n",
+       " 'bruce': ['▁br', 'u', 'ce'],\n",
+       " 'bruno': ['▁br', 'un', 'o'],\n",
+       " 'brunswick': ['▁br', 'un', 's', 'w', 'i', 'ck'],\n",
+       " 'brussels': ['▁br', 'us', 's', 'el', 's'],\n",
+       " 'brutal': ['▁br', 'u', 't', 'al'],\n",
+       " 'bryan': ['▁br', 'y', 'an'],\n",
+       " 'buck': ['▁b', 'u', 'ck'],\n",
+       " 'buckingham': ['▁b', 'u', 'ck', 'ing', 'h', 'a', 'm'],\n",
+       " 'buckley': ['▁b', 'u', 'ck', 'le', 'y'],\n",
+       " 'budge': ['▁b', 'ud', 'ge'],\n",
+       " 'budgerigar': ['▁b', 'ud', 'g', 'er', 'ig', 'ar'],\n",
+       " 'budget': ['▁b', 'ud', 'ge', 't'],\n",
+       " 'budgetary': ['▁b', 'ud', 'ge', 't', 'ar', 'y'],\n",
+       " 'budgette': ['▁b', 'ud', 'ge', 't', 'te'],\n",
+       " 'buganda': ['▁b', 'ug', 'and', 'a'],\n",
+       " 'build': ['▁b', 'u', 'il', 'd'],\n",
+       " 'building': ['▁b', 'u', 'il', 'd', 'ing'],\n",
+       " 'built': ['▁b', 'u', 'il', 't'],\n",
+       " 'bulk': ['▁b', 'ul', 'k'],\n",
+       " 'bulky': ['▁b', 'ul', 'k', 'y'],\n",
+       " 'bulletins': ['▁b', 'ul', 'le', 't', 'in', 's'],\n",
+       " 'bullying': ['▁b', 'ul', 'ly', 'ing'],\n",
+       " 'bumps': ['▁b', 'um', 'p', 's'],\n",
+       " 'bundle': ['▁b', 'un', 'd', 'le'],\n",
+       " 'bundy': ['▁b', 'un', 'd', 'y'],\n",
+       " 'bunk': ['▁b', 'un', 'k'],\n",
+       " 'burden': ['▁burde', 'n'],\n",
+       " 'burdens': ['▁burde', 'n', 's'],\n",
+       " 'burdett': ['▁burde', 'tt'],\n",
+       " 'bureau': ['▁b', 'ure', 'a', 'u'],\n",
+       " 'burge': ['▁b', 'ur', 'ge'],\n",
+       " 'burning': ['▁b', 'ur', 'n', 'ing'],\n",
+       " 'burst': ['▁b', 'ur', 'st'],\n",
+       " 'bush': ['▁bus', 'h'],\n",
+       " 'busi-': ['▁bus', 'i', '-'],\n",
+       " 'business': ['▁bus', 'i', 'ness'],\n",
+       " 'bustling': ['▁bus', 't', 'l', 'ing'],\n",
+       " 'busy': ['▁bus', 'y'],\n",
+       " 'but': ['▁but'],\n",
+       " 'butagas': ['▁but', 'a', 'g', 'a', 's'],\n",
+       " 'butler': ['▁but', 'l', 'er'],\n",
+       " 'butt': ['▁but', 't'],\n",
+       " 'buttery': ['▁but', 'ter', 'y'],\n",
+       " 'button': ['▁but', 'ton'],\n",
+       " 'buy': ['▁b', 'u', 'y'],\n",
+       " 'buying': ['▁b', 'u', 'y', 'ing'],\n",
+       " 'by': ['▁by'],\n",
+       " 'by-': ['▁by', '-'],\n",
+       " 'by-elections': ['▁by', '-', 'elect', 'ion', 's'],\n",
+       " 'by-passed': ['▁by', '-', 'p', 'a', 's', 's', 'ed'],\n",
+       " 'byrne': ['▁by', 'r', 'ne'],\n",
+       " 'byron': ['▁by', 'ro', 'n'],\n",
+       " 'c': ['▁c'],\n",
+       " 'c.c.f.': ['▁c', '.', 'c', '.', 'f', '.'],\n",
+       " 'c.i.g.s.': ['▁c', '.', 'i', '.', 'g', '.', 's', '.'],\n",
+       " 'cabinet': ['▁c', 'a', 'b', 'ine', 't'],\n",
+       " 'cable': ['▁c', 'able'],\n",
+       " 'cables': ['▁c', 'able', 's'],\n",
+       " 'cagnes': ['▁c', 'a', 'g', 'ne', 's'],\n",
+       " 'call': ['▁call'],\n",
+       " 'call-out': ['▁call', '-', 'out'],\n",
+       " 'callaghan': ['▁call', 'a', 'g', 'h', 'an'],\n",
+       " 'called': ['▁call', 'ed'],\n",
+       " 'calls': ['▁call', 's'],\n",
+       " 'cally': ['▁call', 'y'],\n",
+       " 'calm': ['▁c', 'al', 'm'],\n",
+       " 'calorie': ['▁c', 'al', 'or', 'ie'],\n",
+       " 'cambodia': ['▁cam', 'bo', 'd', 'i', 'a'],\n",
+       " 'cambridge': ['▁cam', 'brid', 'ge'],\n",
+       " 'came': ['▁cam', 'e'],\n",
+       " 'camp': ['▁cam', 'p'],\n",
+       " 'campaign': ['▁campaign'],\n",
+       " 'campaigns': ['▁campaign', 's'],\n",
+       " 'can': ['▁can'],\n",
+       " \"can't\": ['▁can', \"'\", 't'],\n",
+       " 'can-': ['▁can', '-'],\n",
+       " 'canada': ['▁can', 'ad', 'a'],\n",
+       " 'canadian': ['▁can', 'ad', 'ian'],\n",
+       " 'canadians': ['▁can', 'ad', 'ian', 's'],\n",
+       " 'canals': ['▁can', 'al', 's'],\n",
+       " 'canard': ['▁can', 'ard'],\n",
+       " 'cancelled': ['▁can', 'ce', 'll', 'ed'],\n",
+       " 'cancerous': ['▁can', 'c', 'er', 'ous'],\n",
+       " 'candi-': ['▁can', 'd', 'i', '-'],\n",
+       " 'candidate': ['▁can', 'did', 'at', 'e'],\n",
+       " 'candles': ['▁can', 'd', 'le', 's'],\n",
+       " 'cannot': ['▁can', 'not'],\n",
+       " 'canon': ['▁can', 'on'],\n",
+       " 'cantabile': ['▁can', 't', 'a', 'b', 'il', 'e'],\n",
+       " 'canteen': ['▁can', 'te', 'en'],\n",
+       " ...}"
+      ]
+     },
+     "execution_count": 14,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "processor.lexicon"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 167,
diff --git a/notebooks/04b-look-at-iam-paragraphs.ipynb b/notebooks/04b-look-at-iam-paragraphs.ipynb
index dc0aef6..11ebddf 100644
--- a/notebooks/04b-look-at-iam-paragraphs.ipynb
+++ b/notebooks/04b-look-at-iam-paragraphs.ipynb
@@ -256,7 +256,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.2"
+   "version": "3.9.1"
   }
  },
  "nbformat": 4,
diff --git a/text_recognizer/data/iam_preprocessor.py b/text_recognizer/data/iam_preprocessor.py
index d85787e..60f8a9f 100644
--- a/text_recognizer/data/iam_preprocessor.py
+++ b/text_recognizer/data/iam_preprocessor.py
@@ -119,7 +119,6 @@ class Preprocessor:
                     continue
                 self.text.append(example["text"].lower())
 
-
     def _to_index(self, line: str) -> torch.LongTensor:
         if line in self.special_tokens:
             return torch.LongTensor([self.tokens_to_index[line]])
diff --git a/text_recognizer/data/mapping.py b/text_recognizer/data/mapping.py
deleted file mode 100644
index f0edf3f..0000000
--- a/text_recognizer/data/mapping.py
+++ /dev/null
@@ -1,8 +0,0 @@
-"""Mapping to and from word pieces."""
-from pathlib import Path
-
-
-class WordPieces:
-
-    def __init__(self) -> None:
-        pass
diff --git a/text_recognizer/data/mappings.py b/text_recognizer/data/mappings.py
new file mode 100644
index 0000000..cfa0ec7
--- /dev/null
+++ b/text_recognizer/data/mappings.py
@@ -0,0 +1,143 @@
+"""Mapping to and from word pieces."""
+from abc import ABC, abstractmethod
+from pathlib import Path
+from typing import List, Optional, Union, Sequence
+
+from loguru import logger
+import torch
+from torch import Tensor
+
+from text_recognizer.data.emnist import emnist_mapping
+from text_recognizer.data.iam_preprocessor import Preprocessor
+
+
+class AbstractMapping(ABC):
+    @abstractmethod
+    def get_token(self, *args, **kwargs) -> str:
+        ...
+
+    @abstractmethod
+    def get_index(self, *args, **kwargs) -> Tensor:
+        ...
+
+    @abstractmethod
+    def get_text(self, *args, **kwargs) -> str:
+        ...
+
+    @abstractmethod
+    def get_indices(self, *args, **kwargs) -> Tensor:
+        ...
+
+
+class EmnistMapping(AbstractMapping):
+    def __init__(self, extra_symbols: Optional[Sequence[str]]) -> None:
+        self.mapping, self.inverse_mapping, self.input_size = emnist_mapping(
+            extra_symbols
+        )
+
+    def get_token(self, index: Union[int, Tensor]) -> str:
+        if (index := int(index)) in self.mapping:
+            return self.mapping[index]
+        raise KeyError(f"Index ({index}) not in mapping.")
+
+    def get_index(self, token: str) -> Tensor:
+        if token in self.inverse_mapping:
+            return torch.LongTensor([self.inverse_mapping[token]])  # 1-element
+        raise KeyError(f"Token ({token}) not found in inverse mapping.")
+
+    def get_text(self, indices: Union[List[int], Tensor]) -> str:
+        if isinstance(indices, Tensor):
+            indices = indices.tolist()
+        return "".join([self.mapping[index] for index in indices])
+
+    def get_indices(self, text: str) -> Tensor:
+        return torch.LongTensor([self.inverse_mapping[token] for token in text])
+
+
+class WordPieceMapping(EmnistMapping):
+    def __init__(
+        self,
+        num_features: int,
+        tokens: str,
+        lexicon: str,
+        data_dir: Optional[Union[str, Path]] = None,
+        use_words: bool = False,
+        prepend_wordsep: bool = False,
+        special_tokens: Sequence[str] = ("<s>", "<e>", "<p>"),
+        extra_symbols: Optional[Sequence[str]] = None,
+    ) -> None:
+        super().__init__(extra_symbols)
+        self.wordpiece_processor = self._configure_wordpiece_processor(
+            num_features,
+            tokens,
+            lexicon,
+            data_dir,
+            use_words,
+            prepend_wordsep,
+            special_tokens,
+            extra_symbols,
+        )
+
+    def _configure_wordpiece_processor(
+        self,
+        num_features: int,
+        tokens: str,
+        lexicon: str,
+        data_dir: Optional[Union[str, Path]],
+        use_words: bool,
+        prepend_wordsep: bool,
+        special_tokens: Optional[Sequence[str]],
+        extra_symbols: Optional[Sequence[str]],
+    ) -> Preprocessor:
+        data_dir = (
+            (Path(__file__).resolve().parents[2] / "data" / "raw" / "iam" / "iamdb")
+            if data_dir is None
+            else Path(data_dir)
+        )
+
+        logger.debug(f"Using data dir: {data_dir}")
+        if not data_dir.exists():
+            raise RuntimeError(f"Could not locate iamdb directory at {data_dir}")
+
+        processed_path = (
+            Path(__file__).resolve().parents[2] / "data" / "processed" / "iam_lines"
+        )
+
+        tokens_path = processed_path / tokens
+        lexicon_path = processed_path / lexicon
+        # Concatenate into a new tuple; += fails on tuple+list and mutates lists.
+        if extra_symbols is not None:
+            special_tokens = (*(special_tokens or ()), *extra_symbols)
+
+        return Preprocessor(
+            data_dir,
+            num_features,
+            tokens_path,
+            lexicon_path,
+            use_words,
+            prepend_wordsep,
+            special_tokens,
+        )
+
+    def get_token(self, index: Union[int, Tensor]) -> str:
+        if 0 <= (index := int(index)) < len(self.wordpiece_processor.tokens):
+            return self.wordpiece_processor.tokens[index]
+        raise KeyError(f"Index ({index}) not in mapping.")
+
+    def get_index(self, token: str) -> Tensor:
+        if token in self.wordpiece_processor.tokens:
+            return torch.LongTensor([self.wordpiece_processor.tokens_to_index[token]])
+        raise KeyError(f"Token ({token}) not found in inverse mapping.")
+
+    def get_text(self, indices: Union[List[int], Tensor]) -> str:
+        if isinstance(indices, Tensor):
+            indices = indices.tolist()
+        return self.wordpiece_processor.to_text(indices)
+
+    def get_indices(self, text: str) -> Tensor:
+        return self.wordpiece_processor.to_index(text)
+
+    def emnist_to_wordpiece_indices(self, x: Tensor) -> Tensor:
+        text = super().get_text(x)  # EMNIST decoding; self.mapping has no get_text
+        text = text.lower().replace(" ", "▁")
+        return torch.LongTensor(self.wordpiece_processor.to_index(text))
diff --git a/text_recognizer/data/transforms.py b/text_recognizer/data/transforms.py
index 297c953..f53df64 100644
--- a/text_recognizer/data/transforms.py
+++ b/text_recognizer/data/transforms.py
@@ -1,115 +1,36 @@
 """Transforms for PyTorch datasets."""
-from abc import abstractmethod
 from pathlib import Path
-from typing import Any, Optional, Union
+from typing import Optional, Union, Sequence
 
-from loguru import logger
-import torch
 from torch import Tensor
 
-from text_recognizer.datasets.iam_preprocessor import Preprocessor
-from text_recognizer.data.emnist import emnist_mapping
+from text_recognizer.data.mappings import WordPieceMapping
 
 
-class ToLower:
-    """Converts target to lower case."""
-
-    def __call__(self, target: Tensor) -> Tensor:
-        """Corrects index value in target tensor."""
-        device = target.device
-        return torch.stack([x - 26 if x > 35 else x for x in target]).to(device)
-
-
-class ToCharcters:
-    """Converts integers to characters."""
-
-    def __init__(self, extra_symbols: Optional[List[str]] = None) -> None:
-        self.mapping, _, _ = emnist_mapping(extra_symbols)
-
-    def __call__(self, y: Tensor) -> str:
-        """Converts a Tensor to a str."""
-        return "".join([self.mapping[int(i)] for i in y]).replace(" ", "▁")
-
-
-class WordPieces:
-    """Abstract transform for word pieces."""
+class WordPiece:
+    """Converts EMNIST indices to Word Piece indices."""
 
     def __init__(
         self,
         num_features: int,
+        tokens: str,
+        lexicon: str,
         data_dir: Optional[Union[str, Path]] = None,
-        tokens: Optional[Union[str, Path]] = None,
-        lexicon: Optional[Union[str, Path]] = None,
         use_words: bool = False,
         prepend_wordsep: bool = False,
+        special_tokens: Sequence[str] = ("<s>", "<e>", "<p>"),
+        extra_symbols: Optional[Sequence[str]] = None,
     ) -> None:
-        if data_dir is None:
-            data_dir = (
-                Path(__file__).resolve().parents[3] / "data" / "raw" / "iam" / "iamdb"
-            )
-            logger.debug(f"Using data dir: {data_dir}")
-            if not data_dir.exists():
-                raise RuntimeError(f"Could not locate iamdb directory at {data_dir}")
-        else:
-            data_dir = Path(data_dir)
-        processed_path = (
-            Path(__file__).resolve().parents[3] / "data" / "processed" / "iam_lines"
-        )
-        tokens_path = processed_path / tokens
-        lexicon_path = processed_path / lexicon
-
-        self.preprocessor = Preprocessor(
-            data_dir,
+        self.mapping = WordPieceMapping(
             num_features,
-            tokens_path,
-            lexicon_path,
+            tokens,
+            lexicon,
+            data_dir,
             use_words,
             prepend_wordsep,
+            special_tokens,
+            extra_symbols,
         )
 
-    @abstractmethod
-    def __call__(self, *args, **kwargs) -> Any:
-        """Transforms input."""
-        ...
-
-
-class ToWordPieces(WordPieces):
-    """Transforms str to word pieces."""
-
-    def __init__(
-        self,
-        num_features: int,
-        data_dir: Optional[Union[str, Path]] = None,
-        tokens: Optional[Union[str, Path]] = None,
-        lexicon: Optional[Union[str, Path]] = None,
-        use_words: bool = False,
-        prepend_wordsep: bool = False,
-    ) -> None:
-        super().__init__(
-            num_features, data_dir, tokens, lexicon, use_words, prepend_wordsep
-        )
-
-    def __call__(self, line: str) -> Tensor:
-        """Transforms str to word pieces."""
-        return self.preprocessor.to_index(line)
-
-
-class ToText(WordPieces):
-    """Takes word pieces and converts them to text."""
-
-    def __init__(
-        self,
-        num_features: int,
-        data_dir: Optional[Union[str, Path]] = None,
-        tokens: Optional[Union[str, Path]] = None,
-        lexicon: Optional[Union[str, Path]] = None,
-        use_words: bool = False,
-        prepend_wordsep: bool = False,
-    ) -> None:
-        super().__init__(
-            num_features, data_dir, tokens, lexicon, use_words, prepend_wordsep
-        )
-
-    def __call__(self, x: Tensor) -> str:
-        """Converts tensor to text."""
-        return self.preprocessor.to_text(x.tolist())
+    def __call__(self, x: Tensor) -> Tensor:
+        return self.mapping.emnist_to_wordpiece_indices(x)
diff --git a/text_recognizer/models/base.py b/text_recognizer/models/base.py
index c6d5d73..aeda039 100644
--- a/text_recognizer/models/base.py
+++ b/text_recognizer/models/base.py
@@ -49,7 +49,9 @@ class LitBaseModel(pl.LightningModule):
             optimizer_class = getattr(torch.optim, self._optimizer.type)
         return optimizer_class(params=self.parameters(), **args)
 
-    def _configure_lr_scheduler(self, optimizer: Type[torch.optim.Optimizer]) -> Dict[str, Any]:
+    def _configure_lr_scheduler(
+        self, optimizer: Type[torch.optim.Optimizer]
+    ) -> Dict[str, Any]:
         """Configures the lr scheduler."""
         scheduler = {"monitor": self.monitor}
         args = {} or self._lr_scheduler.args
@@ -59,7 +61,7 @@ class LitBaseModel(pl.LightningModule):
 
         scheduler["scheduler"] = getattr(
             torch.optim.lr_scheduler, self._lr_scheduler.type
-            )(optimizer, **args)
+        )(optimizer, **args)
 
         return scheduler
 
diff --git a/text_recognizer/networks/image_transformer.py b/text_recognizer/networks/image_transformer.py
index daededa..a6aaca4 100644
--- a/text_recognizer/networks/image_transformer.py
+++ b/text_recognizer/networks/image_transformer.py
@@ -44,7 +44,9 @@ class ImageTransformer(nn.Module):
         dropout_rate: float = 0.1,
         transformer_activation: str = "glu",
     ) -> None:
-        self.vocab_size = NUM_WORD_PIECES + NUM_SPECIAL_TOKENS if vocab_size is None else vocab_size
+        self.vocab_size = (
+            NUM_WORD_PIECES + NUM_SPECIAL_TOKENS if vocab_size is None else vocab_size
+        )
         self.hidden_dim = hidden_dim
         self.max_output_length = output_shape[0]
 
diff --git a/training/configs/image_transformer.yaml b/training/configs/image_transformer.yaml
index 88c05c2..228e53f 100644
--- a/training/configs/image_transformer.yaml
+++ b/training/configs/image_transformer.yaml
@@ -56,6 +56,7 @@ callbacks:
           args:
                   monitor: val_loss
                   mode: min
+                  save_last: true
         - type: StochasticWeightAveraging
           args:
                   swa_epoch_start: 0.8
diff --git a/training/run_experiment.py b/training/run_experiment.py
index f46803f..1e79461 100644
--- a/training/run_experiment.py
+++ b/training/run_experiment.py
@@ -15,7 +15,7 @@ import wandb
 
 SEED = 4711
 CONFIGS_DIRNAME = Path(__file__).parent.resolve() / "configs"
-LOGS_DIRNAME = Path(__file__).parent.resolve() / "runs" / "logs"
+LOGS_DIRNAME = Path(__file__).parent.resolve() / "logs"
 
 
 def _configure_logging(log_dir: Optional[Path], verbose: int = 0) -> None:
@@ -67,7 +67,7 @@ def _configure_callbacks(
 
 
 def _configure_logger(
-        network: Type[nn.Module], args: Dict, log_dir: str, use_wandb: bool
+    network: Type[nn.Module], args: Dict, log_dir: str, use_wandb: bool
 ) -> Type[pl.loggers.LightningLoggerBase]:
     """Configures lightning logger."""
     if use_wandb:
@@ -124,7 +124,6 @@ def run(
     _configure_logging(None, verbose=verbose)
     logger.info("Starting experiment...")
 
-
     # Seed everything in the experiment.
     logger.info(f"Seeding everthing with seed={SEED}")
     pl.utilities.seed.seed_everything(SEED)
-- 
cgit v1.2.3-70-g09d2