summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGustaf Rydholm <gustaf.rydholm@gmail.com>2021-04-15 22:05:24 +0200
committerGustaf Rydholm <gustaf.rydholm@gmail.com>2021-04-15 22:05:24 +0200
commitffa4be4bf4e3758e01d52a9c1f354a05a90b93de (patch)
tree33a0f51f60cb181ffc562b562a7d09fcc70c5cf5
parent0ab820d3595e4f17d4f1f3c310e186692f65cc67 (diff)
Created mappings
-rw-r--r--notebooks/03-look-at-iam-paragraphs.ipynb2126
-rw-r--r--notebooks/04b-look-at-iam-paragraphs.ipynb2
-rw-r--r--text_recognizer/data/iam_preprocessor.py1
-rw-r--r--text_recognizer/data/mapping.py8
-rw-r--r--text_recognizer/data/mappings.py143
-rw-r--r--text_recognizer/data/transforms.py111
-rw-r--r--text_recognizer/models/base.py6
-rw-r--r--text_recognizer/networks/image_transformer.py4
-rw-r--r--training/configs/image_transformer.yaml1
-rw-r--r--training/run_experiment.py5
10 files changed, 2293 insertions, 114 deletions
diff --git a/notebooks/03-look-at-iam-paragraphs.ipynb b/notebooks/03-look-at-iam-paragraphs.ipynb
index cfa0ba5..7599e11 100644
--- a/notebooks/03-look-at-iam-paragraphs.ipynb
+++ b/notebooks/03-look-at-iam-paragraphs.ipynb
@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": 2,
"id": "6ce2519f",
"metadata": {},
"outputs": [],
@@ -212,7 +212,7 @@
},
{
"cell_type": "code",
- "execution_count": 5,
+ "execution_count": 3,
"id": "45649194",
"metadata": {},
"outputs": [],
@@ -223,7 +223,7 @@
},
{
"cell_type": "code",
- "execution_count": 6,
+ "execution_count": 11,
"id": "0fc13f9f",
"metadata": {},
"outputs": [],
@@ -241,6 +241,1105 @@
},
{
"cell_type": "code",
+ "execution_count": 12,
+ "id": "c70e8873",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['!',\n",
+ " '\"',\n",
+ " '&',\n",
+ " \"'\",\n",
+ " '(',\n",
+ " ')',\n",
+ " '*',\n",
+ " '+',\n",
+ " ',',\n",
+ " '-',\n",
+ " '.',\n",
+ " '/',\n",
+ " '0',\n",
+ " '1',\n",
+ " '2',\n",
+ " '3',\n",
+ " '4',\n",
+ " '5',\n",
+ " '6',\n",
+ " '7',\n",
+ " '8',\n",
+ " '9',\n",
+ " ':',\n",
+ " ';',\n",
+ " '?',\n",
+ " 'a',\n",
+ " 'b',\n",
+ " 'c',\n",
+ " 'd',\n",
+ " 'e',\n",
+ " 'f',\n",
+ " 'g',\n",
+ " 'h',\n",
+ " 'i',\n",
+ " 'j',\n",
+ " 'k',\n",
+ " 'l',\n",
+ " 'm',\n",
+ " 'n',\n",
+ " 'o',\n",
+ " 'p',\n",
+ " 'q',\n",
+ " 'r',\n",
+ " 's',\n",
+ " 't',\n",
+ " 'u',\n",
+ " 'v',\n",
+ " 'w',\n",
+ " 'x',\n",
+ " 'y',\n",
+ " 'z',\n",
+ " '▁',\n",
+ " '<s>',\n",
+ " '<e>',\n",
+ " '<p>',\n",
+ " '\\n']"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "processor.graphemes"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "id": "6def52f6",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['/',\n",
+ " '▁',\n",
+ " 's',\n",
+ " '▁the',\n",
+ " ',',\n",
+ " '▁.',\n",
+ " '▁a',\n",
+ " 'e',\n",
+ " '▁to',\n",
+ " 'ed',\n",
+ " '▁of',\n",
+ " '▁in',\n",
+ " 'ing',\n",
+ " 'y',\n",
+ " '▁and',\n",
+ " 't',\n",
+ " 'm',\n",
+ " 'd',\n",
+ " '▁be',\n",
+ " 'at',\n",
+ " 'a',\n",
+ " 'n',\n",
+ " '▁th',\n",
+ " 'i',\n",
+ " '▁he',\n",
+ " 'p',\n",
+ " 'o',\n",
+ " 'en',\n",
+ " 'al',\n",
+ " '▁is',\n",
+ " '-',\n",
+ " 'ly',\n",
+ " '.',\n",
+ " '▁w',\n",
+ " 'c',\n",
+ " '▁for',\n",
+ " 'l',\n",
+ " '\"',\n",
+ " '▁it',\n",
+ " 'er',\n",
+ " '▁was',\n",
+ " 'u',\n",
+ " '▁on',\n",
+ " '▁with',\n",
+ " \"'\",\n",
+ " '▁his',\n",
+ " 'g',\n",
+ " 'h',\n",
+ " '▁had',\n",
+ " '▁mr',\n",
+ " 'in',\n",
+ " '▁i',\n",
+ " 'or',\n",
+ " '▁f',\n",
+ " 'r',\n",
+ " \"▁'\",\n",
+ " '▁but',\n",
+ " '▁by',\n",
+ " 'ar',\n",
+ " '▁as',\n",
+ " '▁not',\n",
+ " 'b',\n",
+ " 'it',\n",
+ " 'ir',\n",
+ " 'v',\n",
+ " '▁re',\n",
+ " 'st',\n",
+ " '▁an',\n",
+ " '▁has',\n",
+ " '▁are',\n",
+ " 'this',\n",
+ " 'f',\n",
+ " 'on',\n",
+ " 'hich',\n",
+ " 'ce',\n",
+ " '▁who',\n",
+ " '▁have',\n",
+ " 'ion',\n",
+ " 'ch',\n",
+ " '▁p',\n",
+ " 're',\n",
+ " '▁at',\n",
+ " '▁de',\n",
+ " '▁m',\n",
+ " 'w',\n",
+ " 'le',\n",
+ " 'ri',\n",
+ " '▁so',\n",
+ " '▁fro',\n",
+ " 'ation',\n",
+ " 'ic',\n",
+ " '▁do',\n",
+ " 'ould',\n",
+ " '▁will',\n",
+ " '▁b',\n",
+ " 'k',\n",
+ " 'is',\n",
+ " 'ment',\n",
+ " 'il',\n",
+ " '▁st',\n",
+ " '▁no',\n",
+ " 'an',\n",
+ " 'ter',\n",
+ " 'se',\n",
+ " '▁we',\n",
+ " 'ur',\n",
+ " 'ies',\n",
+ " '▁un',\n",
+ " '▁her',\n",
+ " 'th',\n",
+ " 'es',\n",
+ " 'ent',\n",
+ " 'et',\n",
+ " '▁-',\n",
+ " '▁all',\n",
+ " '▁c',\n",
+ " '▁there',\n",
+ " 'un',\n",
+ " 've',\n",
+ " '▁can',\n",
+ " 'ne',\n",
+ " '▁up',\n",
+ " '▁one',\n",
+ " '▁were',\n",
+ " '▁me',\n",
+ " '▁br',\n",
+ " '▁com',\n",
+ " '▁more',\n",
+ " '▁se',\n",
+ " 'ro',\n",
+ " '▁per',\n",
+ " 'ld',\n",
+ " 'ge',\n",
+ " '▁man',\n",
+ " 'est',\n",
+ " '▁pa',\n",
+ " 'ul',\n",
+ " 'el',\n",
+ " 'if',\n",
+ " '▁government',\n",
+ " '▁or',\n",
+ " '▁she',\n",
+ " 'ill',\n",
+ " '▁co',\n",
+ " '▁year',\n",
+ " 'ad',\n",
+ " '▁bo',\n",
+ " 'te',\n",
+ " '▁than',\n",
+ " '▁r',\n",
+ " 'll',\n",
+ " 'able',\n",
+ " 'ous',\n",
+ " '▁what',\n",
+ " '▁out',\n",
+ " 'ck',\n",
+ " '!',\n",
+ " '▁new',\n",
+ " '▁dis',\n",
+ " 'lo',\n",
+ " 'ra',\n",
+ " 'uch',\n",
+ " '▁him',\n",
+ " 'age',\n",
+ " 'ct',\n",
+ " '▁go',\n",
+ " '▁con',\n",
+ " 'day',\n",
+ " 'z',\n",
+ " 'op',\n",
+ " 'ther',\n",
+ " '▁you',\n",
+ " '▁dr',\n",
+ " 'us',\n",
+ " '▁1',\n",
+ " '▁may',\n",
+ " 'ers',\n",
+ " ':',\n",
+ " '▁first',\n",
+ " 'li',\n",
+ " '0',\n",
+ " 'end',\n",
+ " 'ng',\n",
+ " '▁people',\n",
+ " '▁some',\n",
+ " 'our',\n",
+ " '▁anglesey',\n",
+ " '▁two',\n",
+ " '▁wh',\n",
+ " 'tion',\n",
+ " '▁lo',\n",
+ " '▁sir',\n",
+ " 'ig',\n",
+ " 'ain',\n",
+ " 'tic',\n",
+ " '▁now',\n",
+ " 'ive',\n",
+ " 'and',\n",
+ " '▁whe',\n",
+ " ';',\n",
+ " 'ry',\n",
+ " 'um',\n",
+ " '▁other',\n",
+ " '▁su',\n",
+ " '▁ho',\n",
+ " '▁ha',\n",
+ " '▁ra',\n",
+ " '▁even',\n",
+ " 'ish',\n",
+ " 'ard',\n",
+ " 'wn',\n",
+ " '▁li',\n",
+ " 'ten',\n",
+ " 'ok',\n",
+ " '▁k',\n",
+ " 'ver',\n",
+ " 'into',\n",
+ " '▁na',\n",
+ " '▁vi',\n",
+ " '▁po',\n",
+ " '▁common',\n",
+ " 'ight',\n",
+ " '▁over',\n",
+ " '▁like',\n",
+ " '▁mad',\n",
+ " 'ud',\n",
+ " 'ow',\n",
+ " '▁any',\n",
+ " '▁again',\n",
+ " '▁great',\n",
+ " '▁last',\n",
+ " 'ough',\n",
+ " 'ose',\n",
+ " 'ant',\n",
+ " '▁under',\n",
+ " 'ful',\n",
+ " 'self',\n",
+ " '▁how',\n",
+ " '▁sai',\n",
+ " 'x',\n",
+ " '▁most',\n",
+ " 'ine',\n",
+ " '▁give',\n",
+ " '▁about',\n",
+ " 'ance',\n",
+ " 'me',\n",
+ " 'fore',\n",
+ " 'cu',\n",
+ " '(',\n",
+ " '▁minister',\n",
+ " '▁labour',\n",
+ " '▁talk',\n",
+ " '▁work',\n",
+ " '▁take',\n",
+ " '▁writ',\n",
+ " '▁whil',\n",
+ " '▁well',\n",
+ " '▁plan',\n",
+ " '▁la',\n",
+ " '▁pri',\n",
+ " '▁play',\n",
+ " '▁house',\n",
+ " 'mber',\n",
+ " '▁ex',\n",
+ " '▁good',\n",
+ " 'ern',\n",
+ " 'ple',\n",
+ " 'ble',\n",
+ " '▁german',\n",
+ " 'ak',\n",
+ " 'ever',\n",
+ " '▁time',\n",
+ " '▁pro',\n",
+ " '▁ga',\n",
+ " '▁germany',\n",
+ " '▁britain',\n",
+ " '▁should',\n",
+ " '▁polic',\n",
+ " '▁found',\n",
+ " '▁part',\n",
+ " 'ood',\n",
+ " '▁g',\n",
+ " '▁west',\n",
+ " 'thing',\n",
+ " '▁cur',\n",
+ " '▁know',\n",
+ " '▁back',\n",
+ " '▁ma',\n",
+ " '▁show',\n",
+ " ')',\n",
+ " 'ure',\n",
+ " 'ian',\n",
+ " 'app',\n",
+ " '▁ever',\n",
+ " '▁use',\n",
+ " 'ally',\n",
+ " 'led',\n",
+ " '▁miss',\n",
+ " '▁car',\n",
+ " '▁nation',\n",
+ " 'ttle',\n",
+ " '▁end',\n",
+ " '▁gr',\n",
+ " '▁must',\n",
+ " '▁tr',\n",
+ " '▁du',\n",
+ " '▁get',\n",
+ " '▁unit',\n",
+ " 'less',\n",
+ " '▁agree',\n",
+ " 'ie',\n",
+ " 'tom',\n",
+ " '..',\n",
+ " '▁di',\n",
+ " '▁men',\n",
+ " '4',\n",
+ " '▁my',\n",
+ " '▁today',\n",
+ " 'ves',\n",
+ " 'hap',\n",
+ " 'tory',\n",
+ " 'ence',\n",
+ " 'out',\n",
+ " '▁its',\n",
+ " '8',\n",
+ " '▁believe',\n",
+ " '▁say',\n",
+ " '▁make',\n",
+ " 'for',\n",
+ " '▁them',\n",
+ " '▁president',\n",
+ " '▁return',\n",
+ " '▁british',\n",
+ " '▁after',\n",
+ " '▁earl',\n",
+ " '▁here',\n",
+ " 'ton',\n",
+ " 'ext',\n",
+ " '▁life',\n",
+ " '▁put',\n",
+ " '▁war',\n",
+ " '▁2',\n",
+ " '▁cont',\n",
+ " '▁every',\n",
+ " '▁god',\n",
+ " '▁strong',\n",
+ " 'ren',\n",
+ " 'ty',\n",
+ " '▁mon',\n",
+ " 'man',\n",
+ " 'very',\n",
+ " '5',\n",
+ " '6',\n",
+ " '▁yester',\n",
+ " '▁week',\n",
+ " '▁film',\n",
+ " 'ual',\n",
+ " '▁need',\n",
+ " 'lso',\n",
+ " 'aid',\n",
+ " 'law',\n",
+ " 'ward',\n",
+ " '▁ask',\n",
+ " 'bo',\n",
+ " '▁pre',\n",
+ " '▁conference',\n",
+ " '▁real',\n",
+ " '▁roy',\n",
+ " 'ince',\n",
+ " '▁us',\n",
+ " 'ke',\n",
+ " '▁franc',\n",
+ " '▁direct',\n",
+ " '▁child',\n",
+ " 'cause',\n",
+ " '▁long',\n",
+ " '▁action',\n",
+ " '▁hope',\n",
+ " '▁live',\n",
+ " '▁see',\n",
+ " '▁point',\n",
+ " '▁char',\n",
+ " '▁way',\n",
+ " 'val',\n",
+ " '▁add',\n",
+ " '▁ba',\n",
+ " 'tt',\n",
+ " 'qui',\n",
+ " 'ism',\n",
+ " 'other',\n",
+ " '▁character',\n",
+ " '&',\n",
+ " '▁kennedy',\n",
+ " '▁macmillan',\n",
+ " '▁service',\n",
+ " '▁open',\n",
+ " '▁tri',\n",
+ " '▁country',\n",
+ " 'orn',\n",
+ " 'abl',\n",
+ " '▁off',\n",
+ " '▁fac',\n",
+ " '▁effective',\n",
+ " 'rote',\n",
+ " '▁let',\n",
+ " 'each',\n",
+ " '▁cent',\n",
+ " '▁party',\n",
+ " '▁act',\n",
+ " '▁prim',\n",
+ " '▁str',\n",
+ " 'amp',\n",
+ " '▁3',\n",
+ " '▁describe',\n",
+ " '▁europe',\n",
+ " '▁through',\n",
+ " '▁medic',\n",
+ " '▁home',\n",
+ " 'ug',\n",
+ " '▁african',\n",
+ " 'ind',\n",
+ " '▁prop',\n",
+ " 'by',\n",
+ " '▁thre',\n",
+ " 'ndon',\n",
+ " 'ative',\n",
+ " '▁sit',\n",
+ " ',000',\n",
+ " '▁high',\n",
+ " '▁secret',\n",
+ " '▁night',\n",
+ " '▁fr',\n",
+ " 'health',\n",
+ " '▁call',\n",
+ " '▁north',\n",
+ " '▁mean',\n",
+ " '▁seem',\n",
+ " '▁lea',\n",
+ " 'tire',\n",
+ " 'lway',\n",
+ " 'ness',\n",
+ " '▁rhodesia',\n",
+ " '▁195',\n",
+ " '▁hand',\n",
+ " '▁federa',\n",
+ " '▁place',\n",
+ " 'round',\n",
+ " '▁far',\n",
+ " 'ship',\n",
+ " 'low',\n",
+ " '▁exa',\n",
+ " 'ip',\n",
+ " '▁change',\n",
+ " '▁patient',\n",
+ " '▁lord',\n",
+ " '▁arriv',\n",
+ " '▁trem',\n",
+ " '▁pear',\n",
+ " 'ring',\n",
+ " '▁near',\n",
+ " 'land',\n",
+ " 'ress',\n",
+ " '▁clo',\n",
+ " '▁did',\n",
+ " 'later',\n",
+ " 'ius',\n",
+ " 'tho',\n",
+ " 'well',\n",
+ " '7',\n",
+ " '▁enjoy',\n",
+ " '▁success',\n",
+ " '▁appear',\n",
+ " '▁personal',\n",
+ " '▁move',\n",
+ " '▁treat',\n",
+ " '▁better',\n",
+ " 'case',\n",
+ " '▁never',\n",
+ " '▁look',\n",
+ " 'ture',\n",
+ " 'way',\n",
+ " 'mark',\n",
+ " 'ought',\n",
+ " 'tu',\n",
+ " '▁hu',\n",
+ " 'twe',\n",
+ " '▁foreign',\n",
+ " '▁number',\n",
+ " '▁fall',\n",
+ " '▁month',\n",
+ " '▁young',\n",
+ " 'ound',\n",
+ " '▁find',\n",
+ " '▁american',\n",
+ " '▁thought',\n",
+ " '▁state',\n",
+ " 'ilit',\n",
+ " '▁cit',\n",
+ " '▁side',\n",
+ " '▁centr',\n",
+ " 'lace',\n",
+ " 'ash',\n",
+ " '▁old',\n",
+ " 'rome',\n",
+ " '▁subject',\n",
+ " '▁support',\n",
+ " '▁world',\n",
+ " '▁word',\n",
+ " 'different',\n",
+ " '▁report',\n",
+ " '1',\n",
+ " '▁tru',\n",
+ " '▁sent',\n",
+ " '▁best',\n",
+ " '▁symp',\n",
+ " '▁just',\n",
+ " '▁richards',\n",
+ " '▁produc',\n",
+ " '▁left',\n",
+ " '▁clear',\n",
+ " '▁power',\n",
+ " 'ber',\n",
+ " '▁few',\n",
+ " 'solut',\n",
+ " 'tack',\n",
+ " '▁right',\n",
+ " '▁mar',\n",
+ " '▁might',\n",
+ " '3',\n",
+ " 'not',\n",
+ " '▁full',\n",
+ " '▁union',\n",
+ " '▁base',\n",
+ " '▁head',\n",
+ " 'ealth',\n",
+ " 'many',\n",
+ " '▁happen',\n",
+ " '▁mauro',\n",
+ " 'vision',\n",
+ " '▁inter',\n",
+ " '▁ye',\n",
+ " 'where',\n",
+ " '▁sail',\n",
+ " '▁past',\n",
+ " '▁your',\n",
+ " '▁want',\n",
+ " '▁stor',\n",
+ " 'went',\n",
+ " '▁eas',\n",
+ " '▁rem',\n",
+ " '▁nuclear',\n",
+ " '▁small',\n",
+ " '▁discuss',\n",
+ " 'spect',\n",
+ " '▁girl',\n",
+ " '▁view',\n",
+ " '▁south',\n",
+ " '▁mod',\n",
+ " 'elect',\n",
+ " '▁usual',\n",
+ " 'evi',\n",
+ " 'peration',\n",
+ " '▁engl',\n",
+ " 'train',\n",
+ " '▁mind',\n",
+ " 'acle',\n",
+ " 'riv',\n",
+ " 'side',\n",
+ " 'aris',\n",
+ " '▁event',\n",
+ " '▁lei',\n",
+ " '▁favour',\n",
+ " '▁problem',\n",
+ " '▁large',\n",
+ " 'ordina',\n",
+ " '▁feel',\n",
+ " 'search',\n",
+ " '▁wife',\n",
+ " '▁remain',\n",
+ " 'ready',\n",
+ " 'stone',\n",
+ " '▁cla',\n",
+ " '▁stand',\n",
+ " '▁big',\n",
+ " '▁bus',\n",
+ " 'ox',\n",
+ " '▁four',\n",
+ " 'port',\n",
+ " '▁main',\n",
+ " '▁nor',\n",
+ " '▁va',\n",
+ " 'mann',\n",
+ " '▁pot',\n",
+ " 'rry',\n",
+ " '▁fan',\n",
+ " '▁fig',\n",
+ " '▁attempt',\n",
+ " '▁follow',\n",
+ " '▁particular',\n",
+ " '▁church',\n",
+ " '▁wonder',\n",
+ " '▁doctor',\n",
+ " '▁control',\n",
+ " '▁half',\n",
+ " '▁politic',\n",
+ " 'ciple',\n",
+ " '▁trans',\n",
+ " '▁extra',\n",
+ " '▁start',\n",
+ " '▁death',\n",
+ " '▁tell',\n",
+ " '▁eye',\n",
+ " '▁own',\n",
+ " '▁allow',\n",
+ " '▁offer',\n",
+ " '▁quit',\n",
+ " '▁easi',\n",
+ " 'ular',\n",
+ " '▁che',\n",
+ " '▁cri',\n",
+ " 'bit',\n",
+ " '▁bar',\n",
+ " 'ption',\n",
+ " 'rate',\n",
+ " '▁system',\n",
+ " 'crochet',\n",
+ " '▁certain',\n",
+ " '▁taste',\n",
+ " '▁method',\n",
+ " '▁course',\n",
+ " '▁leader',\n",
+ " 'million',\n",
+ " '▁19',\n",
+ " '▁fu',\n",
+ " '▁help',\n",
+ " 'lady',\n",
+ " 'super',\n",
+ " 'date',\n",
+ " '▁fact',\n",
+ " '▁amo',\n",
+ " '▁sho',\n",
+ " 'nson',\n",
+ " 'lin',\n",
+ " '2',\n",
+ " '▁advi',\n",
+ " 'mitt',\n",
+ " '▁line',\n",
+ " 'effect',\n",
+ " '▁school',\n",
+ " '▁doubt',\n",
+ " '▁consult',\n",
+ " '▁answer',\n",
+ " 'special',\n",
+ " '▁burde',\n",
+ " '▁room',\n",
+ " '▁probabl',\n",
+ " '▁delight',\n",
+ " '▁bank',\n",
+ " '▁six',\n",
+ " '▁brown',\n",
+ " '▁studie',\n",
+ " '▁human',\n",
+ " '▁committee',\n",
+ " 'cial',\n",
+ " '▁creat',\n",
+ " '▁distin',\n",
+ " '▁cut',\n",
+ " '▁mother',\n",
+ " 'morr',\n",
+ " 'ground',\n",
+ " '▁free',\n",
+ " '▁five',\n",
+ " '▁arm',\n",
+ " '▁pol',\n",
+ " 'done',\n",
+ " '▁late',\n",
+ " '▁basi',\n",
+ " 'nato',\n",
+ " '9',\n",
+ " '▁econom',\n",
+ " '▁opposit',\n",
+ " '▁public',\n",
+ " '▁chief',\n",
+ " '▁become',\n",
+ " '▁record',\n",
+ " '▁peace',\n",
+ " '▁consider',\n",
+ " '▁understand',\n",
+ " '▁numer',\n",
+ " '▁slight',\n",
+ " '▁turn',\n",
+ " '▁duke',\n",
+ " '▁homo',\n",
+ " '▁grow',\n",
+ " '▁grant',\n",
+ " '▁office',\n",
+ " 'msel',\n",
+ " 'ainful',\n",
+ " '▁famil',\n",
+ " '▁ob',\n",
+ " '▁advise',\n",
+ " '▁cr',\n",
+ " '▁love',\n",
+ " 'ches',\n",
+ " '▁tea',\n",
+ " 'tist',\n",
+ " 'ivate',\n",
+ " '▁america',\n",
+ " 'aff',\n",
+ " 'author',\n",
+ " '▁gaitskell',\n",
+ " '▁industry',\n",
+ " '▁john',\n",
+ " '▁mistake',\n",
+ " '▁pleasant',\n",
+ " '▁temperature',\n",
+ " '▁difficult',\n",
+ " '▁brief',\n",
+ " '▁defence',\n",
+ " '▁gentle',\n",
+ " '▁piece',\n",
+ " '▁argu',\n",
+ " '▁irrita',\n",
+ " '▁berlin',\n",
+ " '▁father',\n",
+ " '▁cult',\n",
+ " '▁despa',\n",
+ " '▁market',\n",
+ " '▁white',\n",
+ " '▁negotiation',\n",
+ " '▁energ',\n",
+ " '▁fear',\n",
+ " '▁english',\n",
+ " '▁tele',\n",
+ " '▁interest',\n",
+ " 'same',\n",
+ " 'body',\n",
+ " '▁chap',\n",
+ " '▁table',\n",
+ " '▁mass',\n",
+ " '▁press',\n",
+ " '▁spi',\n",
+ " 'lay',\n",
+ " 'hall',\n",
+ " 'most',\n",
+ " '▁journ',\n",
+ " '▁diploma',\n",
+ " '▁expect',\n",
+ " '▁1960',\n",
+ " '▁join',\n",
+ " '▁provide',\n",
+ " '▁russell',\n",
+ " '▁speak',\n",
+ " '▁decide',\n",
+ " '▁complete',\n",
+ " '▁senator',\n",
+ " '▁tax',\n",
+ " '▁earth',\n",
+ " '▁colour',\n",
+ " 'armament',\n",
+ " '▁sevent',\n",
+ " '▁learn',\n",
+ " '▁honey',\n",
+ " '▁militar',\n",
+ " '▁claim',\n",
+ " '▁chamb',\n",
+ " '▁master',\n",
+ " '▁peel',\n",
+ " '▁firm',\n",
+ " 'eason',\n",
+ " '▁trade',\n",
+ " '▁washing',\n",
+ " '▁statement',\n",
+ " 'range',\n",
+ " '▁rol',\n",
+ " 'guis',\n",
+ " 'hei',\n",
+ " 'ago',\n",
+ " '▁pla',\n",
+ " '▁cam',\n",
+ " 'eopa',\n",
+ " '▁africa',\n",
+ " '▁convention',\n",
+ " '▁council',\n",
+ " '▁decision',\n",
+ " '▁experience',\n",
+ " '▁increasing',\n",
+ " '▁majorit',\n",
+ " '▁suffer',\n",
+ " '▁suggest',\n",
+ " '▁position',\n",
+ " '▁resource',\n",
+ " '▁soldier',\n",
+ " 'temper',\n",
+ " '▁general',\n",
+ " '▁financi',\n",
+ " '▁express',\n",
+ " '▁short',\n",
+ " '▁compan',\n",
+ " 'field',\n",
+ " '▁think',\n",
+ " 'cross',\n",
+ " '▁odd',\n",
+ " '▁1830',\n",
+ " '▁reason',\n",
+ " '▁hard',\n",
+ " '▁fright',\n",
+ " '▁final',\n",
+ " '▁thread',\n",
+ " 'heart',\n",
+ " '▁nea',\n",
+ " '▁assu',\n",
+ " '▁step',\n",
+ " '▁countr',\n",
+ " 'did',\n",
+ " '▁infa',\n",
+ " 'struct',\n",
+ " '▁question',\n",
+ " 'advantage',\n",
+ " '▁atmospher',\n",
+ " '▁screw',\n",
+ " '▁square',\n",
+ " '▁welensky',\n",
+ " '▁cinema',\n",
+ " '▁weapon',\n",
+ " '▁importan',\n",
+ " '▁apart',\n",
+ " '▁prepar',\n",
+ " '▁payment',\n",
+ " '▁present',\n",
+ " '▁persever',\n",
+ " 'simpl',\n",
+ " '▁idea',\n",
+ " '▁expl',\n",
+ " '▁chance',\n",
+ " '▁fast',\n",
+ " '▁notice',\n",
+ " 'rade',\n",
+ " '▁giv',\n",
+ " '▁leave',\n",
+ " 'finger',\n",
+ " 'eppe',\n",
+ " '▁gla',\n",
+ " '▁psycholog',\n",
+ " '▁robertson',\n",
+ " '▁accept',\n",
+ " '▁improv',\n",
+ " '▁philip',\n",
+ " '▁technical',\n",
+ " '▁trouble',\n",
+ " '▁premier',\n",
+ " '▁progress',\n",
+ " 'minute',\n",
+ " '▁scot',\n",
+ " '▁couple',\n",
+ " '▁concern',\n",
+ " '▁schem',\n",
+ " '▁weaver',\n",
+ " '▁inform',\n",
+ " '▁cap',\n",
+ " 'stead',\n",
+ " '▁heav',\n",
+ " '▁propos',\n",
+ " '▁order',\n",
+ " 'ensure',\n",
+ " 'worth',\n",
+ " '▁former',\n",
+ " '▁sense',\n",
+ " '▁clu',\n",
+ " 'nder',\n",
+ " 'easing',\n",
+ " '▁possib',\n",
+ " 'ulate',\n",
+ " 'polit',\n",
+ " 'equa',\n",
+ " 'communist',\n",
+ " '▁adenauer',\n",
+ " '▁delaney',\n",
+ " '▁demonstrat',\n",
+ " '▁independen',\n",
+ " '▁welcome',\n",
+ " '▁speech',\n",
+ " '▁includ',\n",
+ " '▁oppose',\n",
+ " 'timate',\n",
+ " '▁picture',\n",
+ " '▁share',\n",
+ " '▁result',\n",
+ " '▁receiv',\n",
+ " '▁garden',\n",
+ " '▁appeal',\n",
+ " '▁chair',\n",
+ " '▁keep',\n",
+ " '▁regard',\n",
+ " '▁chines',\n",
+ " '▁sister',\n",
+ " '▁prince',\n",
+ " '▁scene',\n",
+ " '▁spend',\n",
+ " '▁congo',\n",
+ " '▁issue',\n",
+ " '▁authorit',\n",
+ " '▁memor',\n",
+ " '▁scar',\n",
+ " '▁moral',\n",
+ " '▁matter',\n",
+ " 'aught',\n",
+ " '▁wait',\n",
+ " 'street',\n",
+ " '▁concept',\n",
+ " '▁destructi',\n",
+ " 'sol',\n",
+ " 'committe',\n",
+ " 'house',\n",
+ " 'older',\n",
+ " 'ference',\n",
+ " '▁adjust',\n",
+ " '▁associat',\n",
+ " '▁campaign',\n",
+ " '▁conservative',\n",
+ " '▁immediate',\n",
+ " '▁industri',\n",
+ " '▁james',\n",
+ " '▁lucas',\n",
+ " '▁practic',\n",
+ " '▁publish',\n",
+ " '▁purpose',\n",
+ " '▁queer',\n",
+ " '▁representative',\n",
+ " '▁relief',\n",
+ " '▁detail',\n",
+ " '▁official',\n",
+ " '▁faith',\n",
+ " '▁design',\n",
+ " '▁sacr',\n",
+ " '▁household',\n",
+ " '▁variet',\n",
+ " '▁village',\n",
+ " '▁foot',\n",
+ " '▁russia',\n",
+ " '▁fifty',\n",
+ " '▁block',\n",
+ " '▁inevitab',\n",
+ " '▁prevent',\n",
+ " '▁belgi',\n",
+ " '▁type',\n",
+ " '▁cheap',\n",
+ " '▁protest',\n",
+ " '▁along',\n",
+ " '▁demand',\n",
+ " '▁earn',\n",
+ " '▁sick',\n",
+ " '▁eviden',\n",
+ " '▁someone',\n",
+ " '▁peers',\n",
+ " '▁suitabl',\n",
+ " '▁sympath',\n",
+ " 'brid',\n",
+ " '▁discr',\n",
+ " 'flu',\n",
+ " 'bomb',\n",
+ " 'rator',\n",
+ " 'coming',\n",
+ " 'famili',\n",
+ " '▁183',\n",
+ " 'employ',\n",
+ " '▁catherine',\n",
+ " '▁correspond',\n",
+ " '▁material',\n",
+ " '▁rubb',\n",
+ " '▁measure',\n",
+ " '▁sharp',\n",
+ " '▁wicki',\n",
+ " '▁marriage',\n",
+ " '▁size',\n",
+ " '▁liberal',\n",
+ " '▁level',\n",
+ " '▁surround',\n",
+ " '▁delegat',\n",
+ " 'script',\n",
+ " '▁suppl',\n",
+ " '▁reserve',\n",
+ " '▁refus',\n",
+ " '▁gift',\n",
+ " '▁promis',\n",
+ " '▁safe',\n",
+ " '▁exist',\n",
+ " '▁alto',\n",
+ " '▁wide',\n",
+ " '▁arrang',\n",
+ " '▁relations',\n",
+ " 'shab',\n",
+ " '▁intelligen',\n",
+ " 'emphasi',\n",
+ " '▁advertisement',\n",
+ " '▁beautiful',\n",
+ " '+',\n",
+ " '?',\n",
+ " 'q',\n",
+ " 'j',\n",
+ " ...]"
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "processor.tokens"
+ ]
+ },
+ {
+ "cell_type": "code",
"execution_count": 8,
"id": "fb0afccf",
"metadata": {},
@@ -262,6 +1361,1027 @@
},
{
"cell_type": "code",
+ "execution_count": 14,
+ "id": "fb600959",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'!': ['▁', '!'],\n",
+ " '\"': ['▁', '\"'],\n",
+ " '&': ['▁', '&'],\n",
+ " \"'\": [\"▁'\"],\n",
+ " \"'30s\": [\"▁'\", '3', '0', 's'],\n",
+ " \"'61\": [\"▁'\", '6', '1'],\n",
+ " \"'d\": [\"▁'\", 'd'],\n",
+ " \"'ll\": [\"▁'\", 'll'],\n",
+ " \"'m\": [\"▁'\", 'm'],\n",
+ " \"'re\": [\"▁'\", 're'],\n",
+ " \"'s\": [\"▁'\", 's'],\n",
+ " \"'ve\": [\"▁'\", 've'],\n",
+ " '(': ['▁', '('],\n",
+ " ')': ['▁', ')'],\n",
+ " '+2.8': ['▁', '+', '2', '.', '8'],\n",
+ " '+3.6': ['▁', '+', '3', '.', '6'],\n",
+ " ',': ['▁', ','],\n",
+ " '-': ['▁-'],\n",
+ " '-2.6': ['▁-', '2', '.', '6'],\n",
+ " '-5.4': ['▁-', '5', '.', '4'],\n",
+ " '.': ['▁.'],\n",
+ " '...': ['▁.', '..'],\n",
+ " '0m': ['▁', '0', 'm'],\n",
+ " '1': ['▁1'],\n",
+ " '1,157': ['▁1', ',', '1', '5', '7'],\n",
+ " '1.8': ['▁1', '.', '8'],\n",
+ " '1/2': ['▁1', '/', '2'],\n",
+ " '1/2-in.-long': ['▁1', '/', '2', '-', 'in', '.', '-', 'lo', 'ng'],\n",
+ " '1/4': ['▁1', '/', '4'],\n",
+ " '10': ['▁1', '0'],\n",
+ " '10,000': ['▁1', '0', ',000'],\n",
+ " '100': ['▁1', '0', '0'],\n",
+ " '100,000,000': ['▁1', '0', '0', ',000', ',000'],\n",
+ " '11': ['▁1', '1'],\n",
+ " '12': ['▁1', '2'],\n",
+ " '125': ['▁1', '2', '5'],\n",
+ " '13': ['▁1', '3'],\n",
+ " '13,000': ['▁1', '3', ',000'],\n",
+ " '14': ['▁1', '4'],\n",
+ " '15': ['▁1', '5'],\n",
+ " '15,000,000': ['▁1', '5', ',000', ',000'],\n",
+ " '15-17': ['▁1', '5', '-', '1', '7'],\n",
+ " '15-year-olds': ['▁1', '5', '-', 'y', 'e', 'ar', '-', 'o', 'ld', 's'],\n",
+ " '150,000,000': ['▁1', '5', '0', ',000', ',000'],\n",
+ " '16': ['▁1', '6'],\n",
+ " '160': ['▁1', '6', '0'],\n",
+ " '163,000,000': ['▁1', '6', '3', ',000', ',000'],\n",
+ " '167': ['▁1', '6', '7'],\n",
+ " '17': ['▁1', '7'],\n",
+ " '18.1': ['▁1', '8', '.', '1'],\n",
+ " '1830': ['▁1830'],\n",
+ " \"1830's\": ['▁1830', \"'\", 's'],\n",
+ " '1834': ['▁183', '4'],\n",
+ " '1897': ['▁1', '8', '9', '7'],\n",
+ " '19': ['▁19'],\n",
+ " '19.5': ['▁19', '.', '5'],\n",
+ " '1910': ['▁19', '1', '0'],\n",
+ " '1913': ['▁19', '1', '3'],\n",
+ " '1914': ['▁19', '1', '4'],\n",
+ " '1914-18': ['▁19', '1', '4', '-', '1', '8'],\n",
+ " '1920': ['▁19', '2', '0'],\n",
+ " '1931': ['▁19', '3', '1'],\n",
+ " '1932': ['▁19', '3', '2'],\n",
+ " '1934': ['▁19', '3', '4'],\n",
+ " '1939': ['▁19', '3', '9'],\n",
+ " '1943': ['▁19', '4', '3'],\n",
+ " '1944': ['▁19', '4', '4'],\n",
+ " '1950': ['▁195', '0'],\n",
+ " '1951': ['▁195', '1'],\n",
+ " '1954': ['▁195', '4'],\n",
+ " '1956': ['▁195', '6'],\n",
+ " '1957': ['▁195', '7'],\n",
+ " '1958': ['▁195', '8'],\n",
+ " '1959': ['▁195', '9'],\n",
+ " '1960': ['▁1960'],\n",
+ " '1960s': ['▁1960', 's'],\n",
+ " '1961': ['▁19', '6', '1'],\n",
+ " '1963': ['▁19', '6', '3'],\n",
+ " '19th': ['▁19', 'th'],\n",
+ " '1superceded': ['▁1', 'super', 'c', 'ed', 'ed'],\n",
+ " \"1tho'\": ['▁1', 'tho', \"'\"],\n",
+ " '2': ['▁2'],\n",
+ " '2,000': ['▁2', ',000'],\n",
+ " '2,415,000,000': ['▁2', ',', '4', '1', '5', ',000', ',000'],\n",
+ " '20': ['▁2', '0'],\n",
+ " '20-month-old': ['▁2', '0', '-', 'm', 'on', 'th', '-', 'o', 'ld'],\n",
+ " '200': ['▁2', '0', '0'],\n",
+ " '20th-century': ['▁2', '0', 'th', '-', 'c', 'ent', 'ur', 'y'],\n",
+ " '210million': ['▁2', '1', '0', 'million'],\n",
+ " '22': ['▁2', '2'],\n",
+ " '23.1': ['▁2', '3', '.', '1'],\n",
+ " '24-strong': ['▁2', '4', '-', 'st', 'ro', 'ng'],\n",
+ " '25': ['▁2', '5'],\n",
+ " '27': ['▁2', '7'],\n",
+ " '28.5': ['▁2', '8', '.', '5'],\n",
+ " '280,000': ['▁2', '8', '0', ',000'],\n",
+ " '287': ['▁2', '8', '7'],\n",
+ " '288': ['▁2', '8', '8'],\n",
+ " '2bhoys': ['▁2', 'b', 'h', 'o', 'y', 's'],\n",
+ " '2ole': ['▁2', 'o', 'le'],\n",
+ " '2skint': ['▁2', 's', 'k', 'in', 't'],\n",
+ " '3': ['▁3'],\n",
+ " '3,000': ['▁3', ',000'],\n",
+ " '3.6': ['▁3', '.', '6'],\n",
+ " '3/0': ['▁3', '/', '0'],\n",
+ " '30': ['▁3', '0'],\n",
+ " '30-minute': ['▁3', '0', '-', 'minute'],\n",
+ " '300,000': ['▁3', '0', '0', ',000'],\n",
+ " '32': ['▁3', '2'],\n",
+ " '34': ['▁3', '4'],\n",
+ " '35': ['▁3', '5'],\n",
+ " '357million': ['▁3', '5', '7', 'million'],\n",
+ " '36': ['▁3', '6'],\n",
+ " '37,000,000': ['▁3', '7', ',000', ',000'],\n",
+ " '37.2': ['▁3', '7', '.', '2'],\n",
+ " '38': ['▁3', '8'],\n",
+ " '4': ['▁', '4'],\n",
+ " '4.8': ['▁', '4', '.', '8'],\n",
+ " '40': ['▁', '4', '0'],\n",
+ " '400': ['▁', '4', '0', '0'],\n",
+ " '400,000': ['▁', '4', '0', '0', ',000'],\n",
+ " '43': ['▁', '4', '3'],\n",
+ " '450': ['▁', '4', '5', '0'],\n",
+ " '5': ['▁', '5'],\n",
+ " '5,000': ['▁', '5', ',000'],\n",
+ " '5.30': ['▁', '5', '.', '3', '0'],\n",
+ " '5/8': ['▁', '5', '/', '8'],\n",
+ " '50,000': ['▁', '5', '0', ',000'],\n",
+ " '500': ['▁', '5', '0', '0'],\n",
+ " '550,000': ['▁', '5', '5', '0', ',000'],\n",
+ " '58': ['▁', '5', '8'],\n",
+ " '6': ['▁', '6'],\n",
+ " '6,000': ['▁', '6', ',000'],\n",
+ " '60': ['▁', '6', '0'],\n",
+ " '600': ['▁', '6', '0', '0'],\n",
+ " '61-year-old': ['▁', '6', '1', '-', 'y', 'e', 'ar', '-', 'o', 'ld'],\n",
+ " '6al': ['▁', '6', 'al'],\n",
+ " '6tic': ['▁', '6', 'tic'],\n",
+ " '7.42': ['▁', '7', '.', '4', '2'],\n",
+ " '70': ['▁', '7', '0'],\n",
+ " '70,000,000': ['▁', '7', '0', ',000', ',000'],\n",
+ " '707': ['▁', '7', '0', '7'],\n",
+ " '750': ['▁', '7', '5', '0'],\n",
+ " '8': ['▁', '8'],\n",
+ " '80': ['▁', '8', '0'],\n",
+ " '800': ['▁', '8', '0', '0'],\n",
+ " '800,000': ['▁', '8', '0', '0', ',000'],\n",
+ " '86': ['▁', '8', '6'],\n",
+ " '88': ['▁', '8', '8'],\n",
+ " '88-year-old': ['▁', '8', '8', '-', 'y', 'e', 'ar', '-', 'o', 'ld'],\n",
+ " '89': ['▁', '8', '9'],\n",
+ " '89-year-old': ['▁', '8', '9', '-', 'y', 'e', 'ar', '-', 'o', 'ld'],\n",
+ " '90-day': ['▁', '9', '0', '-', 'day'],\n",
+ " '90-minute': ['▁', '9', '0', '-', 'minute'],\n",
+ " '91': ['▁', '9', '1'],\n",
+ " '950': ['▁', '9', '5', '0'],\n",
+ " '97.5': ['▁', '9', '7', '.', '5'],\n",
+ " ':': ['▁', ':'],\n",
+ " ';': ['▁', ';'],\n",
+ " '?': ['▁', '?'],\n",
+ " 'a': ['▁a'],\n",
+ " 'abandon': ['▁a', 'b', 'a', 'ndon'],\n",
+ " 'abandoned': ['▁a', 'b', 'a', 'ndon', 'ed'],\n",
+ " 'abashed': ['▁a', 'b', 'ash', 'ed'],\n",
+ " 'ability': ['▁a', 'b', 'ilit', 'y'],\n",
+ " 'able': ['▁', 'able'],\n",
+ " 'able-bodied': ['▁', 'able', '-', 'bo', 'd', 'i', 'ed'],\n",
+ " 'abolish': ['▁a', 'bo', 'l', 'ish'],\n",
+ " 'abolished': ['▁a', 'bo', 'l', 'ish', 'ed'],\n",
+ " 'abolition': ['▁a', 'bo', 'li', 'tion'],\n",
+ " 'abou': ['▁a', 'bo', 'u'],\n",
+ " 'about': ['▁about'],\n",
+ " 'about-': ['▁about', '-'],\n",
+ " 'above': ['▁a', 'bo', 've'],\n",
+ " 'abreast': ['▁a', 'b', 're', 'a', 'st'],\n",
+ " 'abroad': ['▁a', 'b', 'ro', 'ad'],\n",
+ " 'absence': ['▁a', 'b', 's', 'ence'],\n",
+ " 'absent': ['▁a', 'b', 's', 'ent'],\n",
+ " 'absolutely': ['▁a', 'b', 'solut', 'e', 'ly'],\n",
+ " 'abstraction': ['▁a', 'b', 'st', 'ra', 'c', 'tion'],\n",
+ " 'abundance': ['▁a', 'b', 'un', 'd', 'ance'],\n",
+ " 'ac-': ['▁a', 'c', '-'],\n",
+ " 'academic': ['▁a', 'c', 'ad', 'e', 'm', 'ic'],\n",
+ " 'accent': ['▁a', 'c', 'c', 'ent'],\n",
+ " 'accents': ['▁a', 'c', 'c', 'ent', 's'],\n",
+ " 'accept': ['▁accept'],\n",
+ " 'acceptable': ['▁accept', 'able'],\n",
+ " 'accepted': ['▁accept', 'ed'],\n",
+ " 'accepting': ['▁accept', 'ing'],\n",
+ " 'accessories': ['▁a', 'c', 'ce', 's', 's', 'or', 'ies'],\n",
+ " 'accident': ['▁a', 'c', 'c', 'i', 'd', 'ent'],\n",
+ " 'accidental': ['▁a', 'c', 'c', 'i', 'd', 'ent', 'al'],\n",
+ " 'accommodate': ['▁a', 'c', 'c', 'o', 'm', 'm', 'o', 'date'],\n",
+ " 'accommodation': ['▁a', 'c', 'c', 'o', 'm', 'm', 'o', 'd', 'ation'],\n",
+ " 'accompanied': ['▁a', 'c', 'c', 'o', 'm', 'p', 'an', 'i', 'ed'],\n",
+ " 'accompanist': ['▁a', 'c', 'c', 'o', 'm', 'p', 'an', 'is', 't'],\n",
+ " 'accompany': ['▁a', 'c', 'c', 'o', 'm', 'p', 'an', 'y'],\n",
+ " 'accomplished': ['▁a', 'c', 'c', 'o', 'm', 'p', 'l', 'ish', 'ed'],\n",
+ " 'accomplishments': ['▁a', 'c', 'c', 'o', 'm', 'p', 'l', 'ish', 'ment', 's'],\n",
+ " 'according': ['▁a', 'c', 'c', 'or', 'd', 'ing'],\n",
+ " 'account': ['▁a', 'c', 'c', 'o', 'un', 't'],\n",
+ " 'accountancy': ['▁a', 'c', 'c', 'o', 'un', 't', 'an', 'c', 'y'],\n",
+ " 'accra': ['▁a', 'c', 'c', 'ra'],\n",
+ " \"accra's\": ['▁a', 'c', 'c', 'ra', \"'\", 's'],\n",
+ " 'accuracy': ['▁a', 'c', 'cu', 'ra', 'c', 'y'],\n",
+ " 'accurate': ['▁a', 'c', 'cu', 'rate'],\n",
+ " 'accurately': ['▁a', 'c', 'cu', 'rate', 'ly'],\n",
+ " 'accused': ['▁a', 'c', 'cu', 's', 'ed'],\n",
+ " 'achieved': ['▁a', 'ch', 'ie', 'v', 'ed'],\n",
+ " 'achievement': ['▁a', 'ch', 'ie', 've', 'ment'],\n",
+ " 'acquaintance': ['▁a', 'c', 'q', 'u', 'ain', 't', 'ance'],\n",
+ " 'acquaintances': ['▁a', 'c', 'q', 'u', 'ain', 't', 'ance', 's'],\n",
+ " 'acres': ['▁a', 'c', 're', 's'],\n",
+ " 'across': ['▁a', 'cross'],\n",
+ " 'act': ['▁act'],\n",
+ " 'acting': ['▁act', 'ing'],\n",
+ " 'action': ['▁action'],\n",
+ " 'actions': ['▁action', 's'],\n",
+ " 'active': ['▁act', 'ive'],\n",
+ " 'activists': ['▁act', 'i', 'v', 'is', 't', 's'],\n",
+ " 'activities': ['▁act', 'i', 'v', 'it', 'ies'],\n",
+ " 'activity': ['▁act', 'i', 'v', 'it', 'y'],\n",
+ " 'acton': ['▁act', 'on'],\n",
+ " 'actor': ['▁act', 'or'],\n",
+ " 'actress': ['▁act', 'ress'],\n",
+ " 'acts': ['▁act', 's'],\n",
+ " 'actual': ['▁act', 'ual'],\n",
+ " 'actually': ['▁act', 'ual', 'ly'],\n",
+ " 'adamafio': ['▁a', 'd', 'a', 'm', 'a', 'f', 'i', 'o'],\n",
+ " 'adaptation': ['▁a', 'd', 'a', 'p', 't', 'ation'],\n",
+ " 'adapted': ['▁a', 'd', 'a', 'p', 't', 'ed'],\n",
+ " 'adapting': ['▁a', 'd', 'a', 'p', 't', 'ing'],\n",
+ " 'add': ['▁add'],\n",
+ " 'added': ['▁add', 'ed'],\n",
+ " 'adding': ['▁add', 'ing'],\n",
+ " 'addition': ['▁add', 'it', 'ion'],\n",
+ " 'additions': ['▁add', 'it', 'ion', 's'],\n",
+ " 'address': ['▁add', 'ress'],\n",
+ " 'addressed': ['▁add', 'ress', 'ed'],\n",
+ " 'addresses': ['▁add', 'ress', 'es'],\n",
+ " 'addressing': ['▁add', 'ress', 'ing'],\n",
+ " 'adenauer': ['▁adenauer'],\n",
+ " \"adenauer's\": ['▁adenauer', \"'\", 's'],\n",
+ " 'adequate': ['▁a', 'd', 'equa', 'te'],\n",
+ " 'adhem': ['▁a', 'd', 'h', 'e', 'm'],\n",
+ " 'adjust': ['▁adjust'],\n",
+ " 'adjustment': ['▁adjust', 'ment'],\n",
+ " 'administration': ['▁a', 'd', 'm', 'in', 'is', 't', 'r', 'ation'],\n",
+ " \"administration's\": ['▁a', 'd', 'm', 'in', 'is', 't', 'r', 'ation', \"'\", 's'],\n",
+ " 'administrative': ['▁a', 'd', 'm', 'in', 'is', 't', 'r', 'ative'],\n",
+ " 'admiralty': ['▁a', 'd', 'm', 'ir', 'al', 'ty'],\n",
+ " 'admire': ['▁a', 'd', 'm', 'ir', 'e'],\n",
+ " 'admit': ['▁a', 'd', 'm', 'it'],\n",
+ " 'admitted': ['▁a', 'd', 'mitt', 'ed'],\n",
+ " 'admitting': ['▁a', 'd', 'mitt', 'ing'],\n",
+ " 'adopted': ['▁a', 'd', 'op', 't', 'ed'],\n",
+ " 'adoption': ['▁a', 'd', 'o', 'ption'],\n",
+ " 'adult': ['▁a', 'd', 'ul', 't'],\n",
+ " 'advance': ['▁a', 'd', 'v', 'ance'],\n",
+ " 'advanced': ['▁a', 'd', 'v', 'ance', 'd'],\n",
+ " 'advancing': ['▁a', 'd', 'v', 'an', 'c', 'ing'],\n",
+ " 'advantage': ['▁', 'advantage'],\n",
+ " 'advantages': ['▁', 'advantage', 's'],\n",
+ " 'advertisement': ['▁advertisement'],\n",
+ " 'advertisements': ['▁advertisement', 's'],\n",
+ " 'advice': ['▁advi', 'ce'],\n",
+ " 'advisability': ['▁advi', 's', 'a', 'b', 'ilit', 'y'],\n",
+ " 'advise': ['▁advise'],\n",
+ " 'advised': ['▁advise', 'd'],\n",
+ " 'advisers': ['▁advise', 'r', 's'],\n",
+ " 'advocate': ['▁a', 'd', 'v', 'o', 'c', 'at', 'e'],\n",
+ " 'af-': ['▁a', 'f', '-'],\n",
+ " 'affairs': ['▁', 'aff', 'a', 'ir', 's'],\n",
+ " 'affected': ['▁', 'aff', 'e', 'ct', 'ed'],\n",
+ " 'affection': ['▁', 'aff', 'e', 'c', 'tion'],\n",
+ " 'affilia-': ['▁', 'aff', 'il', 'i', 'a', '-'],\n",
+ " 'affiliations': ['▁', 'aff', 'il', 'i', 'ation', 's'],\n",
+ " 'affluence': ['▁a', 'f', 'flu', 'ence'],\n",
+ " 'affluent': ['▁a', 'f', 'flu', 'ent'],\n",
+ " 'afraid': ['▁a', 'f', 'r', 'aid'],\n",
+ " 'africa': ['▁africa'],\n",
+ " \"africa's\": ['▁africa', \"'\", 's'],\n",
+ " 'african': ['▁african'],\n",
+ " 'africans': ['▁african', 's'],\n",
+ " 'after': ['▁after'],\n",
+ " 'afternoon': ['▁after', 'n', 'o', 'on'],\n",
+ " 'afterwards': ['▁after', 'ward', 's'],\n",
+ " 'again': ['▁again'],\n",
+ " 'against': ['▁again', 'st'],\n",
+ " 'age': ['▁', 'age'],\n",
+ " 'age-structure': ['▁', 'age', '-', 'struct', 'ure'],\n",
+ " 'aged': ['▁a', 'g', 'ed'],\n",
+ " 'ageing': ['▁', 'age', 'ing'],\n",
+ " 'agent': ['▁a', 'g', 'ent'],\n",
+ " 'agents': ['▁a', 'g', 'ent', 's'],\n",
+ " 'ages': ['▁', 'age', 's'],\n",
+ " 'agitation': ['▁a', 'g', 'it', 'ation'],\n",
+ " 'ago': ['▁', 'ago'],\n",
+ " 'agree': ['▁agree'],\n",
+ " 'agreed': ['▁agree', 'd'],\n",
+ " 'agreement': ['▁agree', 'ment'],\n",
+ " 'agreements': ['▁agree', 'ment', 's'],\n",
+ " 'agriculture': ['▁a', 'g', 'ri', 'c', 'ul', 'ture'],\n",
+ " 'ahead': ['▁a', 'h', 'e', 'ad'],\n",
+ " 'aid': ['▁', 'aid'],\n",
+ " 'aide': ['▁', 'aid', 'e'],\n",
+ " 'aided': ['▁', 'aid', 'ed'],\n",
+ " 'aides': ['▁', 'aid', 'es'],\n",
+ " 'aim': ['▁a', 'i', 'm'],\n",
+ " 'aimed': ['▁a', 'i', 'm', 'ed'],\n",
+ " 'aiming': ['▁a', 'i', 'm', 'ing'],\n",
+ " 'air': ['▁a', 'ir'],\n",
+ " 'aircraft': ['▁a', 'ir', 'c', 'ra', 'f', 't'],\n",
+ " 'aired': ['▁a', 'ir', 'ed'],\n",
+ " \"airliner's\": ['▁a', 'ir', 'lin', 'er', \"'\", 's'],\n",
+ " 'airmen': ['▁a', 'ir', 'm', 'en'],\n",
+ " 'airport': ['▁a', 'ir', 'port'],\n",
+ " 'akin': ['▁', 'ak', 'in'],\n",
+ " \"aladdin's\": ['▁', 'al', 'ad', 'd', 'in', \"'\", 's'],\n",
+ " 'alan': ['▁', 'al', 'an'],\n",
+ " 'alarmed': ['▁', 'al', 'ar', 'm', 'ed'],\n",
+ " 'alas': ['▁', 'al', 'a', 's'],\n",
+ " 'alcoholic': ['▁', 'al', 'c', 'o', 'h', 'o', 'l', 'ic'],\n",
+ " 'algeria': ['▁', 'al', 'ge', 'ri', 'a'],\n",
+ " 'alike': ['▁a', 'li', 'ke'],\n",
+ " 'alive': ['▁', 'al', 'ive'],\n",
+ " 'all': ['▁all'],\n",
+ " 'all-regular': ['▁all', '-', 're', 'g', 'ular'],\n",
+ " 'alleged': ['▁all', 'e', 'g', 'ed'],\n",
+ " 'allen': ['▁all', 'en'],\n",
+ " 'alleviation': ['▁all', 'evi', 'ation'],\n",
+ " 'alley': ['▁all', 'e', 'y'],\n",
+ " 'alliance': ['▁all', 'i', 'ance'],\n",
+ " 'alliances': ['▁all', 'i', 'ance', 's'],\n",
+ " 'allied': ['▁all', 'i', 'ed'],\n",
+ " 'allies': ['▁all', 'ies'],\n",
+ " 'allow': ['▁allow'],\n",
+ " 'allowance': ['▁allow', 'ance'],\n",
+ " 'allowances': ['▁allow', 'ance', 's'],\n",
+ " 'allowed': ['▁allow', 'ed'],\n",
+ " 'allowing': ['▁allow', 'ing'],\n",
+ " 'ally': ['▁', 'ally'],\n",
+ " 'almost': ['▁', 'al', 'most'],\n",
+ " 'alone': ['▁a', 'lo', 'ne'],\n",
+ " 'along': ['▁along'],\n",
+ " 'alongside': ['▁along', 'side'],\n",
+ " 'already': ['▁', 'al', 'ready'],\n",
+ " 'also': ['▁a', 'lso'],\n",
+ " 'alter': ['▁', 'al', 'ter'],\n",
+ " 'alternative': ['▁', 'al', 'ter', 'n', 'ative'],\n",
+ " 'alternatively': ['▁', 'al', 'ter', 'n', 'ative', 'ly'],\n",
+ " 'alternatives': ['▁', 'al', 'ter', 'n', 'ative', 's'],\n",
+ " 'although': ['▁', 'al', 'th', 'ough'],\n",
+ " 'altogether': ['▁alto', 'ge', 'ther'],\n",
+ " 'altos': ['▁alto', 's'],\n",
+ " 'always': ['▁a', 'lway', 's'],\n",
+ " 'am': ['▁a', 'm'],\n",
+ " 'amateur': ['▁a', 'm', 'at', 'e', 'ur'],\n",
+ " 'amazed': ['▁a', 'm', 'a', 'z', 'ed'],\n",
+ " 'amazing': ['▁a', 'm', 'a', 'z', 'ing'],\n",
+ " 'ambassador': ['▁a', 'm', 'b', 'a', 's', 's', 'ad', 'or'],\n",
+ " 'amber': ['▁a', 'mber'],\n",
+ " 'ambition': ['▁a', 'm', 'bit', 'ion'],\n",
+ " 'ambitious': ['▁a', 'm', 'bit', 'i', 'ous'],\n",
+ " 'ambulance': ['▁a', 'm', 'b', 'ul', 'ance'],\n",
+ " 'ambulances': ['▁a', 'm', 'b', 'ul', 'ance', 's'],\n",
+ " 'america': ['▁america'],\n",
+ " \"america's\": ['▁america', \"'\", 's'],\n",
+ " 'american': ['▁american'],\n",
+ " 'american-born': ['▁american', '-', 'b', 'orn'],\n",
+ " 'americans': ['▁american', 's'],\n",
+ " 'ammunition': ['▁a', 'm', 'm', 'un', 'it', 'ion'],\n",
+ " 'among': ['▁amo', 'ng'],\n",
+ " 'amount': ['▁amo', 'un', 't'],\n",
+ " 'ample': ['▁a', 'm', 'ple'],\n",
+ " 'amusement': ['▁a', 'm', 'us', 'e', 'ment'],\n",
+ " 'amusing': ['▁a', 'm', 'us', 'ing'],\n",
+ " 'an': ['▁an'],\n",
+ " 'analogy': ['▁an', 'al', 'o', 'g', 'y'],\n",
+ " 'analysed': ['▁an', 'al', 'y', 's', 'ed'],\n",
+ " 'anchor': ['▁an', 'ch', 'or'],\n",
+ " 'ancient': ['▁an', 'c', 'i', 'ent'],\n",
+ " 'and': ['▁and'],\n",
+ " 'andrei': ['▁and', 're', 'i'],\n",
+ " 'andrew': ['▁and', 're', 'w'],\n",
+ " 'anecdotal': ['▁an', 'e', 'c', 'd', 'o', 't', 'al'],\n",
+ " 'angel': ['▁a', 'ng', 'el'],\n",
+ " 'angeles': ['▁an', 'ge', 'le', 's'],\n",
+ " 'angelo': ['▁an', 'ge', 'lo'],\n",
+ " 'anger': ['▁a', 'ng', 'er'],\n",
+ " 'anglais': ['▁a', 'ng', 'l', 'a', 'is'],\n",
+ " 'angle': ['▁a', 'ng', 'le'],\n",
+ " 'anglesey': ['▁anglesey'],\n",
+ " \"anglesey's\": ['▁anglesey', \"'\", 's'],\n",
+ " 'anglesey-road': ['▁anglesey', '-', 'ro', 'ad'],\n",
+ " 'angola': ['▁a', 'ng', 'o', 'l', 'a'],\n",
+ " 'angry': ['▁a', 'ng', 'ry'],\n",
+ " 'ann': ['▁an', 'n'],\n",
+ " 'anna': ['▁an', 'n', 'a'],\n",
+ " 'announced': ['▁an', 'n', 'o', 'un', 'c', 'ed'],\n",
+ " 'announcement': ['▁an', 'n', 'o', 'un', 'ce', 'ment'],\n",
+ " 'announcing': ['▁an', 'n', 'o', 'un', 'c', 'ing'],\n",
+ " 'annual': ['▁an', 'n', 'ual'],\n",
+ " 'another': ['▁an', 'other'],\n",
+ " 'answer': ['▁answer'],\n",
+ " 'answered': ['▁answer', 'ed'],\n",
+ " 'answering': ['▁answer', 'ing'],\n",
+ " 'antagonism': ['▁', 'ant', 'ago', 'n', 'ism'],\n",
+ " 'anthony': ['▁an', 'th', 'on', 'y'],\n",
+ " 'anti-apartheid': ['▁', 'ant', 'i', '-', 'a', 'p', 'ar', 't', 'hei', 'd'],\n",
+ " 'anti-bomb': ['▁', 'ant', 'i', '-', 'bomb'],\n",
+ " 'anti-german': ['▁', 'ant', 'i', '-', 'g', 'er', 'man'],\n",
+ " 'anti-nato': ['▁', 'ant', 'i', '-', 'nato'],\n",
+ " 'anti-negro': ['▁', 'ant', 'i', '-', 'ne', 'g', 'ro'],\n",
+ " 'anti-nuclear': ['▁', 'ant', 'i', '-', 'n', 'u', 'c', 'le', 'ar'],\n",
+ " 'anti-soviet': ['▁', 'ant', 'i', '-', 's', 'o', 'v', 'i', 'et'],\n",
+ " 'anti-tory': ['▁', 'ant', 'i', '-', 'tory'],\n",
+ " 'anticipation': ['▁an', 'tic', 'ip', 'ation'],\n",
+ " 'antonioni': ['▁an', 'ton', 'ion', 'i'],\n",
+ " \"antonioni's\": ['▁an', 'ton', 'ion', 'i', \"'\", 's'],\n",
+ " 'any': ['▁any'],\n",
+ " 'any-': ['▁any', '-'],\n",
+ " 'anybody': ['▁any', 'body'],\n",
+ " \"anybody's\": ['▁any', 'body', \"'\", 's'],\n",
+ " 'anyone': ['▁any', 'on', 'e'],\n",
+ " 'anything': ['▁any', 'thing'],\n",
+ " 'apart': ['▁apart'],\n",
+ " 'apartheid': ['▁apart', 'hei', 'd'],\n",
+ " 'apathetic': ['▁a', 'p', 'at', 'h', 'e', 'tic'],\n",
+ " 'apathy': ['▁a', 'p', 'at', 'h', 'y'],\n",
+ " 'apex': ['▁a', 'p', 'e', 'x'],\n",
+ " 'apocalypse': ['▁a', 'p', 'o', 'c', 'al', 'y', 'p', 'se'],\n",
+ " 'apologising': ['▁a', 'p', 'o', 'lo', 'g', 'is', 'ing'],\n",
+ " 'appalled': ['▁', 'app', 'al', 'led'],\n",
+ " 'appalling': ['▁', 'app', 'al', 'l', 'ing'],\n",
+ " 'apparatus': ['▁', 'app', 'ar', 'at', 'us'],\n",
+ " 'apparent': ['▁', 'app', 'ar', 'ent'],\n",
+ " 'apparently': ['▁', 'app', 'ar', 'ent', 'ly'],\n",
+ " 'appeal': ['▁appeal'],\n",
+ " 'appealing': ['▁appeal', 'ing'],\n",
+ " 'appeals': ['▁appeal', 's'],\n",
+ " 'appear': ['▁appear'],\n",
+ " 'appearance': ['▁appear', 'ance'],\n",
+ " 'appeared': ['▁appear', 'ed'],\n",
+ " 'appears': ['▁appear', 's'],\n",
+ " 'appeasement': ['▁', 'app', 'e', 'a', 'se', 'ment'],\n",
+ " 'applauding': ['▁', 'app', 'l', 'a', 'ud', 'ing'],\n",
+ " 'appliances': ['▁', 'app', 'li', 'ance', 's'],\n",
+ " 'application': ['▁', 'app', 'l', 'ic', 'ation'],\n",
+ " 'applications': ['▁', 'app', 'l', 'ic', 'ation', 's'],\n",
+ " 'applied': ['▁', 'app', 'li', 'ed'],\n",
+ " 'apply': ['▁', 'app', 'ly'],\n",
+ " 'appointed': ['▁', 'app', 'o', 'in', 't', 'ed'],\n",
+ " 'appointment': ['▁', 'app', 'o', 'in', 't', 'ment'],\n",
+ " 'appreciable': ['▁', 'app', 're', 'c', 'i', 'able'],\n",
+ " 'appreciably': ['▁', 'app', 're', 'c', 'i', 'abl', 'y'],\n",
+ " 'appreciated': ['▁', 'app', 're', 'c', 'i', 'at', 'ed'],\n",
+ " 'apprenticeships': ['▁', 'app', 'ren', 'tic', 'e', 'ship', 's'],\n",
+ " 'approach': ['▁', 'app', 'ro', 'a', 'ch'],\n",
+ " 'approached': ['▁', 'app', 'ro', 'a', 'ch', 'ed'],\n",
+ " 'approaches': ['▁', 'app', 'ro', 'a', 'ches'],\n",
+ " 'appropriate': ['▁', 'app', 'ro', 'p', 'ri', 'at', 'e'],\n",
+ " 'appropriated': ['▁', 'app', 'ro', 'p', 'ri', 'at', 'ed'],\n",
+ " 'approval': ['▁', 'app', 'ro', 'val'],\n",
+ " 'approximately': ['▁', 'app', 'ro', 'x', 'i', 'm', 'at', 'e', 'ly'],\n",
+ " 'april': ['▁a', 'p', 'ri', 'l'],\n",
+ " 'archbishop': ['▁', 'ar', 'ch', 'b', 'ish', 'op'],\n",
+ " 'arches': ['▁', 'ar', 'ches'],\n",
+ " 'archipelago': ['▁', 'ar', 'ch', 'ip', 'el', 'ago'],\n",
+ " 'architect': ['▁', 'ar', 'ch', 'it', 'e', 'ct'],\n",
+ " 'architecture': ['▁', 'ar', 'ch', 'it', 'e', 'c', 'ture'],\n",
+ " 'are': ['▁are'],\n",
+ " 'area': ['▁are', 'a'],\n",
+ " 'areas': ['▁are', 'a', 's'],\n",
+ " 'arguably': ['▁argu', 'abl', 'y'],\n",
+ " 'argued': ['▁argu', 'ed'],\n",
+ " 'argues': ['▁argu', 'es'],\n",
+ " 'arguing': ['▁argu', 'ing'],\n",
+ " 'argument': ['▁argu', 'ment'],\n",
+ " 'arguments': ['▁argu', 'ment', 's'],\n",
+ " 'arise': ['▁', 'aris', 'e'],\n",
+ " 'arises': ['▁', 'aris', 'es'],\n",
+ " 'arm': ['▁arm'],\n",
+ " 'armament': ['▁', 'armament'],\n",
+ " 'armaments': ['▁', 'armament', 's'],\n",
+ " 'armed': ['▁arm', 'ed'],\n",
+ " 'armoured': ['▁arm', 'our', 'ed'],\n",
+ " 'arms': ['▁arm', 's'],\n",
+ " \"arms'\": ['▁arm', 's', \"'\"],\n",
+ " 'army': ['▁arm', 'y'],\n",
+ " 'arose': ['▁', 'ar', 'ose'],\n",
+ " 'around': ['▁a', 'round'],\n",
+ " 'aroused': ['▁', 'ar', 'ous', 'ed'],\n",
+ " 'arrange': ['▁arrang', 'e'],\n",
+ " 'arranged': ['▁arrang', 'ed'],\n",
+ " 'arrangement': ['▁arrang', 'e', 'ment'],\n",
+ " 'arrangements': ['▁arrang', 'e', 'ment', 's'],\n",
+ " 'arranging': ['▁arrang', 'ing'],\n",
+ " 'arrears': ['▁', 'ar', 're', 'ar', 's'],\n",
+ " 'arrested': ['▁', 'ar', 're', 'st', 'ed'],\n",
+ " 'arrival': ['▁arriv', 'al'],\n",
+ " 'arrive': ['▁arriv', 'e'],\n",
+ " 'arrived': ['▁arriv', 'ed'],\n",
+ " 'arrives': ['▁arriv', 'es'],\n",
+ " 'arrogant': ['▁', 'ar', 'ro', 'g', 'ant'],\n",
+ " 'art': ['▁', 'ar', 't'],\n",
+ " 'arthur': ['▁', 'ar', 'th', 'ur'],\n",
+ " 'article': ['▁', 'ar', 'tic', 'le'],\n",
+ " 'articles': ['▁', 'ar', 'tic', 'le', 's'],\n",
+ " 'articulation': ['▁', 'ar', 'tic', 'ul', 'ation'],\n",
+ " 'artistic': ['▁', 'ar', 'tist', 'ic'],\n",
+ " 'artists': ['▁', 'ar', 'tist', 's'],\n",
+ " 'as': ['▁as'],\n",
+ " 'ascents': ['▁as', 'c', 'ent', 's'],\n",
+ " 'ash': ['▁', 'ash'],\n",
+ " 'ashen': ['▁', 'ash', 'en'],\n",
+ " 'ask': ['▁ask'],\n",
+ " 'asked': ['▁ask', 'ed'],\n",
+ " 'asking': ['▁ask', 'ing'],\n",
+ " 'aspect': ['▁a', 'spect'],\n",
+ " 'aspects': ['▁a', 'spect', 's'],\n",
+ " 'assault': ['▁as', 's', 'a', 'ul', 't'],\n",
+ " 'assembler': ['▁as', 'se', 'm', 'ble', 'r'],\n",
+ " 'assembly': ['▁as', 'se', 'm', 'b', 'ly'],\n",
+ " 'assess': ['▁as', 'se', 's', 's'],\n",
+ " 'assessment': ['▁as', 'se', 's', 's', 'ment'],\n",
+ " 'assistance': ['▁as', 's', 'is', 't', 'ance'],\n",
+ " 'assistant': ['▁as', 's', 'is', 't', 'ant'],\n",
+ " 'assistants': ['▁as', 's', 'is', 't', 'ant', 's'],\n",
+ " 'associate': ['▁associat', 'e'],\n",
+ " 'associated': ['▁associat', 'ed'],\n",
+ " 'associates': ['▁associat', 'es'],\n",
+ " 'association': ['▁associat', 'ion'],\n",
+ " 'assortment': ['▁as', 's', 'or', 't', 'ment'],\n",
+ " 'assumption': ['▁assu', 'm', 'ption'],\n",
+ " 'assurance': ['▁assu', 'r', 'ance'],\n",
+ " 'astute': ['▁as', 'tu', 'te'],\n",
+ " 'at': ['▁at'],\n",
+ " 'ately': ['▁at', 'e', 'ly'],\n",
+ " 'atlantic': ['▁at', 'l', 'an', 'tic'],\n",
+ " 'atmosphere': ['▁atmospher', 'e'],\n",
+ " 'atmospheric': ['▁atmospher', 'ic'],\n",
+ " 'atomic': ['▁a', 'tom', 'ic'],\n",
+ " 'atoms': ['▁a', 'tom', 's'],\n",
+ " 'attach': ['▁at', 't', 'a', 'ch'],\n",
+ " 'attached': ['▁at', 't', 'a', 'ch', 'ed'],\n",
+ " 'attack': ['▁at', 'tack'],\n",
+ " 'attacked': ['▁at', 'tack', 'ed'],\n",
+ " 'attacks': ['▁at', 'tack', 's'],\n",
+ " 'attainable': ['▁at', 't', 'ain', 'able'],\n",
+ " 'attempt': ['▁attempt'],\n",
+ " 'attempted': ['▁attempt', 'ed'],\n",
+ " 'attempting': ['▁attempt', 'ing'],\n",
+ " 'attempts': ['▁attempt', 's'],\n",
+ " 'atten-': ['▁at', 'ten', '-'],\n",
+ " 'attend': ['▁at', 't', 'end'],\n",
+ " 'attendance': ['▁at', 't', 'end', 'ance'],\n",
+ " 'attended': ['▁at', 't', 'end', 'ed'],\n",
+ " 'attending': ['▁at', 't', 'end', 'ing'],\n",
+ " 'attention': ['▁at', 'ten', 'tion'],\n",
+ " 'attitude': ['▁at', 't', 'it', 'ud', 'e'],\n",
+ " 'attitudes': ['▁at', 't', 'it', 'ud', 'es'],\n",
+ " 'attracted': ['▁at', 't', 'ra', 'ct', 'ed'],\n",
+ " 'attractive': ['▁at', 't', 'ra', 'ct', 'ive'],\n",
+ " 'aubrey': ['▁a', 'u', 'b', 're', 'y'],\n",
+ " 'audacity': ['▁a', 'ud', 'a', 'c', 'it', 'y'],\n",
+ " 'auden': ['▁a', 'ud', 'en'],\n",
+ " 'audience': ['▁a', 'ud', 'i', 'ence'],\n",
+ " 'audited': ['▁a', 'ud', 'it', 'ed'],\n",
+ " 'august': ['▁a', 'ug', 'u', 'st'],\n",
+ " 'auntie': ['▁a', 'un', 't', 'ie'],\n",
+ " 'austerity': ['▁a', 'us', 'ter', 'it', 'y'],\n",
+ " 'australia': ['▁a', 'u', 'st', 'ra', 'li', 'a'],\n",
+ " 'austria': ['▁a', 'u', 'st', 'ri', 'a'],\n",
+ " 'austrian': ['▁a', 'u', 'st', 'ri', 'an'],\n",
+ " 'authentic': ['▁a', 'u', 'th', 'en', 'tic'],\n",
+ " 'author': ['▁', 'author'],\n",
+ " 'authorised': ['▁', 'author', 'is', 'ed'],\n",
+ " 'authorities': ['▁authorit', 'ies'],\n",
+ " 'authority': ['▁authorit', 'y'],\n",
+ " 'automatically': ['▁a', 'u', 'tom', 'at', 'ic', 'ally'],\n",
+ " 'automation': ['▁a', 'u', 'tom', 'ation'],\n",
+ " 'autumn': ['▁a', 'u', 't', 'um', 'n'],\n",
+ " 'available': ['▁a', 'v', 'a', 'il', 'able'],\n",
+ " 'avenue': ['▁a', 'v', 'en', 'u', 'e'],\n",
+ " 'average': ['▁a', 'ver', 'age'],\n",
+ " 'averages': ['▁a', 'ver', 'age', 's'],\n",
+ " 'avert': ['▁a', 'ver', 't'],\n",
+ " 'aviation': ['▁a', 'v', 'i', 'ation'],\n",
+ " 'avoid': ['▁a', 'v', 'o', 'i', 'd'],\n",
+ " 'avoided': ['▁a', 'v', 'o', 'i', 'd', 'ed'],\n",
+ " 'avon': ['▁a', 'v', 'on'],\n",
+ " 'awake': ['▁a', 'w', 'ak', 'e'],\n",
+ " 'awarded': ['▁a', 'ward', 'ed'],\n",
+ " 'awards': ['▁a', 'ward', 's'],\n",
+ " 'aware': ['▁a', 'w', 'ar', 'e'],\n",
+ " 'awareness': ['▁a', 'w', 'ar', 'e', 'ness'],\n",
+ " 'away': ['▁a', 'way'],\n",
+ " 'awful': ['▁a', 'w', 'ful'],\n",
+ " 'awfully': ['▁a', 'w', 'ful', 'ly'],\n",
+ " 'b': ['▁b'],\n",
+ " 'b.': ['▁b', '.'],\n",
+ " 'b.b.c.': ['▁b', '.', 'b', '.', 'c', '.'],\n",
+ " 'babe': ['▁ba', 'b', 'e'],\n",
+ " 'babel': ['▁ba', 'b', 'el'],\n",
+ " 'bably': ['▁b', 'abl', 'y'],\n",
+ " 'baby': ['▁ba', 'by'],\n",
+ " 'back': ['▁back'],\n",
+ " 'backbone': ['▁back', 'bo', 'ne'],\n",
+ " 'backed': ['▁back', 'ed'],\n",
+ " 'backers': ['▁back', 'ers'],\n",
+ " 'background': ['▁back', 'ground'],\n",
+ " 'backing': ['▁back', 'ing'],\n",
+ " 'backstage': ['▁back', 'st', 'age'],\n",
+ " 'backward': ['▁back', 'ward'],\n",
+ " 'bad': ['▁ba', 'd'],\n",
+ " 'badly': ['▁ba', 'd', 'ly'],\n",
+ " 'baffled': ['▁b', 'aff', 'led'],\n",
+ " 'bag': ['▁ba', 'g'],\n",
+ " 'bagaya': ['▁ba', 'g', 'a', 'y', 'a'],\n",
+ " 'baker': ['▁b', 'ak', 'er'],\n",
+ " 'balance': ['▁b', 'al', 'ance'],\n",
+ " 'balance-sheet': ['▁b', 'al', 'ance', '-', 's', 'h', 'e', 'et'],\n",
+ " 'balances': ['▁b', 'al', 'ance', 's'],\n",
+ " 'bald': ['▁ba', 'ld'],\n",
+ " 'ball': ['▁ba', 'll'],\n",
+ " 'balloon': ['▁b', 'al', 'lo', 'on'],\n",
+ " 'ballyhoo': ['▁b', 'ally', 'h', 'o', 'o'],\n",
+ " 'baltic': ['▁b', 'al', 'tic'],\n",
+ " 'ban': ['▁ba', 'n'],\n",
+ " 'ban-': ['▁ba', 'n', '-'],\n",
+ " 'ban-the-': ['▁ba', 'n', '-', 'th', 'e', '-'],\n",
+ " 'bank': ['▁bank'],\n",
+ " \"bank's\": ['▁bank', \"'\", 's'],\n",
+ " 'banking': ['▁bank', 'ing'],\n",
+ " 'bankrupt': ['▁bank', 'r', 'u', 'p', 't'],\n",
+ " 'banks': ['▁bank', 's'],\n",
+ " \"banks'\": ['▁bank', 's', \"'\"],\n",
+ " 'banned': ['▁ba', 'n', 'n', 'ed'],\n",
+ " 'banzie': ['▁ba', 'n', 'z', 'ie'],\n",
+ " 'bar': ['▁bar'],\n",
+ " 'barb': ['▁bar', 'b'],\n",
+ " 'barbarously': ['▁bar', 'b', 'ar', 'ous', 'ly'],\n",
+ " 'barclay': ['▁bar', 'c', 'lay'],\n",
+ " 'bare': ['▁bar', 'e'],\n",
+ " 'bargain': ['▁bar', 'g', 'ain'],\n",
+ " 'bargaining': ['▁bar', 'g', 'ain', 'ing'],\n",
+ " 'bark': ['▁bar', 'k'],\n",
+ " 'barrier': ['▁bar', 'ri', 'er'],\n",
+ " 'barriers': ['▁bar', 'ri', 'ers'],\n",
+ " 'barry': ['▁bar', 'ry'],\n",
+ " 'base': ['▁base'],\n",
+ " 'based': ['▁base', 'd'],\n",
+ " 'bases': ['▁base', 's'],\n",
+ " 'basic': ['▁basi', 'c'],\n",
+ " 'basin': ['▁basi', 'n'],\n",
+ " 'basing': ['▁ba', 's', 'ing'],\n",
+ " 'basis': ['▁basi', 's'],\n",
+ " 'baskerville': ['▁ba', 's', 'k', 'er', 'v', 'ill', 'e'],\n",
+ " 'basses': ['▁ba', 's', 'se', 's'],\n",
+ " 'basting': ['▁ba', 'st', 'ing'],\n",
+ " 'bats': ['▁b', 'at', 's'],\n",
+ " 'batsman': ['▁b', 'at', 's', 'man'],\n",
+ " 'battalions': ['▁b', 'at', 't', 'al', 'ion', 's'],\n",
+ " 'batting': ['▁b', 'at', 't', 'ing'],\n",
+ " 'battle': ['▁ba', 'ttle'],\n",
+ " 'bavaria': ['▁ba', 'v', 'ar', 'i', 'a'],\n",
+ " 'bavarian': ['▁ba', 'v', 'ar', 'ian'],\n",
+ " 'bavarians': ['▁ba', 'v', 'ar', 'ian', 's'],\n",
+ " 'bay': ['▁ba', 'y'],\n",
+ " 'be': ['▁be'],\n",
+ " 'beach': ['▁b', 'each'],\n",
+ " 'beaches': ['▁be', 'a', 'ches'],\n",
+ " 'beacon': ['▁be', 'a', 'c', 'on'],\n",
+ " 'beaks': ['▁be', 'ak', 's'],\n",
+ " 'bean': ['▁be', 'an'],\n",
+ " 'bear': ['▁be', 'ar'],\n",
+ " 'bearer': ['▁be', 'ar', 'er'],\n",
+ " 'bears': ['▁be', 'ar', 's'],\n",
+ " 'beastly': ['▁be', 'a', 'st', 'ly'],\n",
+ " 'beaten': ['▁be', 'at', 'en'],\n",
+ " 'beautiful': ['▁beautiful'],\n",
+ " 'beautifully': ['▁beautiful', 'ly'],\n",
+ " 'beauty': ['▁be', 'a', 'u', 'ty'],\n",
+ " 'became': ['▁be', 'c', 'a', 'me'],\n",
+ " 'because': ['▁be', 'cause'],\n",
+ " 'beckoning': ['▁be', 'ck', 'on', 'ing'],\n",
+ " 'become': ['▁become'],\n",
+ " 'becomes': ['▁become', 's'],\n",
+ " 'becoming': ['▁be', 'coming'],\n",
+ " 'bedlam': ['▁be', 'd', 'l', 'a', 'm'],\n",
+ " 'beds': ['▁be', 'd', 's'],\n",
+ " 'bedspreads': ['▁be', 'd', 's', 'p', 're', 'ad', 's'],\n",
+ " 'beech': ['▁be', 'e', 'ch'],\n",
+ " 'been': ['▁be', 'en'],\n",
+ " 'before': ['▁be', 'fore'],\n",
+ " 'befriended': ['▁be', 'f', 'ri', 'end', 'ed'],\n",
+ " 'began': ['▁be', 'g', 'an'],\n",
+ " 'begin': ['▁be', 'g', 'in'],\n",
+ " 'beginner': ['▁be', 'g', 'in', 'n', 'er'],\n",
+ " 'beginning': ['▁be', 'g', 'in', 'n', 'ing'],\n",
+ " 'begun': ['▁be', 'g', 'un'],\n",
+ " 'behan': ['▁be', 'h', 'an'],\n",
+ " 'behaviour': ['▁be', 'h', 'a', 'v', 'i', 'our'],\n",
+ " 'behind': ['▁be', 'h', 'ind'],\n",
+ " 'beier': ['▁be', 'i', 'er'],\n",
+ " 'being': ['▁be', 'ing'],\n",
+ " 'belgian': ['▁belgi', 'an'],\n",
+ " 'belgium': ['▁belgi', 'um'],\n",
+ " 'belgrade': ['▁be', 'l', 'g', 'rade'],\n",
+ " 'belief': ['▁be', 'li', 'e', 'f'],\n",
+ " 'believe': ['▁believe'],\n",
+ " 'believed': ['▁believe', 'd'],\n",
+ " 'believes': ['▁believe', 's'],\n",
+ " 'bell': ['▁be', 'll'],\n",
+ " \"bell's\": ['▁be', 'll', \"'\", 's'],\n",
+ " 'belonged': ['▁be', 'lo', 'ng', 'ed'],\n",
+ " 'belongs': ['▁be', 'lo', 'ng', 's'],\n",
+ " 'below': ['▁be', 'low'],\n",
+ " 'belt': ['▁be', 'l', 't'],\n",
+ " 'ben': ['▁be', 'n'],\n",
+ " 'bench': ['▁be', 'n', 'ch'],\n",
+ " 'benches': ['▁be', 'n', 'ches'],\n",
+ " 'bend': ['▁b', 'end'],\n",
+ " 'bending': ['▁b', 'end', 'ing'],\n",
+ " 'benefits': ['▁be', 'ne', 'f', 'it', 's'],\n",
+ " 'bent': ['▁b', 'ent'],\n",
+ " 'ber': ['▁be', 'r'],\n",
+ " 'berlin': ['▁berlin'],\n",
+ " \"berlin's\": ['▁berlin', \"'\", 's'],\n",
+ " 'bernhard': ['▁b', 'ern', 'h', 'ard'],\n",
+ " 'berry': ['▁be', 'rry'],\n",
+ " 'bertrand': ['▁be', 'r', 't', 'r', 'and'],\n",
+ " 'beset': ['▁be', 's', 'et'],\n",
+ " 'beside': ['▁be', 'side'],\n",
+ " 'best': ['▁best'],\n",
+ " 'best-seller': ['▁best', '-', 'se', 'll', 'er'],\n",
+ " 'bet': ['▁be', 't'],\n",
+ " 'betjeman': ['▁be', 't', 'j', 'e', 'man'],\n",
+ " 'betrayal': ['▁be', 't', 'ra', 'y', 'al'],\n",
+ " 'betrayed': ['▁be', 't', 'ra', 'y', 'ed'],\n",
+ " 'better': ['▁better'],\n",
+ " 'better-': ['▁better', '-'],\n",
+ " \"betti's\": ['▁be', 'tt', 'i', \"'\", 's'],\n",
+ " 'between': ['▁be', 'twe', 'en'],\n",
+ " 'bevel': ['▁be', 've', 'l'],\n",
+ " 'bevelled': ['▁be', 've', 'll', 'ed'],\n",
+ " 'beware': ['▁be', 'w', 'ar', 'e'],\n",
+ " 'bewildered': ['▁be', 'w', 'il', 'd', 'er', 'ed'],\n",
+ " 'beyond': ['▁be', 'y', 'on', 'd'],\n",
+ " 'bidet': ['▁b', 'i', 'd', 'et'],\n",
+ " 'big': ['▁big'],\n",
+ " 'bigger': ['▁big', 'g', 'er'],\n",
+ " 'biggest': ['▁big', 'g', 'est'],\n",
+ " 'bill': ['▁b', 'ill'],\n",
+ " 'bills': ['▁b', 'ill', 's'],\n",
+ " 'binding': ['▁b', 'ind', 'ing'],\n",
+ " 'biological': ['▁b', 'i', 'o', 'lo', 'g', 'ic', 'al'],\n",
+ " 'bird': ['▁b', 'ir', 'd'],\n",
+ " 'birds': ['▁b', 'ir', 'd', 's'],\n",
+ " 'bishop': ['▁b', 'ish', 'op'],\n",
+ " 'bit': ['▁', 'bit'],\n",
+ " 'bite': ['▁', 'bit', 'e'],\n",
+ " 'bits': ['▁', 'bit', 's'],\n",
+ " 'bitter-sweet': ['▁', 'bit', 'ter', '-', 's', 'w', 'e', 'et'],\n",
+ " 'bitterest': ['▁', 'bit', 'ter', 'est'],\n",
+ " 'bitterly': ['▁', 'bit', 'ter', 'ly'],\n",
+ " 'bituminized': ['▁', 'bit', 'um', 'in', 'i', 'z', 'ed'],\n",
+ " 'black': ['▁b', 'l', 'a', 'ck'],\n",
+ " 'black-': ['▁b', 'l', 'a', 'ck', '-'],\n",
+ " 'blackbird': ['▁b', 'l', 'a', 'ck', 'b', 'ir', 'd'],\n",
+ " 'blacks': ['▁b', 'l', 'a', 'ck', 's'],\n",
+ " 'blame': ['▁b', 'l', 'a', 'me'],\n",
+ " 'blamed': ['▁b', 'l', 'a', 'm', 'ed'],\n",
+ " 'blander': ['▁b', 'land', 'er'],\n",
+ " 'blank': ['▁b', 'l', 'an', 'k'],\n",
+ " 'blight': ['▁b', 'l', 'ight'],\n",
+ " 'blind': ['▁b', 'lin', 'd'],\n",
+ " 'blinked': ['▁b', 'lin', 'k', 'ed'],\n",
+ " 'block': ['▁block'],\n",
+ " 'blocks': ['▁block', 's'],\n",
+ " 'bloem-': ['▁b', 'lo', 'e', 'm', '-'],\n",
+ " 'blond': ['▁b', 'l', 'on', 'd'],\n",
+ " 'blood': ['▁b', 'l', 'ood'],\n",
+ " 'bloodstained': ['▁b', 'l', 'ood', 'st', 'ain', 'ed'],\n",
+ " 'bloody': ['▁b', 'l', 'ood', 'y'],\n",
+ " 'blouse': ['▁b', 'l', 'ous', 'e'],\n",
+ " 'blouses': ['▁b', 'l', 'ous', 'es'],\n",
+ " 'blow': ['▁b', 'low'],\n",
+ " 'blowflies': ['▁b', 'low', 'f', 'l', 'ies'],\n",
+ " 'blown': ['▁b', 'low', 'n'],\n",
+ " 'blue': ['▁b', 'l', 'u', 'e'],\n",
+ " 'blunt': ['▁b', 'l', 'un', 't'],\n",
+ " 'bluntly': ['▁b', 'l', 'un', 't', 'ly'],\n",
+ " 'bluster': ['▁b', 'l', 'us', 'ter'],\n",
+ " 'board': ['▁bo', 'ard'],\n",
+ " 'boat': ['▁bo', 'at'],\n",
+ " 'boat-train': ['▁bo', 'at', '-', 'train'],\n",
+ " 'bobby': ['▁bo', 'b', 'by'],\n",
+ " 'bodies': ['▁bo', 'd', 'ies'],\n",
+ " 'body': ['▁', 'body'],\n",
+ " 'boeing': ['▁bo', 'e', 'ing'],\n",
+ " 'bogy': ['▁bo', 'g', 'y'],\n",
+ " 'boiled': ['▁bo', 'il', 'ed'],\n",
+ " 'boils': ['▁bo', 'il', 's'],\n",
+ " 'bold': ['▁bo', 'ld'],\n",
+ " 'boldly': ['▁bo', 'ld', 'ly'],\n",
+ " 'bolted': ['▁bo', 'l', 't', 'ed'],\n",
+ " 'bomb': ['▁', 'bomb'],\n",
+ " 'bombay': ['▁', 'bomb', 'a', 'y'],\n",
+ " 'bombed': ['▁', 'bomb', 'ed'],\n",
+ " 'bombers': ['▁bo', 'mber', 's'],\n",
+ " 'bonded': ['▁bo', 'n', 'd', 'ed'],\n",
+ " 'bone': ['▁bo', 'ne'],\n",
+ " 'bones': ['▁bo', 'ne', 's'],\n",
+ " 'bonn': ['▁bo', 'n', 'n'],\n",
+ " 'book': ['▁bo', 'ok'],\n",
+ " 'booklet': ['▁bo', 'ok', 'le', 't'],\n",
+ " 'books': ['▁bo', 'ok', 's'],\n",
+ " 'booming': ['▁bo', 'o', 'm', 'ing'],\n",
+ " 'border': ['▁b', 'or', 'd', 'er'],\n",
+ " 'bore': ['▁bo', 're'],\n",
+ " 'bored': ['▁b', 'or', 'ed'],\n",
+ " 'boredom': ['▁b', 'or', 'ed', 'o', 'm'],\n",
+ " 'bores': ['▁bo', 're', 's'],\n",
+ " 'borough': ['▁b', 'or', 'ough'],\n",
+ " 'borrow': ['▁b', 'or', 'ro', 'w'],\n",
+ " 'borstal': ['▁b', 'or', 'st', 'al'],\n",
+ " 'bosoms': ['▁bo', 's', 'o', 'm', 's'],\n",
+ " 'bossed': ['▁bo', 's', 's', 'ed'],\n",
+ " 'bosses': ['▁bo', 's', 'se', 's'],\n",
+ " 'both': ['▁bo', 'th'],\n",
+ " 'bottle': ['▁bo', 'ttle'],\n",
+ " 'bottom': ['▁bo', 't', 'tom'],\n",
+ " 'bought': ['▁b', 'ought'],\n",
+ " 'boun': ['▁bo', 'un'],\n",
+ " 'bound': ['▁b', 'ound'],\n",
+ " 'boutiques': ['▁b', 'out', 'i', 'q', 'u', 'es'],\n",
+ " 'bow': ['▁bo', 'w'],\n",
+ " 'bow-street': ['▁bo', 'w', '-', 'street'],\n",
+ " 'bowed': ['▁bo', 'w', 'ed'],\n",
+ " 'bowing': ['▁bo', 'w', 'ing'],\n",
+ " 'bows': ['▁bo', 'w', 's'],\n",
+ " 'box': ['▁bo', 'x'],\n",
+ " 'boxes': ['▁bo', 'x', 'es'],\n",
+ " 'boxing': ['▁bo', 'x', 'ing'],\n",
+ " 'boy': ['▁bo', 'y'],\n",
+ " 'boycotted': ['▁bo', 'y', 'c', 'o', 'tt', 'ed'],\n",
+ " 'boycotting': ['▁bo', 'y', 'c', 'o', 'tt', 'ing'],\n",
+ " 'boyle': ['▁bo', 'y', 'le'],\n",
+ " 'boys': ['▁bo', 'y', 's'],\n",
+ " 'braces': ['▁br', 'a', 'ce', 's'],\n",
+ " 'brain': ['▁br', 'ain'],\n",
+ " 'brain-activity': ['▁br', 'ain', '-', 'a', 'ct', 'i', 'v', 'it', 'y'],\n",
+ " 'brain-children': ['▁br', 'ain', '-', 'ch', 'il', 'd', 'ren'],\n",
+ " 'brains': ['▁br', 'ain', 's'],\n",
+ " 'brandy': ['▁br', 'and', 'y'],\n",
+ " 'brash': ['▁br', 'ash'],\n",
+ " 'brass': ['▁br', 'a', 's', 's'],\n",
+ " 'brauchitsch': ['▁br', 'a', 'uch', 'it', 's', 'ch'],\n",
+ " 'breach': ['▁br', 'each'],\n",
+ " 'bread-and-butter': ['▁br', 'e', 'ad', '-', 'and', '-', 'b', 'u', 't', 'ter'],\n",
+ " 'break': ['▁br', 'e', 'ak'],\n",
+ " 'breaking': ['▁br', 'e', 'ak', 'ing'],\n",
+ " 'breaks': ['▁br', 'e', 'ak', 's'],\n",
+ " 'breath': ['▁br', 'e', 'at', 'h'],\n",
+ " 'breathing': ['▁br', 'e', 'a', 'thing'],\n",
+ " 'breathless': ['▁br', 'e', 'at', 'h', 'less'],\n",
+ " 'breeding': ['▁br', 'e', 'ed', 'ing'],\n",
+ " 'breezily': ['▁br', 'e', 'e', 'z', 'i', 'ly'],\n",
+ " 'brenda': ['▁br', 'end', 'a'],\n",
+ " 'brendan': ['▁br', 'end', 'an'],\n",
+ " \"brendan's\": ['▁br', 'end', 'an', \"'\", 's'],\n",
+ " 'brentano': ['▁br', 'ent', 'an', 'o'],\n",
+ " 'brezhnev': ['▁br', 'e', 'z', 'h', 'ne', 'v'],\n",
+ " 'brian': ['▁br', 'ian'],\n",
+ " 'bridal': ['▁', 'brid', 'al'],\n",
+ " 'bride': ['▁', 'brid', 'e'],\n",
+ " 'brief': ['▁brief'],\n",
+ " 'briefcase': ['▁brief', 'case'],\n",
+ " 'briefing': ['▁brief', 'ing'],\n",
+ " 'brigadiers': ['▁br', 'ig', 'ad', 'i', 'ers'],\n",
+ " 'bright': ['▁br', 'ight'],\n",
+ " 'brighter': ['▁br', 'ight', 'er'],\n",
+ " 'brightly': ['▁br', 'ight', 'ly'],\n",
+ " \"brighton's\": ['▁br', 'ight', 'on', \"'\", 's'],\n",
+ " 'brilliant': ['▁br', 'ill', 'ian', 't'],\n",
+ " 'brilliantly': ['▁br', 'ill', 'ian', 't', 'ly'],\n",
+ " 'bring': ['▁br', 'ing'],\n",
+ " 'brings': ['▁br', 'ing', 's'],\n",
+ " 'bristled': ['▁br', 'is', 't', 'led'],\n",
+ " 'britain': ['▁britain'],\n",
+ " \"britain's\": ['▁britain', \"'\", 's'],\n",
+ " 'british': ['▁british'],\n",
+ " 'british-owned': ['▁british', '-', 'ow', 'n', 'ed'],\n",
+ " 'britishers': ['▁british', 'ers'],\n",
+ " 'brittle': ['▁br', 'i', 'ttle'],\n",
+ " 'broad': ['▁br', 'o', 'ad'],\n",
+ " 'broadcast': ['▁br', 'o', 'ad', 'c', 'a', 'st'],\n",
+ " 'broadcasting': ['▁br', 'o', 'ad', 'c', 'a', 'st', 'ing'],\n",
+ " 'broke': ['▁br', 'ok', 'e'],\n",
+ " 'broken': ['▁br', 'ok', 'en'],\n",
+ " \"brook's\": ['▁br', 'o', 'ok', \"'\", 's'],\n",
+ " 'brother': ['▁br', 'other'],\n",
+ " 'brother-': ['▁br', 'other', '-'],\n",
+ " 'brother-in-law': ['▁br', 'other', '-', 'in', '-', 'law'],\n",
+ " 'brought': ['▁br', 'ought'],\n",
+ " 'brown': ['▁brown'],\n",
+ " \"brown's\": ['▁brown', \"'\", 's'],\n",
+ " 'bru\"cke': ['▁br', 'u', '\"', 'ck', 'e'],\n",
+ " 'bruce': ['▁br', 'u', 'ce'],\n",
+ " 'bruno': ['▁br', 'un', 'o'],\n",
+ " 'brunswick': ['▁br', 'un', 's', 'w', 'i', 'ck'],\n",
+ " 'brussels': ['▁br', 'us', 's', 'el', 's'],\n",
+ " 'brutal': ['▁br', 'u', 't', 'al'],\n",
+ " 'bryan': ['▁br', 'y', 'an'],\n",
+ " 'buck': ['▁b', 'u', 'ck'],\n",
+ " 'buckingham': ['▁b', 'u', 'ck', 'ing', 'h', 'a', 'm'],\n",
+ " 'buckley': ['▁b', 'u', 'ck', 'le', 'y'],\n",
+ " 'budge': ['▁b', 'ud', 'ge'],\n",
+ " 'budgerigar': ['▁b', 'ud', 'g', 'er', 'ig', 'ar'],\n",
+ " 'budget': ['▁b', 'ud', 'ge', 't'],\n",
+ " 'budgetary': ['▁b', 'ud', 'ge', 't', 'ar', 'y'],\n",
+ " 'budgette': ['▁b', 'ud', 'ge', 't', 'te'],\n",
+ " 'buganda': ['▁b', 'ug', 'and', 'a'],\n",
+ " 'build': ['▁b', 'u', 'il', 'd'],\n",
+ " 'building': ['▁b', 'u', 'il', 'd', 'ing'],\n",
+ " 'built': ['▁b', 'u', 'il', 't'],\n",
+ " 'bulk': ['▁b', 'ul', 'k'],\n",
+ " 'bulky': ['▁b', 'ul', 'k', 'y'],\n",
+ " 'bulletins': ['▁b', 'ul', 'le', 't', 'in', 's'],\n",
+ " 'bullying': ['▁b', 'ul', 'ly', 'ing'],\n",
+ " 'bumps': ['▁b', 'um', 'p', 's'],\n",
+ " 'bundle': ['▁b', 'un', 'd', 'le'],\n",
+ " 'bundy': ['▁b', 'un', 'd', 'y'],\n",
+ " 'bunk': ['▁b', 'un', 'k'],\n",
+ " 'burden': ['▁burde', 'n'],\n",
+ " 'burdens': ['▁burde', 'n', 's'],\n",
+ " 'burdett': ['▁burde', 'tt'],\n",
+ " 'bureau': ['▁b', 'ure', 'a', 'u'],\n",
+ " 'burge': ['▁b', 'ur', 'ge'],\n",
+ " 'burning': ['▁b', 'ur', 'n', 'ing'],\n",
+ " 'burst': ['▁b', 'ur', 'st'],\n",
+ " 'bush': ['▁bus', 'h'],\n",
+ " 'busi-': ['▁bus', 'i', '-'],\n",
+ " 'business': ['▁bus', 'i', 'ness'],\n",
+ " 'bustling': ['▁bus', 't', 'l', 'ing'],\n",
+ " 'busy': ['▁bus', 'y'],\n",
+ " 'but': ['▁but'],\n",
+ " 'butagas': ['▁but', 'a', 'g', 'a', 's'],\n",
+ " 'butler': ['▁but', 'l', 'er'],\n",
+ " 'butt': ['▁but', 't'],\n",
+ " 'buttery': ['▁but', 'ter', 'y'],\n",
+ " 'button': ['▁but', 'ton'],\n",
+ " 'buy': ['▁b', 'u', 'y'],\n",
+ " 'buying': ['▁b', 'u', 'y', 'ing'],\n",
+ " 'by': ['▁by'],\n",
+ " 'by-': ['▁by', '-'],\n",
+ " 'by-elections': ['▁by', '-', 'elect', 'ion', 's'],\n",
+ " 'by-passed': ['▁by', '-', 'p', 'a', 's', 's', 'ed'],\n",
+ " 'byrne': ['▁by', 'r', 'ne'],\n",
+ " 'byron': ['▁by', 'ro', 'n'],\n",
+ " 'c': ['▁c'],\n",
+ " 'c.c.f.': ['▁c', '.', 'c', '.', 'f', '.'],\n",
+ " 'c.i.g.s.': ['▁c', '.', 'i', '.', 'g', '.', 's', '.'],\n",
+ " 'cabinet': ['▁c', 'a', 'b', 'ine', 't'],\n",
+ " 'cable': ['▁c', 'able'],\n",
+ " 'cables': ['▁c', 'able', 's'],\n",
+ " 'cagnes': ['▁c', 'a', 'g', 'ne', 's'],\n",
+ " 'call': ['▁call'],\n",
+ " 'call-out': ['▁call', '-', 'out'],\n",
+ " 'callaghan': ['▁call', 'a', 'g', 'h', 'an'],\n",
+ " 'called': ['▁call', 'ed'],\n",
+ " 'calls': ['▁call', 's'],\n",
+ " 'cally': ['▁call', 'y'],\n",
+ " 'calm': ['▁c', 'al', 'm'],\n",
+ " 'calorie': ['▁c', 'al', 'or', 'ie'],\n",
+ " 'cambodia': ['▁cam', 'bo', 'd', 'i', 'a'],\n",
+ " 'cambridge': ['▁cam', 'brid', 'ge'],\n",
+ " 'came': ['▁cam', 'e'],\n",
+ " 'camp': ['▁cam', 'p'],\n",
+ " 'campaign': ['▁campaign'],\n",
+ " 'campaigns': ['▁campaign', 's'],\n",
+ " 'can': ['▁can'],\n",
+ " \"can't\": ['▁can', \"'\", 't'],\n",
+ " 'can-': ['▁can', '-'],\n",
+ " 'canada': ['▁can', 'ad', 'a'],\n",
+ " 'canadian': ['▁can', 'ad', 'ian'],\n",
+ " 'canadians': ['▁can', 'ad', 'ian', 's'],\n",
+ " 'canals': ['▁can', 'al', 's'],\n",
+ " 'canard': ['▁can', 'ard'],\n",
+ " 'cancelled': ['▁can', 'ce', 'll', 'ed'],\n",
+ " 'cancerous': ['▁can', 'c', 'er', 'ous'],\n",
+ " 'candi-': ['▁can', 'd', 'i', '-'],\n",
+ " 'candidate': ['▁can', 'did', 'at', 'e'],\n",
+ " 'candles': ['▁can', 'd', 'le', 's'],\n",
+ " 'cannot': ['▁can', 'not'],\n",
+ " 'canon': ['▁can', 'on'],\n",
+ " 'cantabile': ['▁can', 't', 'a', 'b', 'il', 'e'],\n",
+ " 'canteen': ['▁can', 'te', 'en'],\n",
+ " ...}"
+ ]
+ },
+ "execution_count": 14,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "processor.lexicon"
+ ]
+ },
+ {
+ "cell_type": "code",
"execution_count": 167,
"id": "d08a0259",
"metadata": {},
diff --git a/notebooks/04b-look-at-iam-paragraphs.ipynb b/notebooks/04b-look-at-iam-paragraphs.ipynb
index dc0aef6..11ebddf 100644
--- a/notebooks/04b-look-at-iam-paragraphs.ipynb
+++ b/notebooks/04b-look-at-iam-paragraphs.ipynb
@@ -256,7 +256,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.8.2"
+ "version": "3.9.1"
}
},
"nbformat": 4,
diff --git a/text_recognizer/data/iam_preprocessor.py b/text_recognizer/data/iam_preprocessor.py
index d85787e..60f8a9f 100644
--- a/text_recognizer/data/iam_preprocessor.py
+++ b/text_recognizer/data/iam_preprocessor.py
@@ -119,7 +119,6 @@ class Preprocessor:
continue
self.text.append(example["text"].lower())
-
def _to_index(self, line: str) -> torch.LongTensor:
if line in self.special_tokens:
return torch.LongTensor([self.tokens_to_index[line]])
diff --git a/text_recognizer/data/mapping.py b/text_recognizer/data/mapping.py
deleted file mode 100644
index f0edf3f..0000000
--- a/text_recognizer/data/mapping.py
+++ /dev/null
@@ -1,8 +0,0 @@
-"""Mapping to and from word pieces."""
-from pathlib import Path
-
-
-class WordPieces:
-
- def __init__(self) -> None:
- pass
diff --git a/text_recognizer/data/mappings.py b/text_recognizer/data/mappings.py
new file mode 100644
index 0000000..cfa0ec7
--- /dev/null
+++ b/text_recognizer/data/mappings.py
@@ -0,0 +1,143 @@
+"""Mapping to and from word pieces."""
+from abc import ABC, abstractmethod
+from pathlib import Path
+from typing import List, Optional, Union, Sequence
+
+from loguru import logger
+import torch
+from torch import Tensor
+
+from text_recognizer.data.emnist import emnist_mapping
+from text_recognizer.datasets.iam_preprocessor import Preprocessor
+
+
class AbstractMapping(ABC):
    """Interface for converting between tokens/text and index tensors.

    Subclasses provide single-item lookups (``get_token`` / ``get_index``)
    and whole-sequence conversions (``get_text`` / ``get_indices``).
    """

    @abstractmethod
    def get_token(self, *args, **kwargs) -> str:
        """Return the token that corresponds to a single index."""
        ...

    @abstractmethod
    def get_index(self, *args, **kwargs) -> Tensor:
        """Return the index of a single token as a tensor."""
        ...

    @abstractmethod
    def get_text(self, *args, **kwargs) -> str:
        """Return the text that corresponds to a sequence of indices."""
        ...

    @abstractmethod
    def get_indices(self, *args, **kwargs) -> Tensor:
        """Return the indices of a piece of text as a tensor."""
        ...


class EmnistMapping(AbstractMapping):
    """Character-level mapping backed by the tables from ``emnist_mapping``."""

    def __init__(self, extra_symbols: Optional[Sequence[str]] = None) -> None:
        """Load the lookup tables.

        Args:
            extra_symbols: Optional extra symbols forwarded to
                ``emnist_mapping``. Defaults to ``None``.
        """
        # ``mapping`` is used as index -> token and ``inverse_mapping`` as
        # token -> index by the methods below.
        self.mapping, self.inverse_mapping, self.input_size = emnist_mapping(
            extra_symbols
        )

    def get_token(self, index: Union[int, Tensor]) -> str:
        """Return the token stored for ``index``.

        Raises:
            KeyError: If ``index`` is not a key of the mapping.
        """
        if (index := int(index)) in self.mapping:
            return self.mapping[index]
        raise KeyError(f"Index ({index}) not in mapping.")

    def get_index(self, token: str) -> Tensor:
        """Return the index of ``token`` as a one-element ``LongTensor``.

        Raises:
            KeyError: If ``token`` is not in the inverse mapping.
        """
        if token in self.inverse_mapping:
            # Wrap the value in a list: the legacy tensor constructors treat
            # a bare int as a *size*, which would yield uninitialised memory.
            return torch.LongTensor([self.inverse_mapping[token]])
        raise KeyError(f"Token ({token}) not found in inverse mapping.")

    def get_text(self, indices: Union[List[int], Tensor]) -> str:
        """Concatenate the tokens for ``indices`` into a string."""
        if isinstance(indices, Tensor):
            indices = indices.tolist()
        return "".join([self.mapping[index] for index in indices])

    def get_indices(self, text: str) -> Tensor:
        """Return the index of every character of ``text`` as a ``LongTensor``.

        Raises:
            KeyError: If a character is not in the inverse mapping.
        """
        # Indices are integral, so build a long tensor rather than a float one.
        return torch.LongTensor([self.inverse_mapping[token] for token in text])


class WordPieceMapping(EmnistMapping):
    """Mapping to and from word pieces, layered on top of the EMNIST mapping.

    The EMNIST tables from the parent class are kept so that EMNIST index
    sequences can be re-encoded as word piece indices.
    """

    def __init__(
        self,
        num_features: int,
        tokens: str,
        lexicon: str,
        data_dir: Optional[Union[str, Path]] = None,
        use_words: bool = False,
        prepend_wordsep: bool = False,
        special_tokens: Sequence[str] = ("<s>", "<e>", "<p>"),
        extra_symbols: Optional[Sequence[str]] = None,
    ) -> None:
        """Build the EMNIST tables and the word piece processor.

        Args:
            num_features: Forwarded to ``Preprocessor``.
            tokens: File name of the tokens file, relative to the processed
                ``iam_lines`` directory.
            lexicon: File name of the lexicon file, relative to the processed
                ``iam_lines`` directory.
            data_dir: Location of the iamdb directory; a path relative to
                this file is used when ``None``.
            use_words: Forwarded to ``Preprocessor``.
            prepend_wordsep: Forwarded to ``Preprocessor``.
            special_tokens: Special tokens forwarded to ``Preprocessor``.
            extra_symbols: Extra symbols for the EMNIST mapping; they are
                also appended to ``special_tokens``.

        Raises:
            RuntimeError: If the iamdb directory does not exist.
        """
        super().__init__(extra_symbols)
        self.wordpiece_processor = self._configure_wordpiece_processor(
            num_features,
            tokens,
            lexicon,
            data_dir,
            use_words,
            prepend_wordsep,
            special_tokens,
            extra_symbols,
        )

    @staticmethod
    def _merge_special_tokens(
        special_tokens: Optional[Sequence[str]],
        extra_symbols: Optional[Sequence[str]],
    ) -> Optional[Sequence[str]]:
        """Return ``special_tokens`` followed by ``extra_symbols`` as a new object."""
        if not extra_symbols:
            return special_tokens
        merged = (*(special_tokens or ()), *extra_symbols)
        # Keep the container kind the caller used, but never mutate their
        # object (``+=`` on a list would, and tuple += list raises TypeError).
        return list(merged) if isinstance(special_tokens, list) else merged

    def _configure_wordpiece_processor(
        self,
        num_features: int,
        tokens: str,
        lexicon: str,
        data_dir: Optional[Union[str, Path]],
        use_words: bool,
        prepend_wordsep: bool,
        special_tokens: Optional[Sequence[str]],
        extra_symbols: Optional[Sequence[str]],
    ) -> "Preprocessor":
        """Resolve the data paths and construct the ``Preprocessor``.

        Raises:
            RuntimeError: If the iamdb directory does not exist.
        """
        project_root = Path(__file__).resolve().parents[2]
        data_dir = (
            (project_root / "data" / "raw" / "iam" / "iamdb")
            if data_dir is None
            else Path(data_dir)
        )

        logger.debug(f"Using data dir: {data_dir}")
        if not data_dir.exists():
            raise RuntimeError(f"Could not locate iamdb directory at {data_dir}")

        processed_path = project_root / "data" / "processed" / "iam_lines"
        tokens_path = processed_path / tokens
        lexicon_path = processed_path / lexicon

        special_tokens = self._merge_special_tokens(special_tokens, extra_symbols)

        return Preprocessor(
            data_dir,
            num_features,
            tokens_path,
            lexicon_path,
            use_words,
            prepend_wordsep,
            special_tokens,
        )

    def get_token(self, index: Union[int, Tensor]) -> str:
        """Return the word piece stored at ``index``.

        Raises:
            KeyError: If ``index`` is negative or outside the token table.
        """
        index = int(index)
        # Validate against the container itself: a negative index would
        # silently wrap around, and an upper bound of ``<= num_tokens`` lets
        # one-past-the-end through.
        # NOTE(review): assumes ``tokens`` is addressable by position - confirm.
        if index >= 0:
            try:
                return self.wordpiece_processor.tokens[index]
            except (IndexError, KeyError):
                pass
        raise KeyError(f"Index ({index}) not in mapping.")

    def get_index(self, token: str) -> Tensor:
        """Return the index of ``token`` as a one-element ``LongTensor``.

        Raises:
            KeyError: If ``token`` is not a known word piece.
        """
        if token in self.wordpiece_processor.tokens:
            # A bare int would be interpreted as a tensor size, so wrap it.
            return torch.LongTensor([self.wordpiece_processor.tokens_to_index[token]])
        raise KeyError(f"Token ({token}) not found in inverse mapping.")

    def get_text(self, indices: Union[List[int], Tensor]) -> str:
        """Decode word piece ``indices`` with the processor's ``to_text``."""
        if isinstance(indices, Tensor):
            indices = indices.tolist()
        return self.wordpiece_processor.to_text(indices)

    def get_indices(self, text: str) -> Tensor:
        """Encode ``text`` with the processor's ``to_index``."""
        return self.wordpiece_processor.to_index(text)

    def emnist_to_wordpiece_indices(self, x: Tensor) -> Tensor:
        """Re-encode EMNIST indices ``x`` as word piece indices.

        The indices are decoded to characters with the EMNIST tables,
        lower-cased, spaces are replaced by the "▁" separator, and the result
        is encoded by the word piece processor.
        """
        # ``self.mapping`` is the plain lookup table and ``get_text`` is
        # overridden in this class, so call the parent's decoder explicitly.
        text = EmnistMapping.get_text(self, x)
        text = text.lower().replace(" ", "▁")
        return torch.as_tensor(
            self.wordpiece_processor.to_index(text), dtype=torch.long
        )
diff --git a/text_recognizer/data/transforms.py b/text_recognizer/data/transforms.py
index 297c953..f53df64 100644
--- a/text_recognizer/data/transforms.py
+++ b/text_recognizer/data/transforms.py
@@ -1,115 +1,36 @@
"""Transforms for PyTorch datasets."""
-from abc import abstractmethod
from pathlib import Path
-from typing import Any, Optional, Union
+from typing import Optional, Union, Sequence
-from loguru import logger
-import torch
from torch import Tensor
-from text_recognizer.datasets.iam_preprocessor import Preprocessor
-from text_recognizer.data.emnist import emnist_mapping
+from text_recognizer.datasets.mappings import WordPieceMapping
-class ToLower:
- """Converts target to lower case."""
-
- def __call__(self, target: Tensor) -> Tensor:
- """Corrects index value in target tensor."""
- device = target.device
- return torch.stack([x - 26 if x > 35 else x for x in target]).to(device)
-
-
-class ToCharcters:
- """Converts integers to characters."""
-
- def __init__(self, extra_symbols: Optional[List[str]] = None) -> None:
- self.mapping, _, _ = emnist_mapping(extra_symbols)
-
- def __call__(self, y: Tensor) -> str:
- """Converts a Tensor to a str."""
- return "".join([self.mapping[int(i)] for i in y]).replace(" ", "▁")
-
-
-class WordPieces:
- """Abstract transform for word pieces."""
+class WordPiece:
+ """Converts EMNIST indices to Word Piece indices."""
def __init__(
self,
num_features: int,
+ tokens: str,
+ lexicon: str,
data_dir: Optional[Union[str, Path]] = None,
- tokens: Optional[Union[str, Path]] = None,
- lexicon: Optional[Union[str, Path]] = None,
use_words: bool = False,
prepend_wordsep: bool = False,
+ special_tokens: Sequence[str] = ("<s>", "<e>", "<p>"),
+ extra_symbols: Optional[Sequence[str]] = None,
) -> None:
- if data_dir is None:
- data_dir = (
- Path(__file__).resolve().parents[3] / "data" / "raw" / "iam" / "iamdb"
- )
- logger.debug(f"Using data dir: {data_dir}")
- if not data_dir.exists():
- raise RuntimeError(f"Could not locate iamdb directory at {data_dir}")
- else:
- data_dir = Path(data_dir)
- processed_path = (
- Path(__file__).resolve().parents[3] / "data" / "processed" / "iam_lines"
- )
- tokens_path = processed_path / tokens
- lexicon_path = processed_path / lexicon
-
- self.preprocessor = Preprocessor(
- data_dir,
+ self.mapping = WordPieceMapping(
num_features,
- tokens_path,
- lexicon_path,
+ tokens,
+ lexicon,
+ data_dir,
use_words,
prepend_wordsep,
+ special_tokens,
+ extra_symbols,
)
- @abstractmethod
- def __call__(self, *args, **kwargs) -> Any:
- """Transforms input."""
- ...
-
-
-class ToWordPieces(WordPieces):
- """Transforms str to word pieces."""
-
- def __init__(
- self,
- num_features: int,
- data_dir: Optional[Union[str, Path]] = None,
- tokens: Optional[Union[str, Path]] = None,
- lexicon: Optional[Union[str, Path]] = None,
- use_words: bool = False,
- prepend_wordsep: bool = False,
- ) -> None:
- super().__init__(
- num_features, data_dir, tokens, lexicon, use_words, prepend_wordsep
- )
-
- def __call__(self, line: str) -> Tensor:
- """Transforms str to word pieces."""
- return self.preprocessor.to_index(line)
-
-
-class ToText(WordPieces):
- """Takes word pieces and converts them to text."""
-
- def __init__(
- self,
- num_features: int,
- data_dir: Optional[Union[str, Path]] = None,
- tokens: Optional[Union[str, Path]] = None,
- lexicon: Optional[Union[str, Path]] = None,
- use_words: bool = False,
- prepend_wordsep: bool = False,
- ) -> None:
- super().__init__(
- num_features, data_dir, tokens, lexicon, use_words, prepend_wordsep
- )
-
- def __call__(self, x: Tensor) -> str:
- """Converts tensor to text."""
- return self.preprocessor.to_text(x.tolist())
+ def __call__(self, x: Tensor) -> Tensor:
+ return self.mapping.emnist_to_wordpiece_indices(x)
diff --git a/text_recognizer/models/base.py b/text_recognizer/models/base.py
index c6d5d73..aeda039 100644
--- a/text_recognizer/models/base.py
+++ b/text_recognizer/models/base.py
@@ -49,7 +49,9 @@ class LitBaseModel(pl.LightningModule):
optimizer_class = getattr(torch.optim, self._optimizer.type)
return optimizer_class(params=self.parameters(), **args)
- def _configure_lr_scheduler(self, optimizer: Type[torch.optim.Optimizer]) -> Dict[str, Any]:
+ def _configure_lr_scheduler(
+ self, optimizer: Type[torch.optim.Optimizer]
+ ) -> Dict[str, Any]:
"""Configures the lr scheduler."""
scheduler = {"monitor": self.monitor}
args = {} or self._lr_scheduler.args
@@ -59,7 +61,7 @@ class LitBaseModel(pl.LightningModule):
scheduler["scheduler"] = getattr(
torch.optim.lr_scheduler, self._lr_scheduler.type
- )(optimizer, **args)
+ )(optimizer, **args)
return scheduler
diff --git a/text_recognizer/networks/image_transformer.py b/text_recognizer/networks/image_transformer.py
index daededa..a6aaca4 100644
--- a/text_recognizer/networks/image_transformer.py
+++ b/text_recognizer/networks/image_transformer.py
@@ -44,7 +44,9 @@ class ImageTransformer(nn.Module):
dropout_rate: float = 0.1,
transformer_activation: str = "glu",
) -> None:
- self.vocab_size = NUM_WORD_PIECES + NUM_SPECIAL_TOKENS if vocab_size is None else vocab_size
+ self.vocab_size = (
+ NUM_WORD_PIECES + NUM_SPECIAL_TOKENS if vocab_size is None else vocab_size
+ )
self.hidden_dim = hidden_dim
self.max_output_length = output_shape[0]
diff --git a/training/configs/image_transformer.yaml b/training/configs/image_transformer.yaml
index 88c05c2..228e53f 100644
--- a/training/configs/image_transformer.yaml
+++ b/training/configs/image_transformer.yaml
@@ -56,6 +56,7 @@ callbacks:
args:
monitor: val_loss
mode: min
+ save_last: true
- type: StochasticWeightAveraging
args:
swa_epoch_start: 0.8
diff --git a/training/run_experiment.py b/training/run_experiment.py
index f46803f..1e79461 100644
--- a/training/run_experiment.py
+++ b/training/run_experiment.py
@@ -15,7 +15,7 @@ import wandb
SEED = 4711
CONFIGS_DIRNAME = Path(__file__).parent.resolve() / "configs"
-LOGS_DIRNAME = Path(__file__).parent.resolve() / "runs" / "logs"
+LOGS_DIRNAME = Path(__file__).parent.resolve() / "logs"
def _configure_logging(log_dir: Optional[Path], verbose: int = 0) -> None:
@@ -67,7 +67,7 @@ def _configure_callbacks(
def _configure_logger(
- network: Type[nn.Module], args: Dict, log_dir: str, use_wandb: bool
+ network: Type[nn.Module], args: Dict, log_dir: str, use_wandb: bool
) -> Type[pl.loggers.LightningLoggerBase]:
"""Configures lightning logger."""
if use_wandb:
@@ -124,7 +124,6 @@ def run(
_configure_logging(None, verbose=verbose)
logger.info("Starting experiment...")
-
# Seed everything in the experiment.
logger.info(f"Seeding everthing with seed={SEED}")
pl.utilities.seed.seed_everything(SEED)