diff options
-rw-r--r-- | notebooks/03-look-at-iam-paragraphs.ipynb | 2126 | ||||
-rw-r--r-- | notebooks/04b-look-at-iam-paragraphs.ipynb | 2 | ||||
-rw-r--r-- | text_recognizer/data/iam_preprocessor.py | 1 | ||||
-rw-r--r-- | text_recognizer/data/mapping.py | 8 | ||||
-rw-r--r-- | text_recognizer/data/mappings.py | 143 | ||||
-rw-r--r-- | text_recognizer/data/transforms.py | 111 | ||||
-rw-r--r-- | text_recognizer/models/base.py | 6 | ||||
-rw-r--r-- | text_recognizer/networks/image_transformer.py | 4 | ||||
-rw-r--r-- | training/configs/image_transformer.yaml | 1 | ||||
-rw-r--r-- | training/run_experiment.py | 5 |
10 files changed, 2293 insertions, 114 deletions
diff --git a/notebooks/03-look-at-iam-paragraphs.ipynb b/notebooks/03-look-at-iam-paragraphs.ipynb index cfa0ba5..7599e11 100644 --- a/notebooks/03-look-at-iam-paragraphs.ipynb +++ b/notebooks/03-look-at-iam-paragraphs.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "id": "6ce2519f", "metadata": {}, "outputs": [], @@ -212,7 +212,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 3, "id": "45649194", "metadata": {}, "outputs": [], @@ -223,7 +223,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 11, "id": "0fc13f9f", "metadata": {}, "outputs": [], @@ -241,6 +241,1105 @@ }, { "cell_type": "code", + "execution_count": 12, + "id": "c70e8873", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['!',\n", + " '\"',\n", + " '&',\n", + " \"'\",\n", + " '(',\n", + " ')',\n", + " '*',\n", + " '+',\n", + " ',',\n", + " '-',\n", + " '.',\n", + " '/',\n", + " '0',\n", + " '1',\n", + " '2',\n", + " '3',\n", + " '4',\n", + " '5',\n", + " '6',\n", + " '7',\n", + " '8',\n", + " '9',\n", + " ':',\n", + " ';',\n", + " '?',\n", + " 'a',\n", + " 'b',\n", + " 'c',\n", + " 'd',\n", + " 'e',\n", + " 'f',\n", + " 'g',\n", + " 'h',\n", + " 'i',\n", + " 'j',\n", + " 'k',\n", + " 'l',\n", + " 'm',\n", + " 'n',\n", + " 'o',\n", + " 'p',\n", + " 'q',\n", + " 'r',\n", + " 's',\n", + " 't',\n", + " 'u',\n", + " 'v',\n", + " 'w',\n", + " 'x',\n", + " 'y',\n", + " 'z',\n", + " '▁',\n", + " '<s>',\n", + " '<e>',\n", + " '<p>',\n", + " '\\n']" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "processor.graphemes" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "6def52f6", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['/',\n", + " '▁',\n", + " 's',\n", + " '▁the',\n", + " ',',\n", + " '▁.',\n", + " '▁a',\n", + " 'e',\n", + " '▁to',\n", + " 'ed',\n", + " '▁of',\n", + " '▁in',\n", + " 'ing',\n", + " 'y',\n", + " '▁and',\n", + " 't',\n", + " 'm',\n", + " 'd',\n", + " '▁be',\n", + " 'at',\n", + " 'a',\n", + " 'n',\n", + " '▁th',\n", + " 'i',\n", + " '▁he',\n", + " 'p',\n", + " 'o',\n", + " 'en',\n", + " 'al',\n", + " '▁is',\n", + " '-',\n", + " 'ly',\n", + " '.',\n", + " '▁w',\n", + " 'c',\n", + " '▁for',\n", + " 'l',\n", + " '\"',\n", + " '▁it',\n", + " 'er',\n", + " '▁was',\n", + " 'u',\n", + " '▁on',\n", + " '▁with',\n", + " \"'\",\n", + " '▁his',\n", + " 'g',\n", + " 'h',\n", + " '▁had',\n", + " '▁mr',\n", + " 'in',\n", + " '▁i',\n", + " 'or',\n", + " '▁f',\n", + " 'r',\n", + " \"▁'\",\n", + " '▁but',\n", + " '▁by',\n", + " 'ar',\n", + " '▁as',\n", + " '▁not',\n", + " 'b',\n", + " 'it',\n", + " 'ir',\n", + " 'v',\n", + " '▁re',\n", + " 'st',\n", + " '▁an',\n", + " '▁has',\n", + " '▁are',\n", + " 'this',\n", + " 'f',\n", + " 'on',\n", + " 'hich',\n", + " 'ce',\n", + " '▁who',\n", + " '▁have',\n", + " 'ion',\n", + " 'ch',\n", + " '▁p',\n", + " 're',\n", + " '▁at',\n", + " '▁de',\n", + " '▁m',\n", + " 'w',\n", + " 'le',\n", + " 'ri',\n", + " '▁so',\n", + " '▁fro',\n", + " 'ation',\n", + " 'ic',\n", + " '▁do',\n", + " 'ould',\n", + " '▁will',\n", + " '▁b',\n", + " 'k',\n", + " 'is',\n", + " 'ment',\n", + " 'il',\n", + " '▁st',\n", + " '▁no',\n", + " 'an',\n", + " 'ter',\n", + " 'se',\n", + " '▁we',\n", + " 'ur',\n", + " 'ies',\n", + " '▁un',\n", + " '▁her',\n", + " 'th',\n", + " 'es',\n", + " 'ent',\n", + " 'et',\n", + " '▁-',\n", + " '▁all',\n", + " '▁c',\n", + " '▁there',\n", + " 'un',\n", + " 've',\n", + " '▁can',\n", + " 'ne',\n", + " '▁up',\n", + " '▁one',\n", + " '▁were',\n", + " '▁me',\n", + " '▁br',\n", + " '▁com',\n", + " '▁more',\n", + " '▁se',\n", + " 'ro',\n", + " '▁per',\n", + " 'ld',\n", + " 'ge',\n", + " '▁man',\n", + " 'est',\n", + " '▁pa',\n", + " 'ul',\n", + " 'el',\n", + " 'if',\n", + " '▁government',\n", + " '▁or',\n", + " '▁she',\n", + " 'ill',\n", + " '▁co',\n", + " '▁year',\n", + " 'ad',\n", + " '▁bo',\n", + " 'te',\n", + " '▁than',\n", + " '▁r',\n", + " 'll',\n", + " 'able',\n", + " 'ous',\n", + " '▁what',\n", + " '▁out',\n", + " 'ck',\n", + " '!',\n", + " '▁new',\n", + " '▁dis',\n", + " 'lo',\n", + " 'ra',\n", + " 'uch',\n", + " '▁him',\n", + " 'age',\n", + " 'ct',\n", + " '▁go',\n", + " '▁con',\n", + " 'day',\n", + " 'z',\n", + " 'op',\n", + " 'ther',\n", + " '▁you',\n", + " '▁dr',\n", + " 'us',\n", + " '▁1',\n", + " '▁may',\n", + " 'ers',\n", + " ':',\n", + " '▁first',\n", + " 'li',\n", + " '0',\n", + " 'end',\n", + " 'ng',\n", + " '▁people',\n", + " '▁some',\n", + " 'our',\n", + " '▁anglesey',\n", + " '▁two',\n", + " '▁wh',\n", + " 'tion',\n", + " '▁lo',\n", + " '▁sir',\n", + " 'ig',\n", + " 'ain',\n", + " 'tic',\n", + " '▁now',\n", + " 'ive',\n", + " 'and',\n", + " '▁whe',\n", + " ';',\n", + " 'ry',\n", + " 'um',\n", + " '▁other',\n", + " '▁su',\n", + " '▁ho',\n", + " '▁ha',\n", + " '▁ra',\n", + " '▁even',\n", + " 'ish',\n", + " 'ard',\n", + " 'wn',\n", + " '▁li',\n", + " 'ten',\n", + " 'ok',\n", + " '▁k',\n", + " 'ver',\n", + " 'into',\n", + " '▁na',\n", + " '▁vi',\n", + " '▁po',\n", + " '▁common',\n", + " 'ight',\n", + " '▁over',\n", + " '▁like',\n", + " '▁mad',\n", + " 'ud',\n", + " 'ow',\n", + " '▁any',\n", + " '▁again',\n", + " '▁great',\n", + " '▁last',\n", + " 'ough',\n", + " 'ose',\n", + " 'ant',\n", + " '▁under',\n", + " 'ful',\n", + " 'self',\n", + " '▁how',\n", + " '▁sai',\n", + " 'x',\n", + " '▁most',\n", + " 'ine',\n", + " '▁give',\n", + " '▁about',\n", + " 'ance',\n", + " 'me',\n", + " 'fore',\n", + " 'cu',\n", + " '(',\n", + " '▁minister',\n", + " '▁labour',\n", + " '▁talk',\n", + " '▁work',\n", + " '▁take',\n", + " '▁writ',\n", + " '▁whil',\n", + " '▁well',\n", + " '▁plan',\n", + " '▁la',\n", + " '▁pri',\n", + " '▁play',\n", + " '▁house',\n", + " 'mber',\n", + " '▁ex',\n", + " '▁good',\n", + " 'ern',\n", + " 'ple',\n", + " 'ble',\n", + " '▁german',\n", + " 'ak',\n", + " 'ever',\n", + " '▁time',\n", + " '▁pro',\n", + " '▁ga',\n", + " '▁germany',\n", + " '▁britain',\n", + " '▁should',\n", + " '▁polic',\n", + " '▁found',\n", + " '▁part',\n", + " 'ood',\n", + " '▁g',\n", + " '▁west',\n", + " 'thing',\n", + " '▁cur',\n", + " '▁know',\n", + " '▁back',\n", + " '▁ma',\n", + " '▁show',\n", + " ')',\n", + " 'ure',\n", + " 'ian',\n", + " 'app',\n", + " '▁ever',\n", + " '▁use',\n", + " 'ally',\n", + " 'led',\n", + " '▁miss',\n", + " '▁car',\n", + " '▁nation',\n", + " 'ttle',\n", + " '▁end',\n", + " '▁gr',\n", + " '▁must',\n", + " '▁tr',\n", + " '▁du',\n", + " '▁get',\n", + " '▁unit',\n", + " 'less',\n", + " '▁agree',\n", + " 'ie',\n", + " 'tom',\n", + " '..',\n", + " '▁di',\n", + " '▁men',\n", + " '4',\n", + " '▁my',\n", + " '▁today',\n", + " 'ves',\n", + " 'hap',\n", + " 'tory',\n", + " 'ence',\n", + " 'out',\n", + " '▁its',\n", + " '8',\n", + " '▁believe',\n", + " '▁say',\n", + " '▁make',\n", + " 'for',\n", + " '▁them',\n", + " '▁president',\n", + " '▁return',\n", + " '▁british',\n", + " '▁after',\n", + " '▁earl',\n", + " '▁here',\n", + " 'ton',\n", + " 'ext',\n", + " '▁life',\n", + " '▁put',\n", + " '▁war',\n", + " '▁2',\n", + " '▁cont',\n", + " '▁every',\n", + " '▁god',\n", + " '▁strong',\n", + " 'ren',\n", + " 'ty',\n", + " '▁mon',\n", + " 'man',\n", + " 'very',\n", + " '5',\n", + " '6',\n", + " '▁yester',\n", + " '▁week',\n", + " '▁film',\n", + " 'ual',\n", + " '▁need',\n", + " 'lso',\n", + " 'aid',\n", + " 'law',\n", + " 'ward',\n", + " '▁ask',\n", + " 'bo',\n", + " '▁pre',\n", + " '▁conference',\n", + " '▁real',\n", + " '▁roy',\n", + " 'ince',\n", + " '▁us',\n", + " 'ke',\n", + " '▁franc',\n", + " '▁direct',\n", + " '▁child',\n", + " 'cause',\n", + " '▁long',\n", + " '▁action',\n", + " '▁hope',\n", + " '▁live',\n", + " '▁see',\n", + " '▁point',\n", + " '▁char',\n", + " '▁way',\n", + " 'val',\n", + " '▁add',\n", + " '▁ba',\n", + " 'tt',\n", + " 'qui',\n", + " 'ism',\n", + " 'other',\n", + " '▁character',\n", + " '&',\n", + " '▁kennedy',\n", + " '▁macmillan',\n", + " '▁service',\n", + " '▁open',\n", + " '▁tri',\n", + " '▁country',\n", + " 'orn',\n", + " 'abl',\n", + " '▁off',\n", + " '▁fac',\n", + " '▁effective',\n", + " 'rote',\n", + " '▁let',\n", + " 'each',\n", + " '▁cent',\n", + " '▁party',\n", + " '▁act',\n", + " '▁prim',\n", + " '▁str',\n", + " 'amp',\n", + " '▁3',\n", + " '▁describe',\n", + " '▁europe',\n", + " '▁through',\n", + " '▁medic',\n", + " '▁home',\n", + " 'ug',\n", + " '▁african',\n", + " 'ind',\n", + " '▁prop',\n", + " 'by',\n", + " '▁thre',\n", + " 'ndon',\n", + " 'ative',\n", + " '▁sit',\n", + " ',000',\n", + " '▁high',\n", + " '▁secret',\n", + " '▁night',\n", + " '▁fr',\n", + " 'health',\n", + " '▁call',\n", + " '▁north',\n", + " '▁mean',\n", + " '▁seem',\n", + " '▁lea',\n", + " 'tire',\n", + " 'lway',\n", + " 'ness',\n", + " '▁rhodesia',\n", + " '▁195',\n", + " '▁hand',\n", + " '▁federa',\n", + " '▁place',\n", + " 'round',\n", + " '▁far',\n", + " 'ship',\n", + " 'low',\n", + " '▁exa',\n", + " 'ip',\n", + " '▁change',\n", + " '▁patient',\n", + " '▁lord',\n", + " '▁arriv',\n", + " '▁trem',\n", + " '▁pear',\n", + " 'ring',\n", + " '▁near',\n", + " 'land',\n", + " 'ress',\n", + " '▁clo',\n", + " '▁did',\n", + " 'later',\n", + " 'ius',\n", + " 'tho',\n", + " 'well',\n", + " '7',\n", + " '▁enjoy',\n", + " '▁success',\n", + " '▁appear',\n", + " '▁personal',\n", + " '▁move',\n", + " '▁treat',\n", + " '▁better',\n", + " 'case',\n", + " '▁never',\n", + " '▁look',\n", + " 'ture',\n", + " 'way',\n", + " 'mark',\n", + " 'ought',\n", + " 'tu',\n", + " '▁hu',\n", + " 'twe',\n", + " '▁foreign',\n", + " '▁number',\n", + " '▁fall',\n", + " '▁month',\n", + " '▁young',\n", + " 'ound',\n", + " '▁find',\n", + " '▁american',\n", + " '▁thought',\n", + " '▁state',\n", + " 'ilit',\n", + " '▁cit',\n", + " '▁side',\n", + " '▁centr',\n", + " 'lace',\n", + " 'ash',\n", + " '▁old',\n", + " 'rome',\n", + " '▁subject',\n", + " '▁support',\n", + " '▁world',\n", + " '▁word',\n", + " 'different',\n", + " '▁report',\n", + " '1',\n", + " '▁tru',\n", + " '▁sent',\n", + " '▁best',\n", + " '▁symp',\n", + " '▁just',\n", + " '▁richards',\n", + " '▁produc',\n", + " '▁left',\n", + " '▁clear',\n", + " '▁power',\n", + " 'ber',\n", + " '▁few',\n", + " 'solut',\n", + " 'tack',\n", + " '▁right',\n", + " '▁mar',\n", + " '▁might',\n", + " '3',\n", + " 'not',\n", + " '▁full',\n", + " '▁union',\n", + " '▁base',\n", + " '▁head',\n", + " 'ealth',\n", + " 'many',\n", + " '▁happen',\n", + " '▁mauro',\n", + " 'vision',\n", + " '▁inter',\n", + " '▁ye',\n", + " 'where',\n", + " '▁sail',\n", + " '▁past',\n", + " '▁your',\n", + " '▁want',\n", + " '▁stor',\n", + " 'went',\n", + " '▁eas',\n", + " '▁rem',\n", + " '▁nuclear',\n", + " '▁small',\n", + " '▁discuss',\n", + " 'spect',\n", + " '▁girl',\n", + " '▁view',\n", + " '▁south',\n", + " '▁mod',\n", + " 'elect',\n", + " '▁usual',\n", + " 'evi',\n", + " 'peration',\n", + " '▁engl',\n", + " 'train',\n", + " '▁mind',\n", + " 'acle',\n", + " 'riv',\n", + " 'side',\n", + " 'aris',\n", + " '▁event',\n", + " '▁lei',\n", + " '▁favour',\n", + " '▁problem',\n", + " '▁large',\n", + " 'ordina',\n", + " '▁feel',\n", + " 'search',\n", + " '▁wife',\n", + " '▁remain',\n", + " 'ready',\n", + " 'stone',\n", + " '▁cla',\n", + " '▁stand',\n", + " '▁big',\n", + " '▁bus',\n", + " 'ox',\n", + " '▁four',\n", + " 'port',\n", + " '▁main',\n", + " '▁nor',\n", + " '▁va',\n", + " 'mann',\n", + " '▁pot',\n", + " 'rry',\n", + " '▁fan',\n", + " '▁fig',\n", + " '▁attempt',\n", + " '▁follow',\n", + " '▁particular',\n", + " '▁church',\n", + " '▁wonder',\n", + " '▁doctor',\n", + " '▁control',\n", + " '▁half',\n", + " '▁politic',\n", + " 'ciple',\n", + " '▁trans',\n", + " '▁extra',\n", + " '▁start',\n", + " '▁death',\n", + " '▁tell',\n", + " '▁eye',\n", + " '▁own',\n", + " '▁allow',\n", + " '▁offer',\n", + " '▁quit',\n", + " '▁easi',\n", + " 'ular',\n", + " '▁che',\n", + " '▁cri',\n", + " 'bit',\n", + " '▁bar',\n", + " 'ption',\n", + " 'rate',\n", + " '▁system',\n", + " 'crochet',\n", + " '▁certain',\n", + " '▁taste',\n", + " '▁method',\n", + " '▁course',\n", + " '▁leader',\n", + " 'million',\n", + " '▁19',\n", + " '▁fu',\n", + " '▁help',\n", + " 'lady',\n", + " 'super',\n", + " 'date',\n", + " '▁fact',\n", + " '▁amo',\n", + " '▁sho',\n", + " 'nson',\n", + " 'lin',\n", + " '2',\n", + " '▁advi',\n", + " 'mitt',\n", + " '▁line',\n", + " 'effect',\n", + " '▁school',\n", + " '▁doubt',\n", + " '▁consult',\n", + " '▁answer',\n", + " 'special',\n", + " '▁burde',\n", + " '▁room',\n", + " '▁probabl',\n", + " '▁delight',\n", + " '▁bank',\n", + " '▁six',\n", + " '▁brown',\n", + " '▁studie',\n", + " '▁human',\n", + " '▁committee',\n", + " 'cial',\n", + " '▁creat',\n", + " '▁distin',\n", + " '▁cut',\n", + " '▁mother',\n", + " 'morr',\n", + " 'ground',\n", + " '▁free',\n", + " '▁five',\n", + " '▁arm',\n", + " '▁pol',\n", + " 'done',\n", + " '▁late',\n", + " '▁basi',\n", + " 'nato',\n", + " '9',\n", + " '▁econom',\n", + " '▁opposit',\n", + " '▁public',\n", + " '▁chief',\n", + " '▁become',\n", + " '▁record',\n", + " '▁peace',\n", + " '▁consider',\n", + " '▁understand',\n", + " '▁numer',\n", + " '▁slight',\n", + " '▁turn',\n", + " '▁duke',\n", + " '▁homo',\n", + " '▁grow',\n", + " '▁grant',\n", + " '▁office',\n", + " 'msel',\n", + " 'ainful',\n", + " '▁famil',\n", + " '▁ob',\n", + " '▁advise',\n", + " '▁cr',\n", + " '▁love',\n", + " 'ches',\n", + " '▁tea',\n", + " 'tist',\n", + " 'ivate',\n", + " '▁america',\n", + " 'aff',\n", + " 'author',\n", + " '▁gaitskell',\n", + " '▁industry',\n", + " '▁john',\n", + " '▁mistake',\n", + " '▁pleasant',\n", + " '▁temperature',\n", + " '▁difficult',\n", + " '▁brief',\n", + " '▁defence',\n", + " '▁gentle',\n", + " '▁piece',\n", + " '▁argu',\n", + " '▁irrita',\n", + " '▁berlin',\n", + " '▁father',\n", + " '▁cult',\n", + " '▁despa',\n", + " '▁market',\n", + " '▁white',\n", + " '▁negotiation',\n", + " '▁energ',\n", + " '▁fear',\n", + " '▁english',\n", + " '▁tele',\n", + " '▁interest',\n", + " 'same',\n", + " 'body',\n", + " '▁chap',\n", + " '▁table',\n", + " '▁mass',\n", + " '▁press',\n", + " '▁spi',\n", + " 'lay',\n", + " 'hall',\n", + " 'most',\n", + " '▁journ',\n", + " '▁diploma',\n", + " '▁expect',\n", + " '▁1960',\n", + " '▁join',\n", + " '▁provide',\n", + " '▁russell',\n", + " '▁speak',\n", + " '▁decide',\n", + " '▁complete',\n", + " '▁senator',\n", + " '▁tax',\n", + " '▁earth',\n", + " '▁colour',\n", + " 'armament',\n", + " '▁sevent',\n", + " '▁learn',\n", + " '▁honey',\n", + " '▁militar',\n", + " '▁claim',\n", + " '▁chamb',\n", + " '▁master',\n", + " '▁peel',\n", + " '▁firm',\n", + " 'eason',\n", + " '▁trade',\n", + " '▁washing',\n", + " '▁statement',\n", + " 'range',\n", + " '▁rol',\n", + " 'guis',\n", + " 'hei',\n", + " 'ago',\n", + " '▁pla',\n", + " '▁cam',\n", + " 'eopa',\n", + " '▁africa',\n", + " '▁convention',\n", + " '▁council',\n", + " '▁decision',\n", + " '▁experience',\n", + " '▁increasing',\n", + " '▁majorit',\n", + " '▁suffer',\n", + " '▁suggest',\n", + " '▁position',\n", + " '▁resource',\n", + " '▁soldier',\n", + " 'temper',\n", + " '▁general',\n", + " '▁financi',\n", + " '▁express',\n", + " '▁short',\n", + " '▁compan',\n", + " 'field',\n", + " '▁think',\n", + " 'cross',\n", + " '▁odd',\n", + " '▁1830',\n", + " '▁reason',\n", + " '▁hard',\n", + " '▁fright',\n", + " '▁final',\n", + " '▁thread',\n", + " 'heart',\n", + " '▁nea',\n", + " '▁assu',\n", + " '▁step',\n", + " '▁countr',\n", + " 'did',\n", + " '▁infa',\n", + " 'struct',\n", + " '▁question',\n", + " 'advantage',\n", + " '▁atmospher',\n", + " '▁screw',\n", + " '▁square',\n", + " '▁welensky',\n", + " '▁cinema',\n", + " '▁weapon',\n", + " '▁importan',\n", + " '▁apart',\n", + " '▁prepar',\n", + " '▁payment',\n", + " '▁present',\n", + " '▁persever',\n", + " 'simpl',\n", + " '▁idea',\n", + " '▁expl',\n", + " '▁chance',\n", + " '▁fast',\n", + " '▁notice',\n", + " 'rade',\n", + " '▁giv',\n", + " '▁leave',\n", + " 'finger',\n", + " 'eppe',\n", + " '▁gla',\n", + " '▁psycholog',\n", + " '▁robertson',\n", + " '▁accept',\n", + " '▁improv',\n", + " '▁philip',\n", + " '▁technical',\n", + " '▁trouble',\n", + " '▁premier',\n", + " '▁progress',\n", + " 'minute',\n", + " '▁scot',\n", + " '▁couple',\n", + " '▁concern',\n", + " '▁schem',\n", + " '▁weaver',\n", + " '▁inform',\n", + " '▁cap',\n", + " 'stead',\n", + " '▁heav',\n", + " '▁propos',\n", + " '▁order',\n", + " 'ensure',\n", + " 'worth',\n", + " '▁former',\n", + " '▁sense',\n", + " '▁clu',\n", + " 'nder',\n", + " 'easing',\n", + " '▁possib',\n", + " 'ulate',\n", + " 'polit',\n", + " 'equa',\n", + " 'communist',\n", + " '▁adenauer',\n", + " '▁delaney',\n", + " '▁demonstrat',\n", + " '▁independen',\n", + " '▁welcome',\n", + " '▁speech',\n", + " '▁includ',\n", + " '▁oppose',\n", + " 'timate',\n", + " '▁picture',\n", + " '▁share',\n", + " '▁result',\n", + " '▁receiv',\n", + " '▁garden',\n", + " '▁appeal',\n", + " '▁chair',\n", + " '▁keep',\n", + " '▁regard',\n", + " '▁chines',\n", + " '▁sister',\n", + " '▁prince',\n", + " '▁scene',\n", + " '▁spend',\n", + " '▁congo',\n", + " '▁issue',\n", + " '▁authorit',\n", + " '▁memor',\n", + " '▁scar',\n", + " '▁moral',\n", + " '▁matter',\n", + " 'aught',\n", + " '▁wait',\n", + " 'street',\n", + " '▁concept',\n", + " '▁destructi',\n", + " 'sol',\n", + " 'committe',\n", + " 'house',\n", + " 'older',\n", + " 'ference',\n", + " '▁adjust',\n", + " '▁associat',\n", + " '▁campaign',\n", + " '▁conservative',\n", + " '▁immediate',\n", + " '▁industri',\n", + " '▁james',\n", + " '▁lucas',\n", + " '▁practic',\n", + " '▁publish',\n", + " '▁purpose',\n", + " '▁queer',\n", + " '▁representative',\n", + " '▁relief',\n", + " '▁detail',\n", + " '▁official',\n", + " '▁faith',\n", + " '▁design',\n", + " '▁sacr',\n", + " '▁household',\n", + " '▁variet',\n", + " '▁village',\n", + " '▁foot',\n", + " '▁russia',\n", + " '▁fifty',\n", + " '▁block',\n", + " '▁inevitab',\n", + " '▁prevent',\n", + " '▁belgi',\n", + " '▁type',\n", + " '▁cheap',\n", + " '▁protest',\n", + " '▁along',\n", + " '▁demand',\n", + " '▁earn',\n", + " '▁sick',\n", + " '▁eviden',\n", + " '▁someone',\n", + " '▁peers',\n", + " '▁suitabl',\n", + " '▁sympath',\n", + " 'brid',\n", + " '▁discr',\n", + " 'flu',\n", + " 'bomb',\n", + " 'rator',\n", + " 'coming',\n", + " 'famili',\n", + " '▁183',\n", + " 'employ',\n", + " '▁catherine',\n", + " '▁correspond',\n", + " '▁material',\n", + " '▁rubb',\n", + " '▁measure',\n", + " '▁sharp',\n", + " '▁wicki',\n", + " '▁marriage',\n", + " '▁size',\n", + " '▁liberal',\n", + " '▁level',\n", + " '▁surround',\n", + " '▁delegat',\n", + " 'script',\n", + " '▁suppl',\n", + " '▁reserve',\n", + " '▁refus',\n", + " '▁gift',\n", + " '▁promis',\n", + " '▁safe',\n", + " '▁exist',\n", + " '▁alto',\n", + " '▁wide',\n", + " '▁arrang',\n", + " '▁relations',\n", + " 'shab',\n", + " '▁intelligen',\n", + " 'emphasi',\n", + " '▁advertisement',\n", + " '▁beautiful',\n", + " '+',\n", + " '?',\n", + " 'q',\n", + " 'j',\n", + " ...]" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "processor.tokens" + ] + }, + { + "cell_type": "code", "execution_count": 8, "id": "fb0afccf", "metadata": {}, @@ -262,6 +1361,1027 @@ }, { "cell_type": "code", + "execution_count": 14, + "id": "fb600959", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'!': ['▁', '!'],\n", + " '\"': ['▁', '\"'],\n", + " '&': ['▁', '&'],\n", + " \"'\": [\"▁'\"],\n", + " \"'30s\": [\"▁'\", '3', '0', 's'],\n", + " \"'61\": [\"▁'\", '6', '1'],\n", + " \"'d\": [\"▁'\", 'd'],\n", + " \"'ll\": [\"▁'\", 'll'],\n", + " \"'m\": [\"▁'\", 'm'],\n", + " \"'re\": [\"▁'\", 're'],\n", + " \"'s\": [\"▁'\", 's'],\n", + " \"'ve\": [\"▁'\", 've'],\n", + " '(': ['▁', '('],\n", + " ')': ['▁', ')'],\n", + " '+2.8': ['▁', '+', '2', '.', '8'],\n", + " '+3.6': ['▁', '+', '3', '.', '6'],\n", + " ',': ['▁', ','],\n", + " '-': ['▁-'],\n", + " '-2.6': ['▁-', '2', '.', '6'],\n", + " '-5.4': ['▁-', '5', '.', '4'],\n", + " '.': ['▁.'],\n", + " '...': ['▁.', '..'],\n", + " '0m': ['▁', '0', 'm'],\n", + " '1': ['▁1'],\n", + " '1,157': ['▁1', ',', '1', '5', '7'],\n", + " '1.8': ['▁1', '.', '8'],\n", + " '1/2': ['▁1', '/', '2'],\n", + " '1/2-in.-long': ['▁1', '/', '2', '-', 'in', '.', '-', 'lo', 'ng'],\n", + " '1/4': ['▁1', '/', '4'],\n", + " '10': ['▁1', '0'],\n", + " '10,000': ['▁1', '0', ',000'],\n", + " '100': ['▁1', '0', '0'],\n", + " '100,000,000': ['▁1', '0', '0', ',000', ',000'],\n", + " '11': ['▁1', '1'],\n", + " '12': ['▁1', '2'],\n", + " '125': ['▁1', '2', '5'],\n", + " '13': ['▁1', '3'],\n", + " '13,000': ['▁1', '3', ',000'],\n", + " '14': ['▁1', '4'],\n", + " '15': ['▁1', '5'],\n", + " '15,000,000': ['▁1', '5', ',000', ',000'],\n", + " '15-17': ['▁1', '5', '-', '1', '7'],\n", + " '15-year-olds': ['▁1', '5', '-', 'y', 'e', 'ar', '-', 'o', 'ld', 's'],\n", + " '150,000,000': ['▁1', '5', '0', ',000', ',000'],\n", + " '16': ['▁1', '6'],\n", + " '160': ['▁1', '6', '0'],\n", + " '163,000,000': ['▁1', '6', '3', ',000', ',000'],\n", + " '167': ['▁1', '6', '7'],\n", + " '17': ['▁1', '7'],\n", + " '18.1': ['▁1', '8', '.', '1'],\n", + " '1830': ['▁1830'],\n", + " \"1830's\": ['▁1830', \"'\", 's'],\n", + " '1834': ['▁183', '4'],\n", + " '1897': ['▁1', '8', '9', '7'],\n", + " '19': ['▁19'],\n", + " '19.5': ['▁19', '.', '5'],\n", + " '1910': ['▁19', '1', '0'],\n", + " '1913': ['▁19', '1', '3'],\n", + " '1914': ['▁19', '1', '4'],\n", + " '1914-18': ['▁19', '1', '4', '-', '1', '8'],\n", + " '1920': ['▁19', '2', '0'],\n", + " '1931': ['▁19', '3', '1'],\n", + " '1932': ['▁19', '3', '2'],\n", + " '1934': ['▁19', '3', '4'],\n", + " '1939': ['▁19', '3', '9'],\n", + " '1943': ['▁19', '4', '3'],\n", + " '1944': ['▁19', '4', '4'],\n", + " '1950': ['▁195', '0'],\n", + " '1951': ['▁195', '1'],\n", + " '1954': ['▁195', '4'],\n", + " '1956': ['▁195', '6'],\n", + " '1957': ['▁195', '7'],\n", + " '1958': ['▁195', '8'],\n", + " '1959': ['▁195', '9'],\n", + " '1960': ['▁1960'],\n", + " '1960s': ['▁1960', 's'],\n", + " '1961': ['▁19', '6', '1'],\n", + " '1963': ['▁19', '6', '3'],\n", + " '19th': ['▁19', 'th'],\n", + " '1superceded': ['▁1', 'super', 'c', 'ed', 'ed'],\n", + " \"1tho'\": ['▁1', 'tho', \"'\"],\n", + " '2': ['▁2'],\n", + " '2,000': ['▁2', ',000'],\n", + " '2,415,000,000': ['▁2', ',', '4', '1', '5', ',000', ',000'],\n", + " '20': ['▁2', '0'],\n", + " '20-month-old': ['▁2', '0', '-', 'm', 'on', 'th', '-', 'o', 'ld'],\n", + " '200': ['▁2', '0', '0'],\n", + " '20th-century': ['▁2', '0', 'th', '-', 'c', 'ent', 'ur', 'y'],\n", + " '210million': ['▁2', '1', '0', 'million'],\n", + " '22': ['▁2', '2'],\n", + " '23.1': ['▁2', '3', '.', '1'],\n", + " '24-strong': ['▁2', '4', '-', 'st', 'ro', 'ng'],\n", + " '25': ['▁2', '5'],\n", + " '27': ['▁2', '7'],\n", + " '28.5': ['▁2', '8', '.', '5'],\n", + " '280,000': ['▁2', '8', '0', ',000'],\n", + " '287': ['▁2', '8', '7'],\n", + " '288': ['▁2', '8', '8'],\n", + " '2bhoys': ['▁2', 'b', 'h', 'o', 'y', 's'],\n", + " '2ole': ['▁2', 'o', 'le'],\n", + " '2skint': ['▁2', 's', 'k', 'in', 't'],\n", + " '3': ['▁3'],\n", + " '3,000': ['▁3', ',000'],\n", + " '3.6': ['▁3', '.', '6'],\n", + " '3/0': ['▁3', '/', '0'],\n", + " '30': ['▁3', '0'],\n", + " '30-minute': ['▁3', '0', '-', 'minute'],\n", + " '300,000': ['▁3', '0', '0', ',000'],\n", + " '32': ['▁3', '2'],\n", + " '34': ['▁3', '4'],\n", + " '35': ['▁3', '5'],\n", + " '357million': ['▁3', '5', '7', 'million'],\n", + " '36': ['▁3', '6'],\n", + " '37,000,000': ['▁3', '7', ',000', ',000'],\n", + " '37.2': ['▁3', '7', '.', '2'],\n", + " '38': ['▁3', '8'],\n", + " '4': ['▁', '4'],\n", + " '4.8': ['▁', '4', '.', '8'],\n", + " '40': ['▁', '4', '0'],\n", + " '400': ['▁', '4', '0', '0'],\n", + " '400,000': ['▁', '4', '0', '0', ',000'],\n", + " '43': ['▁', '4', '3'],\n", + " '450': ['▁', '4', '5', '0'],\n", + " '5': ['▁', '5'],\n", + " '5,000': ['▁', '5', ',000'],\n", + " '5.30': ['▁', '5', '.', '3', '0'],\n", + " '5/8': ['▁', '5', '/', '8'],\n", + " '50,000': ['▁', '5', '0', ',000'],\n", + " '500': ['▁', '5', '0', '0'],\n", + " '550,000': ['▁', '5', '5', '0', ',000'],\n", + " '58': ['▁', '5', '8'],\n", + " '6': ['▁', '6'],\n", + " '6,000': ['▁', '6', ',000'],\n", + " '60': ['▁', '6', '0'],\n", + " '600': ['▁', '6', '0', '0'],\n", + " '61-year-old': ['▁', '6', '1', '-', 'y', 'e', 'ar', '-', 'o', 'ld'],\n", + " '6al': ['▁', '6', 'al'],\n", + " '6tic': ['▁', '6', 'tic'],\n", + " '7.42': ['▁', '7', '.', '4', '2'],\n", + " '70': ['▁', '7', '0'],\n", + " '70,000,000': ['▁', '7', '0', ',000', ',000'],\n", + " '707': ['▁', '7', '0', '7'],\n", + " '750': ['▁', '7', '5', '0'],\n", + " '8': ['▁', '8'],\n", + " '80': ['▁', '8', '0'],\n", + " '800': ['▁', '8', '0', '0'],\n", + " '800,000': ['▁', '8', '0', '0', ',000'],\n", + " '86': ['▁', '8', '6'],\n", + " '88': ['▁', '8', '8'],\n", + " '88-year-old': ['▁', '8', '8', '-', 'y', 'e', 'ar', '-', 'o', 'ld'],\n", + " '89': ['▁', '8', '9'],\n", + " '89-year-old': ['▁', '8', '9', '-', 'y', 'e', 'ar', '-', 'o', 'ld'],\n", + " '90-day': ['▁', '9', '0', '-', 'day'],\n", + " '90-minute': ['▁', '9', '0', '-', 'minute'],\n", + " '91': ['▁', '9', '1'],\n", + " '950': ['▁', '9', '5', '0'],\n", + " '97.5': ['▁', '9', '7', '.', '5'],\n", + " ':': ['▁', ':'],\n", + " ';': ['▁', ';'],\n", + " '?': ['▁', '?'],\n", + " 'a': ['▁a'],\n", + " 'abandon': ['▁a', 'b', 'a', 'ndon'],\n", + " 'abandoned': ['▁a', 'b', 'a', 'ndon', 'ed'],\n", + " 'abashed': ['▁a', 'b', 'ash', 'ed'],\n", + " 'ability': ['▁a', 'b', 'ilit', 'y'],\n", + " 'able': ['▁', 'able'],\n", + " 'able-bodied': ['▁', 'able', '-', 'bo', 'd', 'i', 'ed'],\n", + " 'abolish': ['▁a', 'bo', 'l', 'ish'],\n", + " 'abolished': ['▁a', 'bo', 'l', 'ish', 'ed'],\n", + " 'abolition': ['▁a', 'bo', 'li', 'tion'],\n", + " 'abou': ['▁a', 'bo', 'u'],\n", + " 'about': ['▁about'],\n", + " 'about-': ['▁about', '-'],\n", + " 'above': ['▁a', 'bo', 've'],\n", + " 'abreast': ['▁a', 'b', 're', 'a', 'st'],\n", + " 'abroad': ['▁a', 'b', 'ro', 'ad'],\n", + " 'absence': ['▁a', 'b', 's', 'ence'],\n", + " 'absent': ['▁a', 'b', 's', 'ent'],\n", + " 'absolutely': ['▁a', 'b', 'solut', 'e', 'ly'],\n", + " 'abstraction': ['▁a', 'b', 'st', 'ra', 'c', 'tion'],\n", + " 'abundance': ['▁a', 'b', 'un', 'd', 'ance'],\n", + " 'ac-': ['▁a', 'c', '-'],\n", + " 'academic': ['▁a', 'c', 'ad', 'e', 'm', 'ic'],\n", + " 'accent': ['▁a', 'c', 'c', 'ent'],\n", + " 'accents': ['▁a', 'c', 'c', 'ent', 's'],\n", + " 'accept': ['▁accept'],\n", + " 'acceptable': ['▁accept', 'able'],\n", + " 'accepted': ['▁accept', 'ed'],\n", + " 'accepting': ['▁accept', 'ing'],\n", + " 'accessories': ['▁a', 'c', 'ce', 's', 's', 'or', 'ies'],\n", + " 'accident': ['▁a', 'c', 'c', 'i', 'd', 'ent'],\n", + " 'accidental': ['▁a', 'c', 'c', 'i', 'd', 'ent', 'al'],\n", + " 'accommodate': ['▁a', 'c', 'c', 'o', 'm', 'm', 'o', 'date'],\n", + " 'accommodation': ['▁a', 'c', 'c', 'o', 'm', 'm', 'o', 'd', 'ation'],\n", + " 'accompanied': ['▁a', 'c', 'c', 'o', 'm', 'p', 'an', 'i', 'ed'],\n", + " 'accompanist': ['▁a', 'c', 'c', 'o', 'm', 'p', 'an', 'is', 't'],\n", + " 'accompany': ['▁a', 'c', 'c', 'o', 'm', 'p', 'an', 'y'],\n", + " 'accomplished': ['▁a', 'c', 'c', 'o', 'm', 'p', 'l', 'ish', 'ed'],\n", + " 'accomplishments': ['▁a', 'c', 'c', 'o', 'm', 'p', 'l', 'ish', 'ment', 's'],\n", + " 'according': ['▁a', 'c', 'c', 'or', 'd', 'ing'],\n", + " 'account': ['▁a', 'c', 'c', 'o', 'un', 't'],\n", + " 'accountancy': ['▁a', 'c', 'c', 'o', 'un', 't', 'an', 'c', 'y'],\n", + " 'accra': ['▁a', 'c', 'c', 'ra'],\n", + " \"accra's\": ['▁a', 'c', 'c', 'ra', \"'\", 's'],\n", + " 'accuracy': ['▁a', 'c', 'cu', 'ra', 'c', 'y'],\n", + " 'accurate': ['▁a', 'c', 'cu', 'rate'],\n", + " 'accurately': ['▁a', 'c', 'cu', 'rate', 'ly'],\n", + " 'accused': ['▁a', 'c', 'cu', 's', 'ed'],\n", + " 'achieved': ['▁a', 'ch', 'ie', 'v', 'ed'],\n", + " 'achievement': ['▁a', 'ch', 'ie', 've', 'ment'],\n", + " 'acquaintance': ['▁a', 'c', 'q', 'u', 'ain', 't', 'ance'],\n", + " 'acquaintances': ['▁a', 'c', 'q', 'u', 'ain', 't', 'ance', 's'],\n", + " 'acres': ['▁a', 'c', 're', 's'],\n", + " 'across': ['▁a', 'cross'],\n", + " 'act': ['▁act'],\n", + " 'acting': ['▁act', 'ing'],\n", + " 'action': ['▁action'],\n", + " 'actions': ['▁action', 's'],\n", + " 'active': ['▁act', 'ive'],\n", + " 'activists': ['▁act', 'i', 'v', 'is', 't', 's'],\n", + " 'activities': ['▁act', 'i', 'v', 'it', 'ies'],\n", + " 'activity': ['▁act', 'i', 'v', 'it', 'y'],\n", + " 'acton': ['▁act', 'on'],\n", + " 'actor': ['▁act', 'or'],\n", + " 'actress': ['▁act', 'ress'],\n", + " 'acts': ['▁act', 's'],\n", + " 'actual': ['▁act', 'ual'],\n", + " 'actually': ['▁act', 'ual', 'ly'],\n", + " 'adamafio': ['▁a', 'd', 'a', 'm', 'a', 'f', 'i', 'o'],\n", + " 'adaptation': ['▁a', 'd', 'a', 'p', 't', 'ation'],\n", + " 'adapted': ['▁a', 'd', 'a', 'p', 't', 'ed'],\n", + " 'adapting': ['▁a', 'd', 'a', 'p', 't', 'ing'],\n", + " 'add': ['▁add'],\n", + " 'added': ['▁add', 'ed'],\n", + " 'adding': ['▁add', 'ing'],\n", + " 'addition': ['▁add', 'it', 'ion'],\n", + " 'additions': ['▁add', 'it', 'ion', 's'],\n", + " 'address': ['▁add', 'ress'],\n", + " 'addressed': ['▁add', 'ress', 'ed'],\n", + " 'addresses': ['▁add', 'ress', 'es'],\n", + " 'addressing': ['▁add', 'ress', 'ing'],\n", + " 'adenauer': ['▁adenauer'],\n", + " \"adenauer's\": ['▁adenauer', \"'\", 's'],\n", + " 'adequate': ['▁a', 'd', 'equa', 'te'],\n", + " 'adhem': ['▁a', 'd', 'h', 'e', 'm'],\n", + " 'adjust': ['▁adjust'],\n", + " 'adjustment': ['▁adjust', 'ment'],\n", + " 'administration': ['▁a', 'd', 'm', 'in', 'is', 't', 'r', 'ation'],\n", + " \"administration's\": ['▁a', 'd', 'm', 'in', 'is', 't', 'r', 'ation', \"'\", 's'],\n", + " 'administrative': ['▁a', 'd', 'm', 'in', 'is', 't', 'r', 'ative'],\n", + " 'admiralty': ['▁a', 'd', 'm', 'ir', 'al', 'ty'],\n", + " 'admire': ['▁a', 'd', 'm', 'ir', 'e'],\n", + " 'admit': ['▁a', 'd', 'm', 'it'],\n", + " 'admitted': ['▁a', 'd', 'mitt', 'ed'],\n", + " 'admitting': ['▁a', 'd', 'mitt', 'ing'],\n", + " 'adopted': ['▁a', 'd', 'op', 't', 'ed'],\n", + " 'adoption': ['▁a', 'd', 'o', 'ption'],\n", + " 'adult': ['▁a', 'd', 'ul', 't'],\n", + " 'advance': ['▁a', 'd', 'v', 'ance'],\n", + " 'advanced': ['▁a', 'd', 'v', 'ance', 'd'],\n", + " 'advancing': ['▁a', 'd', 'v', 'an', 'c', 'ing'],\n", + " 'advantage': ['▁', 'advantage'],\n", + " 'advantages': ['▁', 'advantage', 's'],\n", + " 'advertisement': ['▁advertisement'],\n", + " 'advertisements': ['▁advertisement', 's'],\n", + " 'advice': ['▁advi', 'ce'],\n", + " 'advisability': ['▁advi', 's', 'a', 'b', 'ilit', 'y'],\n", + " 'advise': ['▁advise'],\n", + " 'advised': ['▁advise', 'd'],\n", + " 'advisers': ['▁advise', 'r', 's'],\n", + " 'advocate': ['▁a', 'd', 'v', 'o', 'c', 'at', 'e'],\n", + " 'af-': ['▁a', 'f', '-'],\n", + " 'affairs': ['▁', 'aff', 'a', 'ir', 's'],\n", + " 'affected': ['▁', 'aff', 'e', 'ct', 'ed'],\n", + " 'affection': ['▁', 'aff', 'e', 'c', 'tion'],\n", + " 'affilia-': ['▁', 'aff', 'il', 'i', 'a', '-'],\n", + " 'affiliations': ['▁', 'aff', 'il', 'i', 'ation', 's'],\n", + " 'affluence': ['▁a', 'f', 'flu', 'ence'],\n", + " 'affluent': ['▁a', 'f', 'flu', 'ent'],\n", + " 'afraid': ['▁a', 'f', 'r', 'aid'],\n", + " 'africa': ['▁africa'],\n", + " \"africa's\": ['▁africa', \"'\", 's'],\n", + " 'african': ['▁african'],\n", + " 'africans': ['▁african', 's'],\n", + " 'after': ['▁after'],\n", + " 'afternoon': ['▁after', 'n', 'o', 'on'],\n", + " 'afterwards': ['▁after', 'ward', 's'],\n", + " 'again': ['▁again'],\n", + " 'against': ['▁again', 'st'],\n", + " 'age': ['▁', 'age'],\n", + " 'age-structure': ['▁', 'age', '-', 'struct', 'ure'],\n", + " 'aged': ['▁a', 'g', 'ed'],\n", + " 'ageing': ['▁', 'age', 'ing'],\n", + " 'agent': ['▁a', 'g', 'ent'],\n", + " 'agents': ['▁a', 'g', 'ent', 's'],\n", + " 'ages': ['▁', 'age', 's'],\n", + " 'agitation': ['▁a', 'g', 'it', 'ation'],\n", + " 'ago': ['▁', 'ago'],\n", + " 'agree': ['▁agree'],\n", + " 'agreed': ['▁agree', 'd'],\n", + " 'agreement': ['▁agree', 'ment'],\n", + " 'agreements': ['▁agree', 'ment', 's'],\n", + " 'agriculture': ['▁a', 'g', 'ri', 'c', 'ul', 'ture'],\n", + " 'ahead': ['▁a', 'h', 'e', 'ad'],\n", + " 'aid': ['▁', 'aid'],\n", + " 'aide': ['▁', 'aid', 'e'],\n", + " 'aided': ['▁', 'aid', 'ed'],\n", + " 'aides': ['▁', 'aid', 'es'],\n", + " 'aim': ['▁a', 'i', 'm'],\n", + " 'aimed': ['▁a', 'i', 'm', 'ed'],\n", + " 'aiming': ['▁a', 'i', 'm', 'ing'],\n", + " 'air': ['▁a', 'ir'],\n", + " 'aircraft': ['▁a', 'ir', 'c', 'ra', 'f', 't'],\n", + " 'aired': ['▁a', 'ir', 'ed'],\n", + " \"airliner's\": ['▁a', 'ir', 'lin', 'er', \"'\", 's'],\n", + " 'airmen': ['▁a', 'ir', 'm', 'en'],\n", + " 'airport': ['▁a', 'ir', 'port'],\n", + " 'akin': ['▁', 'ak', 'in'],\n", + " \"aladdin's\": ['▁', 'al', 'ad', 'd', 'in', \"'\", 's'],\n", + " 'alan': ['▁', 'al', 'an'],\n", + " 'alarmed': ['▁', 'al', 'ar', 'm', 'ed'],\n", + " 'alas': ['▁', 'al', 'a', 's'],\n", + " 'alcoholic': ['▁', 'al', 'c', 'o', 'h', 'o', 'l', 'ic'],\n", + " 'algeria': ['▁', 'al', 'ge', 'ri', 'a'],\n", + " 'alike': ['▁a', 'li', 'ke'],\n", + " 'alive': ['▁', 'al', 'ive'],\n", + " 'all': ['▁all'],\n", + " 'all-regular': ['▁all', '-', 're', 'g', 'ular'],\n", + " 'alleged': ['▁all', 'e', 'g', 'ed'],\n", + " 'allen': ['▁all', 'en'],\n", + " 'alleviation': ['▁all', 'evi', 'ation'],\n", + " 'alley': ['▁all', 'e', 'y'],\n", + " 'alliance': ['▁all', 'i', 'ance'],\n", + " 'alliances': ['▁all', 'i', 'ance', 's'],\n", + " 'allied': ['▁all', 'i', 'ed'],\n", + " 'allies': ['▁all', 'ies'],\n", + " 'allow': ['▁allow'],\n", + " 'allowance': ['▁allow', 'ance'],\n", + " 'allowances': ['▁allow', 'ance', 's'],\n", + " 'allowed': ['▁allow', 'ed'],\n", + " 'allowing': ['▁allow', 'ing'],\n", + " 'ally': ['▁', 'ally'],\n", + " 'almost': ['▁', 'al', 'most'],\n", + " 'alone': ['▁a', 'lo', 'ne'],\n", + " 'along': ['▁along'],\n", + " 'alongside': ['▁along', 'side'],\n", + " 'already': ['▁', 'al', 'ready'],\n", + " 'also': ['▁a', 'lso'],\n", + " 'alter': ['▁', 'al', 'ter'],\n", + " 'alternative': ['▁', 'al', 'ter', 'n', 'ative'],\n", + " 'alternatively': ['▁', 'al', 'ter', 'n', 'ative', 'ly'],\n", + " 'alternatives': ['▁', 'al', 'ter', 'n', 'ative', 's'],\n", + " 'although': ['▁', 'al', 'th', 'ough'],\n", + " 'altogether': ['▁alto', 'ge', 'ther'],\n", + " 'altos': ['▁alto', 's'],\n", + " 'always': ['▁a', 'lway', 's'],\n", + " 'am': ['▁a', 'm'],\n", + " 'amateur': ['▁a', 'm', 'at', 'e', 'ur'],\n", + " 'amazed': ['▁a', 'm', 'a', 'z', 'ed'],\n", + " 'amazing': ['▁a', 'm', 'a', 'z', 'ing'],\n", + " 'ambassador': ['▁a', 'm', 'b', 'a', 's', 's', 'ad', 'or'],\n", + " 'amber': ['▁a', 'mber'],\n", + " 'ambition': ['▁a', 'm', 'bit', 'ion'],\n", + " 'ambitious': ['▁a', 'm', 'bit', 'i', 'ous'],\n", + " 'ambulance': ['▁a', 'm', 'b', 'ul', 'ance'],\n", + " 'ambulances': ['▁a', 'm', 'b', 'ul', 'ance', 's'],\n", + " 'america': ['▁america'],\n", + " \"america's\": ['▁america', \"'\", 's'],\n", + " 'american': ['▁american'],\n", + " 'american-born': ['▁american', '-', 'b', 'orn'],\n", + " 'americans': ['▁american', 's'],\n", + " 'ammunition': ['▁a', 'm', 'm', 'un', 'it', 'ion'],\n", + " 'among': ['▁amo', 'ng'],\n", + " 'amount': ['▁amo', 'un', 't'],\n", + " 'ample': ['▁a', 'm', 'ple'],\n", + " 'amusement': ['▁a', 'm', 'us', 'e', 'ment'],\n", + " 'amusing': ['▁a', 'm', 'us', 'ing'],\n", + " 'an': ['▁an'],\n", + " 'analogy': ['▁an', 'al', 'o', 'g', 'y'],\n", + " 'analysed': ['▁an', 'al', 'y', 's', 'ed'],\n", + " 'anchor': ['▁an', 'ch', 'or'],\n", + " 'ancient': ['▁an', 'c', 'i', 'ent'],\n", + " 'and': ['▁and'],\n", + " 'andrei': ['▁and', 're', 'i'],\n", + " 'andrew': ['▁and', 're', 'w'],\n", + " 'anecdotal': ['▁an', 'e', 'c', 'd', 'o', 't', 'al'],\n", + " 'angel': ['▁a', 'ng', 'el'],\n", + " 'angeles': ['▁an', 'ge', 'le', 's'],\n", + " 'angelo': ['▁an', 'ge', 'lo'],\n", + " 'anger': ['▁a', 'ng', 'er'],\n", + " 'anglais': ['▁a', 'ng', 'l', 'a', 'is'],\n", + " 'angle': ['▁a', 'ng', 'le'],\n", + " 'anglesey': ['▁anglesey'],\n", + " \"anglesey's\": ['▁anglesey', \"'\", 's'],\n", + " 'anglesey-road': ['▁anglesey', '-', 'ro', 'ad'],\n", + " 'angola': ['▁a', 'ng', 'o', 'l', 'a'],\n", + " 'angry': ['▁a', 'ng', 'ry'],\n", + " 'ann': ['▁an', 'n'],\n", + " 'anna': ['▁an', 'n', 'a'],\n", + " 'announced': ['▁an', 'n', 'o', 'un', 'c', 'ed'],\n", + " 'announcement': ['▁an', 'n', 'o', 'un', 'ce', 'ment'],\n", + " 'announcing': ['▁an', 'n', 'o', 'un', 'c', 'ing'],\n", + " 'annual': ['▁an', 'n', 'ual'],\n", + " 'another': ['▁an', 'other'],\n", + " 'answer': ['▁answer'],\n", + " 'answered': ['▁answer', 'ed'],\n", + " 'answering': ['▁answer', 'ing'],\n", + " 'antagonism': ['▁', 'ant', 'ago', 'n', 'ism'],\n", + " 'anthony': ['▁an', 'th', 'on', 'y'],\n", + " 'anti-apartheid': ['▁', 'ant', 'i', '-', 'a', 'p', 'ar', 't', 'hei', 'd'],\n", + " 'anti-bomb': ['▁', 'ant', 'i', '-', 'bomb'],\n", + " 'anti-german': ['▁', 'ant', 'i', '-', 'g', 'er', 'man'],\n", + " 'anti-nato': ['▁', 'ant', 'i', '-', 'nato'],\n", + " 'anti-negro': ['▁', 'ant', 'i', '-', 'ne', 'g', 'ro'],\n", + " 'anti-nuclear': ['▁', 'ant', 'i', '-', 'n', 'u', 'c', 'le', 'ar'],\n", + " 'anti-soviet': ['▁', 'ant', 'i', '-', 's', 'o', 'v', 'i', 'et'],\n", + " 'anti-tory': ['▁', 'ant', 'i', '-', 'tory'],\n", + " 'anticipation': ['▁an', 'tic', 'ip', 'ation'],\n", + " 'antonioni': ['▁an', 'ton', 'ion', 'i'],\n", + " \"antonioni's\": ['▁an', 'ton', 'ion', 'i', \"'\", 's'],\n", + " 'any': ['▁any'],\n", + " 'any-': ['▁any', '-'],\n", + " 'anybody': ['▁any', 'body'],\n", + " \"anybody's\": ['▁any', 'body', \"'\", 's'],\n", + " 'anyone': ['▁any', 'on', 'e'],\n", + " 'anything': ['▁any', 'thing'],\n", + " 'apart': ['▁apart'],\n", + " 'apartheid': ['▁apart', 'hei', 'd'],\n", + " 'apathetic': ['▁a', 'p', 'at', 'h', 'e', 'tic'],\n", + " 'apathy': ['▁a', 'p', 'at', 'h', 'y'],\n", + " 'apex': ['▁a', 'p', 'e', 'x'],\n", + " 'apocalypse': ['▁a', 'p', 'o', 'c', 'al', 'y', 'p', 'se'],\n", + " 'apologising': ['▁a', 'p', 'o', 'lo', 'g', 'is', 'ing'],\n", + " 'appalled': ['▁', 'app', 'al', 'led'],\n", + " 'appalling': ['▁', 'app', 'al', 'l', 'ing'],\n", + " 'apparatus': ['▁', 'app', 'ar', 'at', 'us'],\n", + " 'apparent': ['▁', 'app', 'ar', 'ent'],\n", + " 'apparently': ['▁', 'app', 'ar', 'ent', 'ly'],\n", + " 'appeal': ['▁appeal'],\n", + " 'appealing': ['▁appeal', 'ing'],\n", + " 'appeals': ['▁appeal', 's'],\n", + " 'appear': ['▁appear'],\n", + " 'appearance': ['▁appear', 'ance'],\n", + " 'appeared': ['▁appear', 'ed'],\n", + " 'appears': ['▁appear', 's'],\n", + " 'appeasement': ['▁', 'app', 'e', 'a', 'se', 'ment'],\n", + " 'applauding': ['▁', 'app', 'l', 'a', 'ud', 'ing'],\n", + " 'appliances': ['▁', 'app', 'li', 'ance', 's'],\n", + " 'application': ['▁', 'app', 'l', 'ic', 'ation'],\n", + " 'applications': ['▁', 'app', 'l', 'ic', 'ation', 's'],\n", + " 'applied': ['▁', 'app', 'li', 'ed'],\n", + " 'apply': ['▁', 'app', 'ly'],\n", + " 'appointed': ['▁', 'app', 'o', 'in', 't', 'ed'],\n", + " 'appointment': ['▁', 'app', 'o', 'in', 't', 'ment'],\n", + " 'appreciable': ['▁', 'app', 're', 'c', 'i', 'able'],\n", + " 'appreciably': ['▁', 'app', 're', 'c', 'i', 'abl', 'y'],\n", + " 'appreciated': ['▁', 'app', 're', 'c', 'i', 'at', 'ed'],\n", + " 'apprenticeships': ['▁', 'app', 'ren', 'tic', 'e', 'ship', 's'],\n", + " 'approach': ['▁', 'app', 'ro', 'a', 'ch'],\n", + " 'approached': ['▁', 'app', 'ro', 'a', 'ch', 'ed'],\n", + " 'approaches': ['▁', 'app', 'ro', 'a', 'ches'],\n", + " 'appropriate': ['▁', 'app', 'ro', 'p', 'ri', 'at', 'e'],\n", + " 'appropriated': ['▁', 'app', 'ro', 'p', 'ri', 'at', 'ed'],\n", + " 'approval': ['▁', 'app', 'ro', 'val'],\n", + " 'approximately': ['▁', 'app', 'ro', 'x', 'i', 'm', 'at', 'e', 'ly'],\n", + " 'april': ['▁a', 'p', 'ri', 'l'],\n", + " 'archbishop': ['▁', 'ar', 'ch', 'b', 'ish', 'op'],\n", + " 'arches': ['▁', 'ar', 'ches'],\n", + " 'archipelago': ['▁', 'ar', 'ch', 'ip', 'el', 'ago'],\n", + " 'architect': ['▁', 'ar', 'ch', 'it', 'e', 'ct'],\n", + " 'architecture': ['▁', 'ar', 'ch', 'it', 'e', 'c', 'ture'],\n", + " 'are': ['▁are'],\n", + " 'area': ['▁are', 'a'],\n", + " 'areas': ['▁are', 'a', 's'],\n", + " 'arguably': ['▁argu', 'abl', 'y'],\n", + " 'argued': ['▁argu', 'ed'],\n", + " 'argues': ['▁argu', 'es'],\n", + " 'arguing': ['▁argu', 'ing'],\n", + " 'argument': ['▁argu', 'ment'],\n", + " 'arguments': ['▁argu', 'ment', 's'],\n", + " 'arise': ['▁', 'aris', 'e'],\n", + " 'arises': ['▁', 'aris', 'es'],\n", + " 'arm': ['▁arm'],\n", + " 'armament': ['▁', 'armament'],\n", + " 'armaments': ['▁', 'armament', 's'],\n", + " 'armed': ['▁arm', 'ed'],\n", + " 'armoured': ['▁arm', 'our', 'ed'],\n", + " 'arms': ['▁arm', 's'],\n", + " \"arms'\": ['▁arm', 's', \"'\"],\n", + " 'army': ['▁arm', 'y'],\n", + " 'arose': ['▁', 'ar', 'ose'],\n", + " 'around': ['▁a', 'round'],\n", + " 'aroused': ['▁', 'ar', 'ous', 'ed'],\n", + " 'arrange': ['▁arrang', 'e'],\n", + " 'arranged': ['▁arrang', 'ed'],\n", + " 'arrangement': ['▁arrang', 'e', 'ment'],\n", + " 'arrangements': ['▁arrang', 'e', 'ment', 's'],\n", + " 'arranging': ['▁arrang', 'ing'],\n", + " 'arrears': ['▁', 'ar', 're', 'ar', 's'],\n", + " 'arrested': ['▁', 'ar', 're', 'st', 'ed'],\n", + " 'arrival': ['▁arriv', 'al'],\n", + " 'arrive': ['▁arriv', 'e'],\n", + " 'arrived': ['▁arriv', 'ed'],\n", + " 'arrives': ['▁arriv', 'es'],\n", + " 'arrogant': ['▁', 'ar', 'ro', 'g', 'ant'],\n", + " 'art': ['▁', 'ar', 't'],\n", + " 'arthur': ['▁', 'ar', 'th', 'ur'],\n", + " 'article': ['▁', 'ar', 'tic', 'le'],\n", + " 'articles': ['▁', 'ar', 'tic', 'le', 's'],\n", + " 'articulation': ['▁', 'ar', 'tic', 'ul', 'ation'],\n", + " 'artistic': ['▁', 'ar', 'tist', 'ic'],\n", + " 'artists': ['▁', 'ar', 'tist', 's'],\n", + " 'as': ['▁as'],\n", + " 'ascents': ['▁as', 'c', 'ent', 's'],\n", + " 'ash': ['▁', 'ash'],\n", + " 'ashen': ['▁', 'ash', 'en'],\n", + " 'ask': ['▁ask'],\n", + " 'asked': ['▁ask', 'ed'],\n", + " 'asking': ['▁ask', 'ing'],\n", + " 'aspect': ['▁a', 'spect'],\n", + " 'aspects': ['▁a', 'spect', 's'],\n", + " 'assault': ['▁as', 's', 'a', 'ul', 't'],\n", + " 'assembler': ['▁as', 'se', 'm', 'ble', 'r'],\n", + " 'assembly': ['▁as', 'se', 'm', 'b', 'ly'],\n", + " 'assess': ['▁as', 'se', 's', 's'],\n", + " 'assessment': ['▁as', 'se', 's', 's', 'ment'],\n", + " 'assistance': ['▁as', 's', 'is', 't', 'ance'],\n", + " 'assistant': ['▁as', 's', 'is', 't', 'ant'],\n", + " 'assistants': ['▁as', 's', 'is', 't', 'ant', 's'],\n", + " 'associate': ['▁associat', 'e'],\n", + " 'associated': ['▁associat', 'ed'],\n", + " 'associates': ['▁associat', 'es'],\n", + " 'association': ['▁associat', 'ion'],\n", + " 'assortment': ['▁as', 's', 'or', 't', 'ment'],\n", + " 'assumption': ['▁assu', 'm', 'ption'],\n", + " 'assurance': ['▁assu', 'r', 'ance'],\n", + " 'astute': ['▁as', 'tu', 'te'],\n", + " 'at': ['▁at'],\n", + " 'ately': ['▁at', 'e', 'ly'],\n", + " 'atlantic': ['▁at', 'l', 'an', 'tic'],\n", + " 'atmosphere': ['▁atmospher', 'e'],\n", + " 'atmospheric': ['▁atmospher', 'ic'],\n", + " 'atomic': ['▁a', 'tom', 'ic'],\n", + " 'atoms': ['▁a', 'tom', 's'],\n", + " 'attach': ['▁at', 't', 'a', 'ch'],\n", + " 'attached': ['▁at', 't', 'a', 'ch', 'ed'],\n", + " 'attack': ['▁at', 'tack'],\n", + " 'attacked': ['▁at', 'tack', 'ed'],\n", + " 'attacks': ['▁at', 'tack', 's'],\n", + " 'attainable': ['▁at', 't', 'ain', 'able'],\n", + " 'attempt': ['▁attempt'],\n", + " 'attempted': ['▁attempt', 'ed'],\n", + " 'attempting': ['▁attempt', 'ing'],\n", + " 'attempts': ['▁attempt', 's'],\n", + " 'atten-': ['▁at', 'ten', '-'],\n", + " 'attend': ['▁at', 't', 'end'],\n", + " 'attendance': ['▁at', 't', 'end', 'ance'],\n", + " 'attended': ['▁at', 't', 'end', 'ed'],\n", + " 'attending': ['▁at', 't', 'end', 'ing'],\n", + " 'attention': ['▁at', 'ten', 'tion'],\n", + " 'attitude': ['▁at', 't', 'it', 'ud', 'e'],\n", + " 'attitudes': ['▁at', 't', 'it', 'ud', 'es'],\n", + " 'attracted': ['▁at', 't', 'ra', 'ct', 'ed'],\n", + " 'attractive': ['▁at', 't', 'ra', 'ct', 'ive'],\n", + " 'aubrey': ['▁a', 'u', 'b', 're', 'y'],\n", + " 'audacity': ['▁a', 'ud', 'a', 'c', 'it', 'y'],\n", + " 'auden': ['▁a', 'ud', 'en'],\n", + " 'audience': ['▁a', 'ud', 'i', 'ence'],\n", + " 'audited': ['▁a', 'ud', 'it', 'ed'],\n", + " 'august': ['▁a', 'ug', 'u', 'st'],\n", + " 'auntie': ['▁a', 'un', 't', 'ie'],\n", + " 'austerity': ['▁a', 'us', 'ter', 'it', 'y'],\n", + " 'australia': ['▁a', 'u', 'st', 'ra', 'li', 'a'],\n", + " 'austria': ['▁a', 'u', 'st', 'ri', 'a'],\n", + " 'austrian': ['▁a', 'u', 'st', 'ri', 'an'],\n", + " 'authentic': ['▁a', 'u', 'th', 'en', 'tic'],\n", + " 'author': ['▁', 'author'],\n", + " 'authorised': ['▁', 'author', 'is', 'ed'],\n", + " 'authorities': ['▁authorit', 'ies'],\n", + " 'authority': ['▁authorit', 'y'],\n", + " 'automatically': ['▁a', 'u', 'tom', 'at', 'ic', 'ally'],\n", + " 'automation': ['▁a', 'u', 'tom', 'ation'],\n", + " 'autumn': ['▁a', 'u', 't', 'um', 'n'],\n", + " 'available': ['▁a', 'v', 'a', 'il', 'able'],\n", + " 'avenue': ['▁a', 'v', 'en', 'u', 'e'],\n", + " 'average': ['▁a', 'ver', 'age'],\n", + " 'averages': ['▁a', 'ver', 'age', 's'],\n", + " 'avert': ['▁a', 'ver', 't'],\n", + " 'aviation': ['▁a', 'v', 'i', 'ation'],\n", + " 'avoid': ['▁a', 'v', 'o', 'i', 'd'],\n", + " 'avoided': ['▁a', 'v', 'o', 'i', 'd', 'ed'],\n", + " 'avon': ['▁a', 'v', 'on'],\n", + " 'awake': ['▁a', 'w', 'ak', 'e'],\n", + " 'awarded': ['▁a', 'ward', 'ed'],\n", + " 'awards': ['▁a', 'ward', 's'],\n", + " 'aware': ['▁a', 'w', 'ar', 'e'],\n", + " 'awareness': ['▁a', 'w', 'ar', 'e', 'ness'],\n", + " 'away': ['▁a', 'way'],\n", + " 'awful': ['▁a', 'w', 'ful'],\n", + " 'awfully': ['▁a', 'w', 'ful', 'ly'],\n", + " 'b': ['▁b'],\n", + " 'b.': ['▁b', '.'],\n", + " 'b.b.c.': ['▁b', '.', 'b', '.', 'c', '.'],\n", + " 'babe': ['▁ba', 'b', 'e'],\n", + " 'babel': ['▁ba', 'b', 'el'],\n", + " 'bably': ['▁b', 'abl', 'y'],\n", + " 'baby': ['▁ba', 'by'],\n", + " 'back': ['▁back'],\n", + " 'backbone': ['▁back', 'bo', 'ne'],\n", + " 'backed': ['▁back', 'ed'],\n", + " 'backers': ['▁back', 'ers'],\n", + " 'background': ['▁back', 'ground'],\n", + " 'backing': ['▁back', 'ing'],\n", + " 'backstage': ['▁back', 'st', 'age'],\n", + " 'backward': ['▁back', 'ward'],\n", + " 'bad': ['▁ba', 'd'],\n", + " 'badly': ['▁ba', 'd', 'ly'],\n", + " 'baffled': ['▁b', 'aff', 'led'],\n", + " 'bag': ['▁ba', 'g'],\n", + " 'bagaya': ['▁ba', 'g', 'a', 'y', 'a'],\n", + " 'baker': ['▁b', 'ak', 'er'],\n", + " 'balance': ['▁b', 'al', 'ance'],\n", + " 'balance-sheet': ['▁b', 'al', 'ance', '-', 's', 'h', 'e', 'et'],\n", + " 'balances': ['▁b', 'al', 'ance', 's'],\n", + " 'bald': ['▁ba', 'ld'],\n", + " 'ball': ['▁ba', 'll'],\n", + " 'balloon': ['▁b', 'al', 'lo', 'on'],\n", + " 'ballyhoo': ['▁b', 'ally', 'h', 'o', 'o'],\n", + " 'baltic': ['▁b', 'al', 'tic'],\n", + " 'ban': ['▁ba', 'n'],\n", + " 'ban-': ['▁ba', 'n', '-'],\n", + " 'ban-the-': ['▁ba', 'n', '-', 'th', 'e', '-'],\n", + " 'bank': ['▁bank'],\n", + " \"bank's\": ['▁bank', \"'\", 's'],\n", + " 'banking': ['▁bank', 'ing'],\n", + " 'bankrupt': ['▁bank', 'r', 'u', 'p', 't'],\n", + " 'banks': ['▁bank', 's'],\n", + " \"banks'\": ['▁bank', 's', \"'\"],\n", + " 'banned': ['▁ba', 'n', 'n', 'ed'],\n", + " 'banzie': ['▁ba', 'n', 'z', 'ie'],\n", + " 'bar': ['▁bar'],\n", + " 'barb': ['▁bar', 'b'],\n", + " 'barbarously': ['▁bar', 'b', 'ar', 'ous', 'ly'],\n", + " 'barclay': ['▁bar', 'c', 'lay'],\n", + " 'bare': ['▁bar', 'e'],\n", + " 'bargain': ['▁bar', 'g', 'ain'],\n", + " 'bargaining': ['▁bar', 'g', 'ain', 'ing'],\n", + " 'bark': ['▁bar', 'k'],\n", + " 'barrier': ['▁bar', 'ri', 'er'],\n", + " 'barriers': ['▁bar', 'ri', 'ers'],\n", + " 'barry': ['▁bar', 'ry'],\n", + " 'base': ['▁base'],\n", + " 'based': ['▁base', 'd'],\n", + " 'bases': ['▁base', 's'],\n", + " 'basic': ['▁basi', 'c'],\n", + " 'basin': ['▁basi', 'n'],\n", + " 'basing': ['▁ba', 's', 'ing'],\n", + " 'basis': ['▁basi', 's'],\n", + " 'baskerville': ['▁ba', 's', 'k', 'er', 'v', 'ill', 'e'],\n", + " 'basses': ['▁ba', 's', 'se', 's'],\n", + " 'basting': ['▁ba', 'st', 'ing'],\n", + " 'bats': ['▁b', 'at', 's'],\n", + " 'batsman': ['▁b', 'at', 's', 'man'],\n", + " 'battalions': ['▁b', 'at', 't', 'al', 'ion', 's'],\n", + " 'batting': ['▁b', 'at', 't', 'ing'],\n", + " 'battle': ['▁ba', 'ttle'],\n", + " 'bavaria': ['▁ba', 'v', 'ar', 'i', 'a'],\n", + " 'bavarian': ['▁ba', 'v', 'ar', 'ian'],\n", + " 'bavarians': ['▁ba', 'v', 'ar', 'ian', 's'],\n", + " 'bay': ['▁ba', 'y'],\n", + " 'be': ['▁be'],\n", + " 'beach': ['▁b', 'each'],\n", + " 'beaches': ['▁be', 'a', 'ches'],\n", + " 'beacon': ['▁be', 'a', 'c', 'on'],\n", + " 'beaks': ['▁be', 'ak', 's'],\n", + " 'bean': ['▁be', 'an'],\n", + " 'bear': ['▁be', 'ar'],\n", + " 'bearer': ['▁be', 'ar', 'er'],\n", + " 'bears': ['▁be', 'ar', 's'],\n", + " 'beastly': ['▁be', 'a', 'st', 'ly'],\n", + " 'beaten': ['▁be', 'at', 'en'],\n", + " 'beautiful': ['▁beautiful'],\n", + " 'beautifully': ['▁beautiful', 'ly'],\n", + " 'beauty': ['▁be', 'a', 'u', 'ty'],\n", + " 'became': ['▁be', 'c', 'a', 'me'],\n", + " 'because': ['▁be', 'cause'],\n", + " 'beckoning': ['▁be', 'ck', 'on', 'ing'],\n", + " 'become': ['▁become'],\n", + " 'becomes': ['▁become', 's'],\n", + " 'becoming': ['▁be', 'coming'],\n", + " 'bedlam': ['▁be', 'd', 'l', 'a', 'm'],\n", + " 'beds': ['▁be', 'd', 's'],\n", + " 'bedspreads': ['▁be', 'd', 's', 'p', 're', 'ad', 's'],\n", + " 'beech': ['▁be', 'e', 'ch'],\n", + " 'been': ['▁be', 'en'],\n", + " 'before': ['▁be', 'fore'],\n", + " 'befriended': ['▁be', 'f', 'ri', 'end', 'ed'],\n", + " 'began': ['▁be', 'g', 'an'],\n", + " 'begin': ['▁be', 'g', 'in'],\n", + " 'beginner': ['▁be', 'g', 'in', 'n', 'er'],\n", + " 'beginning': ['▁be', 'g', 'in', 'n', 'ing'],\n", + " 'begun': ['▁be', 'g', 'un'],\n", + " 'behan': ['▁be', 'h', 'an'],\n", + " 'behaviour': ['▁be', 'h', 'a', 'v', 'i', 'our'],\n", + " 'behind': ['▁be', 'h', 'ind'],\n", + " 'beier': ['▁be', 'i', 'er'],\n", + " 'being': ['▁be', 'ing'],\n", + " 'belgian': ['▁belgi', 'an'],\n", + " 'belgium': ['▁belgi', 'um'],\n", + " 'belgrade': ['▁be', 'l', 'g', 'rade'],\n", + " 'belief': ['▁be', 'li', 'e', 'f'],\n", + " 'believe': ['▁believe'],\n", + " 'believed': ['▁believe', 'd'],\n", + " 'believes': ['▁believe', 's'],\n", + " 'bell': ['▁be', 'll'],\n", + " \"bell's\": ['▁be', 'll', \"'\", 's'],\n", + " 'belonged': ['▁be', 'lo', 'ng', 'ed'],\n", + " 'belongs': ['▁be', 'lo', 'ng', 's'],\n", + " 'below': ['▁be', 'low'],\n", + " 'belt': ['▁be', 'l', 't'],\n", + " 'ben': ['▁be', 'n'],\n", + " 'bench': ['▁be', 'n', 'ch'],\n", + " 'benches': ['▁be', 'n', 'ches'],\n", + " 'bend': ['▁b', 'end'],\n", + " 'bending': ['▁b', 'end', 'ing'],\n", + " 'benefits': ['▁be', 'ne', 'f', 'it', 's'],\n", + " 'bent': ['▁b', 'ent'],\n", + " 'ber': ['▁be', 'r'],\n", + " 'berlin': ['▁berlin'],\n", + " \"berlin's\": ['▁berlin', \"'\", 's'],\n", + " 'bernhard': ['▁b', 'ern', 'h', 'ard'],\n", + " 'berry': ['▁be', 'rry'],\n", + " 'bertrand': ['▁be', 'r', 't', 'r', 'and'],\n", + " 'beset': ['▁be', 's', 'et'],\n", + " 'beside': ['▁be', 'side'],\n", + " 'best': ['▁best'],\n", + " 'best-seller': ['▁best', '-', 'se', 'll', 'er'],\n", + " 'bet': ['▁be', 't'],\n", + " 'betjeman': ['▁be', 't', 'j', 'e', 'man'],\n", + " 'betrayal': ['▁be', 't', 'ra', 'y', 'al'],\n", + " 'betrayed': ['▁be', 't', 'ra', 'y', 'ed'],\n", + " 'better': ['▁better'],\n", + " 'better-': ['▁better', '-'],\n", + " \"betti's\": ['▁be', 'tt', 'i', \"'\", 's'],\n", + " 'between': ['▁be', 'twe', 'en'],\n", + " 'bevel': ['▁be', 've', 'l'],\n", + " 'bevelled': ['▁be', 've', 'll', 'ed'],\n", + " 'beware': ['▁be', 'w', 'ar', 'e'],\n", + " 'bewildered': ['▁be', 'w', 'il', 'd', 'er', 'ed'],\n", + " 'beyond': ['▁be', 'y', 'on', 'd'],\n", + " 'bidet': ['▁b', 'i', 'd', 'et'],\n", + " 'big': ['▁big'],\n", + " 'bigger': ['▁big', 'g', 'er'],\n", + " 'biggest': ['▁big', 'g', 'est'],\n", + " 'bill': ['▁b', 'ill'],\n", + " 'bills': ['▁b', 'ill', 's'],\n", + " 'binding': ['▁b', 'ind', 'ing'],\n", + " 'biological': ['▁b', 'i', 'o', 'lo', 'g', 'ic', 'al'],\n", + " 'bird': ['▁b', 'ir', 'd'],\n", + " 'birds': ['▁b', 'ir', 'd', 's'],\n", + " 'bishop': ['▁b', 'ish', 'op'],\n", + " 'bit': ['▁', 'bit'],\n", + " 'bite': ['▁', 'bit', 'e'],\n", + " 'bits': ['▁', 'bit', 's'],\n", + " 'bitter-sweet': ['▁', 'bit', 'ter', '-', 's', 'w', 'e', 'et'],\n", + " 'bitterest': ['▁', 'bit', 'ter', 'est'],\n", + " 'bitterly': ['▁', 'bit', 'ter', 'ly'],\n", + " 'bituminized': ['▁', 'bit', 'um', 'in', 'i', 'z', 'ed'],\n", + " 'black': ['▁b', 'l', 'a', 'ck'],\n", + " 'black-': ['▁b', 'l', 'a', 'ck', '-'],\n", + " 'blackbird': ['▁b', 'l', 'a', 'ck', 'b', 'ir', 'd'],\n", + " 'blacks': ['▁b', 'l', 'a', 'ck', 's'],\n", + " 'blame': ['▁b', 'l', 'a', 'me'],\n", + " 'blamed': ['▁b', 'l', 'a', 'm', 'ed'],\n", + " 'blander': ['▁b', 'land', 'er'],\n", + " 'blank': ['▁b', 'l', 'an', 'k'],\n", + " 'blight': ['▁b', 'l', 'ight'],\n", + " 'blind': ['▁b', 'lin', 'd'],\n", + " 'blinked': ['▁b', 'lin', 'k', 'ed'],\n", + " 'block': ['▁block'],\n", + " 'blocks': ['▁block', 's'],\n", + " 'bloem-': ['▁b', 'lo', 'e', 'm', '-'],\n", + " 'blond': ['▁b', 'l', 'on', 'd'],\n", + " 'blood': ['▁b', 'l', 'ood'],\n", + " 'bloodstained': ['▁b', 'l', 'ood', 'st', 'ain', 'ed'],\n", + " 'bloody': ['▁b', 'l', 'ood', 'y'],\n", + " 'blouse': ['▁b', 'l', 'ous', 'e'],\n", + " 'blouses': ['▁b', 'l', 'ous', 'es'],\n", + " 'blow': ['▁b', 'low'],\n", + " 'blowflies': ['▁b', 'low', 'f', 'l', 'ies'],\n", + " 'blown': ['▁b', 'low', 'n'],\n", + " 'blue': ['▁b', 'l', 'u', 'e'],\n", + " 'blunt': ['▁b', 'l', 'un', 't'],\n", + " 'bluntly': ['▁b', 'l', 'un', 't', 'ly'],\n", + " 'bluster': ['▁b', 'l', 'us', 'ter'],\n", + " 'board': ['▁bo', 'ard'],\n", + " 'boat': ['▁bo', 'at'],\n", + " 'boat-train': ['▁bo', 'at', '-', 'train'],\n", + " 'bobby': ['▁bo', 'b', 'by'],\n", + " 'bodies': ['▁bo', 'd', 'ies'],\n", + " 'body': ['▁', 'body'],\n", + " 'boeing': ['▁bo', 'e', 'ing'],\n", + " 'bogy': ['▁bo', 'g', 'y'],\n", + " 'boiled': ['▁bo', 'il', 'ed'],\n", + " 'boils': ['▁bo', 'il', 's'],\n", + " 'bold': ['▁bo', 'ld'],\n", + " 'boldly': ['▁bo', 'ld', 'ly'],\n", + " 'bolted': ['▁bo', 'l', 't', 'ed'],\n", + " 'bomb': ['▁', 'bomb'],\n", + " 'bombay': ['▁', 'bomb', 'a', 'y'],\n", + " 'bombed': ['▁', 'bomb', 'ed'],\n", + " 'bombers': ['▁bo', 'mber', 's'],\n", + " 'bonded': ['▁bo', 'n', 'd', 'ed'],\n", + " 'bone': ['▁bo', 'ne'],\n", + " 'bones': ['▁bo', 'ne', 's'],\n", + " 'bonn': ['▁bo', 'n', 'n'],\n", + " 'book': ['▁bo', 'ok'],\n", + " 'booklet': ['▁bo', 'ok', 'le', 't'],\n", + " 'books': ['▁bo', 'ok', 's'],\n", + " 'booming': ['▁bo', 'o', 'm', 'ing'],\n", + " 'border': ['▁b', 'or', 'd', 'er'],\n", + " 'bore': ['▁bo', 're'],\n", + " 'bored': ['▁b', 'or', 'ed'],\n", + " 'boredom': ['▁b', 'or', 'ed', 'o', 'm'],\n", + " 'bores': ['▁bo', 're', 's'],\n", + " 'borough': ['▁b', 'or', 'ough'],\n", + " 'borrow': ['▁b', 'or', 'ro', 'w'],\n", + " 'borstal': ['▁b', 'or', 'st', 'al'],\n", + " 'bosoms': ['▁bo', 's', 'o', 'm', 's'],\n", + " 'bossed': ['▁bo', 's', 's', 'ed'],\n", + " 'bosses': ['▁bo', 's', 'se', 's'],\n", + " 'both': ['▁bo', 'th'],\n", + " 'bottle': ['▁bo', 'ttle'],\n", + " 'bottom': ['▁bo', 't', 'tom'],\n", + " 'bought': ['▁b', 'ought'],\n", + " 'boun': ['▁bo', 'un'],\n", + " 'bound': ['▁b', 'ound'],\n", + " 'boutiques': ['▁b', 'out', 'i', 'q', 'u', 'es'],\n", + " 'bow': ['▁bo', 'w'],\n", + " 'bow-street': ['▁bo', 'w', '-', 'street'],\n", + " 'bowed': ['▁bo', 'w', 'ed'],\n", + " 'bowing': ['▁bo', 'w', 'ing'],\n", + " 'bows': ['▁bo', 'w', 's'],\n", + " 'box': ['▁bo', 'x'],\n", + " 'boxes': ['▁bo', 'x', 'es'],\n", + " 'boxing': ['▁bo', 'x', 'ing'],\n", + " 'boy': ['▁bo', 'y'],\n", + " 'boycotted': ['▁bo', 'y', 'c', 'o', 'tt', 'ed'],\n", + " 'boycotting': ['▁bo', 'y', 'c', 'o', 'tt', 'ing'],\n", + " 'boyle': ['▁bo', 'y', 'le'],\n", + " 'boys': ['▁bo', 'y', 's'],\n", + " 'braces': ['▁br', 'a', 'ce', 's'],\n", + " 'brain': ['▁br', 'ain'],\n", + " 'brain-activity': ['▁br', 'ain', '-', 'a', 'ct', 'i', 'v', 'it', 'y'],\n", + " 'brain-children': ['▁br', 'ain', '-', 'ch', 'il', 'd', 'ren'],\n", + " 'brains': ['▁br', 'ain', 's'],\n", + " 'brandy': ['▁br', 'and', 'y'],\n", + " 'brash': ['▁br', 'ash'],\n", + " 'brass': ['▁br', 'a', 's', 's'],\n", + " 'brauchitsch': ['▁br', 'a', 'uch', 'it', 's', 'ch'],\n", + " 'breach': ['▁br', 'each'],\n", + " 'bread-and-butter': ['▁br', 'e', 'ad', '-', 'and', '-', 'b', 'u', 't', 'ter'],\n", + " 'break': ['▁br', 'e', 'ak'],\n", + " 'breaking': ['▁br', 'e', 'ak', 'ing'],\n", + " 'breaks': ['▁br', 'e', 'ak', 's'],\n", + " 'breath': ['▁br', 'e', 'at', 'h'],\n", + " 'breathing': ['▁br', 'e', 'a', 'thing'],\n", + " 'breathless': ['▁br', 'e', 'at', 'h', 'less'],\n", + " 'breeding': ['▁br', 'e', 'ed', 'ing'],\n", + " 'breezily': ['▁br', 'e', 'e', 'z', 'i', 'ly'],\n", + " 'brenda': ['▁br', 'end', 'a'],\n", + " 'brendan': ['▁br', 'end', 'an'],\n", + " \"brendan's\": ['▁br', 'end', 'an', \"'\", 's'],\n", + " 'brentano': ['▁br', 'ent', 'an', 'o'],\n", + " 'brezhnev': ['▁br', 'e', 'z', 'h', 'ne', 'v'],\n", + " 'brian': ['▁br', 'ian'],\n", + " 'bridal': ['▁', 'brid', 'al'],\n", + " 'bride': ['▁', 'brid', 'e'],\n", + " 'brief': ['▁brief'],\n", + " 'briefcase': ['▁brief', 'case'],\n", + " 'briefing': ['▁brief', 'ing'],\n", + " 'brigadiers': ['▁br', 'ig', 'ad', 'i', 'ers'],\n", + " 'bright': ['▁br', 'ight'],\n", + " 'brighter': ['▁br', 'ight', 'er'],\n", + " 'brightly': ['▁br', 'ight', 'ly'],\n", + " \"brighton's\": ['▁br', 'ight', 'on', \"'\", 's'],\n", + " 'brilliant': ['▁br', 'ill', 'ian', 't'],\n", + " 'brilliantly': ['▁br', 'ill', 'ian', 't', 'ly'],\n", + " 'bring': ['▁br', 'ing'],\n", + " 'brings': ['▁br', 'ing', 's'],\n", + " 'bristled': ['▁br', 'is', 't', 'led'],\n", + " 'britain': ['▁britain'],\n", + " \"britain's\": ['▁britain', \"'\", 's'],\n", + " 'british': ['▁british'],\n", + " 'british-owned': ['▁british', '-', 'ow', 'n', 'ed'],\n", + " 'britishers': ['▁british', 'ers'],\n", + " 'brittle': ['▁br', 'i', 'ttle'],\n", + " 'broad': ['▁br', 'o', 'ad'],\n", + " 'broadcast': ['▁br', 'o', 'ad', 'c', 'a', 'st'],\n", + " 'broadcasting': ['▁br', 'o', 'ad', 'c', 'a', 'st', 'ing'],\n", + " 'broke': ['▁br', 'ok', 'e'],\n", + " 'broken': ['▁br', 'ok', 'en'],\n", + " \"brook's\": ['▁br', 'o', 'ok', \"'\", 's'],\n", + " 'brother': ['▁br', 'other'],\n", + " 'brother-': ['▁br', 'other', '-'],\n", + " 'brother-in-law': ['▁br', 'other', '-', 'in', '-', 'law'],\n", + " 'brought': ['▁br', 'ought'],\n", + " 'brown': ['▁brown'],\n", + " \"brown's\": ['▁brown', \"'\", 's'],\n", + " 'bru\"cke': ['▁br', 'u', '\"', 'ck', 'e'],\n", + " 'bruce': ['▁br', 'u', 'ce'],\n", + " 'bruno': ['▁br', 'un', 'o'],\n", + " 'brunswick': ['▁br', 'un', 's', 'w', 'i', 'ck'],\n", + " 'brussels': ['▁br', 'us', 's', 'el', 's'],\n", + " 'brutal': ['▁br', 'u', 't', 'al'],\n", + " 'bryan': ['▁br', 'y', 'an'],\n", + " 'buck': ['▁b', 'u', 'ck'],\n", + " 'buckingham': ['▁b', 'u', 'ck', 'ing', 'h', 'a', 'm'],\n", + " 'buckley': ['▁b', 'u', 'ck', 'le', 'y'],\n", + " 'budge': ['▁b', 'ud', 'ge'],\n", + " 'budgerigar': ['▁b', 'ud', 'g', 'er', 'ig', 'ar'],\n", + " 'budget': ['▁b', 'ud', 'ge', 't'],\n", + " 'budgetary': ['▁b', 'ud', 'ge', 't', 'ar', 'y'],\n", + " 'budgette': ['▁b', 'ud', 'ge', 't', 'te'],\n", + " 'buganda': ['▁b', 'ug', 'and', 'a'],\n", + " 'build': ['▁b', 'u', 'il', 'd'],\n", + " 'building': ['▁b', 'u', 'il', 'd', 'ing'],\n", + " 'built': ['▁b', 'u', 'il', 't'],\n", + " 'bulk': ['▁b', 'ul', 'k'],\n", + " 'bulky': ['▁b', 'ul', 'k', 'y'],\n", + " 'bulletins': ['▁b', 'ul', 'le', 't', 'in', 's'],\n", + " 'bullying': ['▁b', 'ul', 'ly', 'ing'],\n", + " 'bumps': ['▁b', 'um', 'p', 's'],\n", + " 'bundle': ['▁b', 'un', 'd', 'le'],\n", + " 'bundy': ['▁b', 'un', 'd', 'y'],\n", + " 'bunk': ['▁b', 'un', 'k'],\n", + " 'burden': ['▁burde', 'n'],\n", + " 'burdens': ['▁burde', 'n', 's'],\n", + " 'burdett': ['▁burde', 'tt'],\n", + " 'bureau': ['▁b', 'ure', 'a', 'u'],\n", + " 'burge': ['▁b', 'ur', 'ge'],\n", + " 'burning': ['▁b', 'ur', 'n', 'ing'],\n", + " 'burst': ['▁b', 'ur', 'st'],\n", + " 'bush': ['▁bus', 'h'],\n", + " 'busi-': ['▁bus', 'i', '-'],\n", + " 'business': ['▁bus', 'i', 'ness'],\n", + " 'bustling': ['▁bus', 't', 'l', 'ing'],\n", + " 'busy': ['▁bus', 'y'],\n", + " 'but': ['▁but'],\n", + " 'butagas': ['▁but', 'a', 'g', 'a', 's'],\n", + " 'butler': ['▁but', 'l', 'er'],\n", + " 'butt': ['▁but', 't'],\n", + " 'buttery': ['▁but', 'ter', 'y'],\n", + " 'button': ['▁but', 'ton'],\n", + " 'buy': ['▁b', 'u', 'y'],\n", + " 'buying': ['▁b', 'u', 'y', 'ing'],\n", + " 'by': ['▁by'],\n", + " 'by-': ['▁by', '-'],\n", + " 'by-elections': ['▁by', '-', 'elect', 'ion', 's'],\n", + " 'by-passed': ['▁by', '-', 'p', 'a', 's', 's', 'ed'],\n", + " 'byrne': ['▁by', 'r', 'ne'],\n", + " 'byron': ['▁by', 'ro', 'n'],\n", + " 'c': ['▁c'],\n", + " 'c.c.f.': ['▁c', '.', 'c', '.', 'f', '.'],\n", + " 'c.i.g.s.': ['▁c', '.', 'i', '.', 'g', '.', 's', '.'],\n", + " 'cabinet': ['▁c', 'a', 'b', 'ine', 't'],\n", + " 'cable': ['▁c', 'able'],\n", + " 'cables': ['▁c', 'able', 's'],\n", + " 'cagnes': ['▁c', 'a', 'g', 'ne', 's'],\n", + " 'call': ['▁call'],\n", + " 'call-out': ['▁call', '-', 'out'],\n", + " 'callaghan': ['▁call', 'a', 'g', 'h', 'an'],\n", + " 'called': ['▁call', 'ed'],\n", + " 'calls': ['▁call', 's'],\n", + " 'cally': ['▁call', 'y'],\n", + " 'calm': ['▁c', 'al', 'm'],\n", + " 'calorie': ['▁c', 'al', 'or', 'ie'],\n", + " 'cambodia': ['▁cam', 'bo', 'd', 'i', 'a'],\n", + " 'cambridge': ['▁cam', 'brid', 'ge'],\n", + " 'came': ['▁cam', 'e'],\n", + " 'camp': ['▁cam', 'p'],\n", + " 'campaign': ['▁campaign'],\n", + " 'campaigns': ['▁campaign', 's'],\n", + " 'can': ['▁can'],\n", + " \"can't\": ['▁can', \"'\", 't'],\n", + " 'can-': ['▁can', '-'],\n", + " 'canada': ['▁can', 'ad', 'a'],\n", + " 'canadian': ['▁can', 'ad', 'ian'],\n", + " 'canadians': ['▁can', 'ad', 'ian', 's'],\n", + " 'canals': ['▁can', 'al', 's'],\n", + " 'canard': ['▁can', 'ard'],\n", + " 'cancelled': ['▁can', 'ce', 'll', 'ed'],\n", + " 'cancerous': ['▁can', 'c', 'er', 'ous'],\n", + " 'candi-': ['▁can', 'd', 'i', '-'],\n", + " 'candidate': ['▁can', 'did', 'at', 'e'],\n", + " 'candles': ['▁can', 'd', 'le', 's'],\n", + " 'cannot': ['▁can', 'not'],\n", + " 'canon': ['▁can', 'on'],\n", + " 'cantabile': ['▁can', 't', 'a', 'b', 'il', 'e'],\n", + " 'canteen': ['▁can', 'te', 'en'],\n", + " ...}" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "processor.lexicon" + ] + }, + { + "cell_type": "code", "execution_count": 167, "id": "d08a0259", "metadata": {}, diff --git a/notebooks/04b-look-at-iam-paragraphs.ipynb b/notebooks/04b-look-at-iam-paragraphs.ipynb index dc0aef6..11ebddf 100644 --- a/notebooks/04b-look-at-iam-paragraphs.ipynb +++ b/notebooks/04b-look-at-iam-paragraphs.ipynb @@ -256,7 +256,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.2" + "version": "3.9.1" } }, "nbformat": 4, diff --git a/text_recognizer/data/iam_preprocessor.py b/text_recognizer/data/iam_preprocessor.py index d85787e..60f8a9f 100644 --- a/text_recognizer/data/iam_preprocessor.py +++ b/text_recognizer/data/iam_preprocessor.py @@ -119,7 +119,6 @@ class Preprocessor: continue self.text.append(example["text"].lower()) - def _to_index(self, line: str) -> torch.LongTensor: if line in self.special_tokens: return torch.LongTensor([self.tokens_to_index[line]]) diff --git a/text_recognizer/data/mapping.py b/text_recognizer/data/mapping.py deleted file mode 100644 index f0edf3f..0000000 --- a/text_recognizer/data/mapping.py +++ /dev/null @@ -1,8 +0,0 @@ -"""Mapping to and from word pieces.""" -from pathlib import Path - - -class WordPieces: - - def __init__(self) -> None: - pass diff --git a/text_recognizer/data/mappings.py b/text_recognizer/data/mappings.py new file mode 100644 index 0000000..cfa0ec7 --- /dev/null +++ b/text_recognizer/data/mappings.py @@ -0,0 +1,143 @@ +"""Mapping to and from word pieces.""" +from abc import ABC, abstractmethod +from pathlib import Path +from typing import List, Optional, Union, Sequence + +from loguru import logger +import torch +from torch import Tensor + +from text_recognizer.data.emnist import emnist_mapping +from text_recognizer.datasets.iam_preprocessor import Preprocessor + + +class AbstractMapping(ABC): + @abstractmethod + def get_token(self, *args, **kwargs) -> str: + ... + + @abstractmethod + def get_index(self, *args, **kwargs) -> Tensor: + ... + + @abstractmethod + def get_text(self, *args, **kwargs) -> str: + ... + + @abstractmethod + def get_indices(self, *args, **kwargs) -> Tensor: + ... + + +class EmnistMapping(AbstractMapping): + def __init__(self, extra_symbols: Optional[Sequence[str]]) -> None: + self.mapping, self.inverse_mapping, self.input_size = emnist_mapping( + extra_symbols + ) + + def get_token(self, index: Union[int, Tensor]) -> str: + if (index := int(index)) in self.mapping: + return self.mapping[index] + raise KeyError(f"Index ({index}) not in mapping.") + + def get_index(self, token: str) -> Tensor: + if token in self.inverse_mapping: + return Tensor(self.inverse_mapping[token]) + raise KeyError(f"Token ({token}) not found in inverse mapping.") + + def get_text(self, indices: Union[List[int], Tensor]) -> str: + if isinstance(indices, Tensor): + indices = indices.tolist() + return "".join([self.mapping[index] for index in indices]) + + def get_indices(self, text: str) -> Tensor: + return Tensor([self.inverse_mapping[token] for token in text]) + + +class WordPieceMapping(EmnistMapping): + def __init__( + self, + num_features: int, + tokens: str, + lexicon: str, + data_dir: Optional[Union[str, Path]] = None, + use_words: bool = False, + prepend_wordsep: bool = False, + special_tokens: Sequence[str] = ("<s>", "<e>", "<p>"), + extra_symbols: Optional[Sequence[str]] = None, + ) -> None: + super().__init__(extra_symbols) + self.wordpiece_processor = self._configure_wordpiece_processor( + num_features, + tokens, + lexicon, + data_dir, + use_words, + prepend_wordsep, + special_tokens, + extra_symbols, + ) + + def _configure_wordpiece_processor( + self, + num_features: int, + tokens: str, + lexicon: str, + data_dir: Optional[Union[str, Path]], + use_words: bool, + prepend_wordsep: bool, + special_tokens: Optional[Sequence[str]], + extra_symbols: Optional[Sequence[str]], + ) -> Preprocessor: + data_dir = ( + (Path(__file__).resolve().parents[2] / "data" / "raw" / "iam" / "iamdb") + if data_dir is None + else Path(data_dir) + ) + + logger.debug(f"Using data dir: {data_dir}") + if not data_dir.exists(): + raise RuntimeError(f"Could not locate iamdb directory at {data_dir}") + + processed_path = ( + Path(__file__).resolve().parents[2] / "data" / "processed" / "iam_lines" + ) + + tokens_path = processed_path / tokens + lexicon_path = processed_path / lexicon + + if extra_symbols is not None: + special_tokens += extra_symbols + + return Preprocessor( + data_dir, + num_features, + tokens_path, + lexicon_path, + use_words, + prepend_wordsep, + special_tokens, + ) + + def get_token(self, index: Union[int, Tensor]) -> str: + if (index := int(index)) <= self.wordpiece_processor.num_tokens: + return self.wordpiece_processor.tokens[index] + raise KeyError(f"Index ({index}) not in mapping.") + + def get_index(self, token: str) -> Tensor: + if token in self.wordpiece_processor.tokens: + return torch.LongTensor(self.wordpiece_processor.tokens_to_index[token]) + raise KeyError(f"Token ({token}) not found in inverse mapping.") + + def get_text(self, indices: Union[List[int], Tensor]) -> str: + if isinstance(indices, Tensor): + indices = indices.tolist() + return self.wordpiece_processor.to_text(indices) + + def get_indices(self, text: str) -> Tensor: + return self.wordpiece_processor.to_index(text) + + def emnist_to_wordpiece_indices(self, x: Tensor) -> Tensor: + text = self.mapping.get_text(x) + text = text.lower().replace(" ", "▁") + return torch.LongTensor(self.wordpiece_processor.to_index(text)) diff --git a/text_recognizer/data/transforms.py b/text_recognizer/data/transforms.py index 297c953..f53df64 100644 --- a/text_recognizer/data/transforms.py +++ b/text_recognizer/data/transforms.py @@ -1,115 +1,36 @@ """Transforms for PyTorch datasets.""" -from abc import abstractmethod from pathlib import Path -from typing import Any, Optional, Union +from typing import Optional, Union, Sequence -from loguru import logger -import torch from torch import Tensor -from text_recognizer.datasets.iam_preprocessor import Preprocessor -from text_recognizer.data.emnist import emnist_mapping +from text_recognizer.datasets.mappings import WordPieceMapping -class ToLower: - """Converts target to lower case.""" - - def __call__(self, target: Tensor) -> Tensor: - """Corrects index value in target tensor.""" - device = target.device - return torch.stack([x - 26 if x > 35 else x for x in target]).to(device) - - -class ToCharcters: - """Converts integers to characters.""" - - def __init__(self, extra_symbols: Optional[List[str]] = None) -> None: - self.mapping, _, _ = emnist_mapping(extra_symbols) - - def __call__(self, y: Tensor) -> str: - """Converts a Tensor to a str.""" - return "".join([self.mapping[int(i)] for i in y]).replace(" ", "▁") - - -class WordPieces: - """Abstract transform for word pieces.""" +class WordPiece: + """Converts EMNIST indices to Word Piece indices.""" def __init__( self, num_features: int, + tokens: str, + lexicon: str, data_dir: Optional[Union[str, Path]] = None, - tokens: Optional[Union[str, Path]] = None, - lexicon: Optional[Union[str, Path]] = None, use_words: bool = False, prepend_wordsep: bool = False, + special_tokens: Sequence[str] = ("<s>", "<e>", "<p>"), + extra_symbols: Optional[Sequence[str]] = None, ) -> None: - if data_dir is None: - data_dir = ( - Path(__file__).resolve().parents[3] / "data" / "raw" / "iam" / "iamdb" - ) - logger.debug(f"Using data dir: {data_dir}") - if not data_dir.exists(): - raise RuntimeError(f"Could not locate iamdb directory at {data_dir}") - else: - data_dir = Path(data_dir) - processed_path = ( - Path(__file__).resolve().parents[3] / "data" / "processed" / "iam_lines" - ) - tokens_path = processed_path / tokens - lexicon_path = processed_path / lexicon - - self.preprocessor = Preprocessor( - data_dir, + self.mapping = WordPieceMapping( num_features, - tokens_path, - lexicon_path, + tokens, + lexicon, + data_dir, use_words, prepend_wordsep, + special_tokens, + extra_symbols, ) - @abstractmethod - def __call__(self, *args, **kwargs) -> Any: - """Transforms input.""" - ... - - -class ToWordPieces(WordPieces): - """Transforms str to word pieces.""" - - def __init__( - self, - num_features: int, - data_dir: Optional[Union[str, Path]] = None, - tokens: Optional[Union[str, Path]] = None, - lexicon: Optional[Union[str, Path]] = None, - use_words: bool = False, - prepend_wordsep: bool = False, - ) -> None: - super().__init__( - num_features, data_dir, tokens, lexicon, use_words, prepend_wordsep - ) - - def __call__(self, line: str) -> Tensor: - """Transforms str to word pieces.""" - return self.preprocessor.to_index(line) - - -class ToText(WordPieces): - """Takes word pieces and converts them to text.""" - - def __init__( - self, - num_features: int, - data_dir: Optional[Union[str, Path]] = None, - tokens: Optional[Union[str, Path]] = None, - lexicon: Optional[Union[str, Path]] = None, - use_words: bool = False, - prepend_wordsep: bool = False, - ) -> None: - super().__init__( - num_features, data_dir, tokens, lexicon, use_words, prepend_wordsep - ) - - def __call__(self, x: Tensor) -> str: - """Converts tensor to text.""" - return self.preprocessor.to_text(x.tolist()) + def __call__(self, x: Tensor) -> Tensor: + return self.mapping.emnist_to_wordpiece_indices(x) diff --git a/text_recognizer/models/base.py b/text_recognizer/models/base.py index c6d5d73..aeda039 100644 --- a/text_recognizer/models/base.py +++ b/text_recognizer/models/base.py @@ -49,7 +49,9 @@ class LitBaseModel(pl.LightningModule): optimizer_class = getattr(torch.optim, self._optimizer.type) return optimizer_class(params=self.parameters(), **args) - def _configure_lr_scheduler(self, optimizer: Type[torch.optim.Optimizer]) -> Dict[str, Any]: + def _configure_lr_scheduler( + self, optimizer: Type[torch.optim.Optimizer] + ) -> Dict[str, Any]: """Configures the lr scheduler.""" scheduler = {"monitor": self.monitor} args = {} or self._lr_scheduler.args @@ -59,7 +61,7 @@ class LitBaseModel(pl.LightningModule): scheduler["scheduler"] = getattr( torch.optim.lr_scheduler, self._lr_scheduler.type - )(optimizer, **args) + )(optimizer, **args) return scheduler diff --git a/text_recognizer/networks/image_transformer.py b/text_recognizer/networks/image_transformer.py index daededa..a6aaca4 100644 --- a/text_recognizer/networks/image_transformer.py +++ b/text_recognizer/networks/image_transformer.py @@ -44,7 +44,9 @@ class ImageTransformer(nn.Module): dropout_rate: float = 0.1, transformer_activation: str = "glu", ) -> None: - self.vocab_size = NUM_WORD_PIECES + NUM_SPECIAL_TOKENS if vocab_size is None else vocab_size + self.vocab_size = ( + NUM_WORD_PIECES + NUM_SPECIAL_TOKENS if vocab_size is None else vocab_size + ) self.hidden_dim = hidden_dim self.max_output_length = output_shape[0] diff --git a/training/configs/image_transformer.yaml b/training/configs/image_transformer.yaml index 88c05c2..228e53f 100644 --- a/training/configs/image_transformer.yaml +++ b/training/configs/image_transformer.yaml @@ -56,6 +56,7 @@ callbacks: args: monitor: val_loss mode: min + save_last: true - type: StochasticWeightAveraging args: swa_epoch_start: 0.8 diff --git a/training/run_experiment.py b/training/run_experiment.py index f46803f..1e79461 100644 --- a/training/run_experiment.py +++ b/training/run_experiment.py @@ -15,7 +15,7 @@ import wandb SEED = 4711 CONFIGS_DIRNAME = Path(__file__).parent.resolve() / "configs" -LOGS_DIRNAME = Path(__file__).parent.resolve() / "runs" / "logs" +LOGS_DIRNAME = Path(__file__).parent.resolve() / "logs" def _configure_logging(log_dir: Optional[Path], verbose: int = 0) -> None: @@ -67,7 +67,7 @@ def _configure_callbacks( def _configure_logger( - network: Type[nn.Module], args: Dict, log_dir: str, use_wandb: bool + network: Type[nn.Module], args: Dict, log_dir: str, use_wandb: bool ) -> Type[pl.loggers.LightningLoggerBase]: """Configures lightning logger.""" if use_wandb: @@ -124,7 +124,6 @@ def run( _configure_logging(None, verbose=verbose) logger.info("Starting experiment...") - # Seed everything in the experiment. logger.info(f"Seeding everthing with seed={SEED}") pl.utilities.seed.seed_everything(SEED) |