1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
|
{
"cells": [
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The autoreload extension is already loaded. To reload it, use:\n",
" %reload_ext autoreload\n"
]
}
],
"source": [
"%load_ext autoreload\n",
"%autoreload 2\n",
"\n",
"%matplotlib inline\n",
"import matplotlib.pyplot as plt\n",
"from pathlib import Path\n",
"import numpy as np\n",
"from PIL import Image\n",
"import torch.nn.functional as F\n",
"import torch\n",
"from torch import nn\n",
"from torchsummary import summary\n",
"from importlib.util import find_spec\n",
"if find_spec(\"text_recognizer\") is None:\n",
" import sys\n",
" sys.path.append('..')"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"# Lexicon file, looked up under the directory two levels above the current working directory.\n",
"path = Path(\"../\").resolve().parent.joinpath(\n",
"    \"data\", \"processed\", \"iam_lines\", \"iamdb_1kwp_lex_1000.txt\"\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"PosixPath('/home/akternurra/Documents/projects/quest-for-general-artifical-intelligence/projects/text-recognizer/data/processed/iam_lines/iamdb_1kwp_lex_1000.txt')"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"path"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"# Build the lexicon: the first whitespace-separated token of each line is the key,\n",
"# and the remaining tokens form its list of pieces.\n",
"# Decode explicitly as UTF-8: the pieces include non-ASCII characters (e.g. '▁'), so\n",
"# relying on the platform's default encoding can fail or corrupt them.\n",
"with open(path, \"r\", encoding=\"utf-8\") as f:\n",
"    rows = (line.split() for line in f)\n",
"    # Skip blank lines; they split into an empty list and `row[0]` would raise IndexError.\n",
"    lex = {row[0]: row[1:] for row in rows if row}"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'!': ['▁', '!'],\n",
" '\"': ['▁', '\"'],\n",
" '&': ['▁', '&'],\n",
" \"'\": ['▁', \"'\"],\n",
" \"'30s\": ['▁', \"'\", '3', '0', 's'],\n",
" \"'61\": ['▁', \"'\", '6', '1'],\n",
" \"'d\": ['▁', \"'\", 'd'],\n",
" \"'ll\": ['▁', \"'\", 'll'],\n",
" \"'m\": ['▁', \"'\", 'm'],\n",
" \"'re\": ['▁', \"'\", 're'],\n",
" \"'s\": ['▁', \"'\", 's'],\n",
" \"'ve\": ['▁', \"'\", 've'],\n",
" '(': ['▁', '('],\n",
" ')': ['▁', ')'],\n",
" '*': ['▁', '*'],\n",
" '+2.8': ['▁', '+', '2', '.', '8'],\n",
" '+3.6': ['▁', '+', '3', '.', '6'],\n",
" ',': ['▁', ','],\n",
" '-': ['▁', '-'],\n",
" '-2.6': ['▁', '-', '2', '.', '6'],\n",
" '-5.4': ['▁', '-', '5', '.', '4'],\n",
" '.': ['▁', '.'],\n",
" '...': ['▁', '.', '.', '.'],\n",
" '0m': ['▁', '0', 'm'],\n",
" '1': ['▁', '1'],\n",
" '1,157': ['▁', '1', ',', '1', '5', '7'],\n",
" '1,400': ['▁', '1', ',', '4', '0', '0'],\n",
" '1,500': ['▁', '1', ',', '5', '0', '0'],\n",
" '1-2': ['▁', '1', '-', '2'],\n",
" '1.8': ['▁', '1', '.', '8'],\n",
" '1/2': ['▁', '1', '/', '2'],\n",
" '1/2-in.-long': ['▁', '1', '/', '2', '-', 'in', '.', '-', 'long'],\n",
" '1/4': ['▁', '1', '/', '4'],\n",
" '10': ['▁', '10'],\n",
" '10,000': ['▁', '10', ',', '0', '0', '0'],\n",
" '100': ['▁', '10', '0'],\n",
" '100,000,000': ['▁', '10', '0', ',', '0', '00,000'],\n",
" '104': ['▁', '10', '4'],\n",
" '11': ['▁', '1', '1'],\n",
" '12': ['▁', '1', '2'],\n",
" '12,000-word': ['▁', '1', '2', ',', '0', '0', '0', '-', 'word'],\n",
" '125': ['▁', '1', '2', '5'],\n",
" '13': ['▁', '1', '3'],\n",
" '13,000': ['▁', '1', '3', ',', '0', '0', '0'],\n",
" '14': ['▁', '1', '4'],\n",
" '15': ['▁', '1', '5'],\n",
" '15,000,000': ['▁', '1', '5', ',', '0', '00,000'],\n",
" '15-17': ['▁', '1', '5', '-', '1', '7'],\n",
" '15-nation': ['▁', '1', '5', '-', 'n', 'ation'],\n",
" '15-year-olds': ['▁', '1', '5', '-', 'year', '-', 'old', 's'],\n",
" '150,000,000': ['▁', '1', '5', '0', ',', '0', '00,000'],\n",
" '16': ['▁', '1', '6'],\n",
" '16,000': ['▁', '1', '6', ',', '0', '0', '0'],\n",
" '160': ['▁', '1', '6', '0'],\n",
" '163,000,000': ['▁', '1', '6', '3', ',', '0', '00,000'],\n",
" '167': ['▁', '1', '6', '7'],\n",
" '17': ['▁', '1', '7'],\n",
" '18': ['▁', '1', '8'],\n",
" '18.1': ['▁', '1', '8', '.', '1'],\n",
" '1830': ['▁', '1', '8', '3', '0'],\n",
" \"1830's\": ['▁', '1', '8', '3', '0', \"'\", 's'],\n",
" '1834': ['▁', '1', '8', '3', '4'],\n",
" '1897': ['▁', '1', '8', '9', '7'],\n",
" '19': ['▁', '1', '9'],\n",
" '19.5': ['▁', '1', '9', '.', '5'],\n",
" '1910': ['▁', '1', '9', '10'],\n",
" '1913': ['▁', '1', '9', '1', '3'],\n",
" '1914': ['▁', '1', '9', '1', '4'],\n",
" '1914-18': ['▁', '1', '9', '1', '4', '-', '1', '8'],\n",
" '1918': ['▁', '1', '9', '1', '8'],\n",
" '1920': ['▁', '1', '9', '2', '0'],\n",
" '1930': ['▁', '1', '9', '3', '0'],\n",
" '1931': ['▁', '1', '9', '3', '1'],\n",
" '1932': ['▁', '1', '9', '3', '2'],\n",
" '1934': ['▁', '1', '9', '3', '4'],\n",
" '1936': ['▁', '1', '9', '3', '6'],\n",
" '1939': ['▁', '1', '9', '3', '9'],\n",
" '1943': ['▁', '1', '9', '4', '3'],\n",
" '1944': ['▁', '1', '9', '4', '4'],\n",
" '1950': ['▁', '1', '9', '5', '0'],\n",
" '1951': ['▁', '1', '9', '5', '1'],\n",
" '1952': ['▁', '1', '9', '5', '2'],\n",
" '1953': ['▁', '1', '9', '5', '3'],\n",
" '1954': ['▁', '1', '9', '5', '4'],\n",
" '1956': ['▁', '1', '9', '5', '6'],\n",
" '1957': ['▁', '1', '9', '5', '7'],\n",
" '1958': ['▁', '1', '9', '5', '8'],\n",
" '1959': ['▁', '1', '9', '5', '9'],\n",
" '1960': ['▁', '1960'],\n",
" '1960s': ['▁', '1960', 's'],\n",
" '1961': ['▁', '1', '9', '6', '1'],\n",
" '1963': ['▁', '1', '9', '6', '3'],\n",
" '19th': ['▁', '1', '9', 'th'],\n",
" '1superceded': ['▁', '1', 'superceded'],\n",
" \"1tho'\": ['▁', '1', 'tho', \"'\"],\n",
" '2': ['▁', '2'],\n",
" '2,000': ['▁', '2', ',', '0', '0', '0'],\n",
" '2,415,000,000': ['▁', '2', ',', '4', '1', '5', ',', '0', '00,000'],\n",
" '20': ['▁', '2', '0'],\n",
" '20-month-old': ['▁', '2', '0', '-', 'month', '-', 'old'],\n",
" '200': ['▁', '2', '0', '0'],\n",
" '20th-century': ['▁', '2', '0', 'th', '-', 'cent', 'ur', 'y'],\n",
" '21': ['▁', '2', '1'],\n",
" '210million': ['▁', '2', '10', 'million'],\n",
" '22': ['▁', '2', '2'],\n",
" '23.1': ['▁', '2', '3', '.', '1'],\n",
" '24': ['▁', '2', '4'],\n",
" '24-strong': ['▁', '2', '4', '-', 'strong'],\n",
" '25': ['▁', '2', '5'],\n",
" '27': ['▁', '2', '7'],\n",
" '28.5': ['▁', '2', '8', '.', '5'],\n",
" '280,000': ['▁', '2', '8', '0', ',', '0', '0', '0'],\n",
" '287': ['▁', '2', '8', '7'],\n",
" '288': ['▁', '2', '8', '8'],\n",
" '2bhoys': ['▁', '2', 'b', 'ho', 'y', 's'],\n",
" '2ole': ['▁', '2', 'o', 'le'],\n",
" '2pianna': ['▁', '2', 'p', 'i', 'an', 'n', 'a'],\n",
" '2skint': ['▁', '2', 's', 'k', 'in', 't'],\n",
" '3': ['▁', '3'],\n",
" '3,000': ['▁', '3', ',', '0', '0', '0'],\n",
" '3.6': ['▁', '3', '.', '6'],\n",
" '3/0': ['▁', '3', '/', '0'],\n",
" '3/4': ['▁', '3', '/', '4'],\n",
" '30': ['▁', '3', '0'],\n",
" '30-day': ['▁', '3', '0', '-', 'day'],\n",
" '30-minute': ['▁', '3', '0', '-', 'minute'],\n",
" '300,000': ['▁', '3', '00,000'],\n",
" '32': ['▁', '3', '2'],\n",
" '33': ['▁', '3', '3'],\n",
" '34': ['▁', '3', '4'],\n",
" '35': ['▁', '3', '5'],\n",
" '357million': ['▁', '3', '5', '7', 'million'],\n",
" '36': ['▁', '3', '6'],\n",
" '37,000,000': ['▁', '3', '7', ',', '0', '00,000'],\n",
" '37.2': ['▁', '3', '7', '.', '2'],\n",
" '38': ['▁', '3', '8'],\n",
" '4': ['▁', '4'],\n",
" '4.8': ['▁', '4', '.', '8'],\n",
" '40': ['▁', '4', '0'],\n",
" '400': ['▁', '4', '0', '0'],\n",
" '400,000': ['▁', '4', '00,000'],\n",
" '420000': ['▁', '4', '2', '0', '0', '0', '0'],\n",
" '43': ['▁', '4', '3'],\n",
" '450': ['▁', '4', '5', '0'],\n",
" '5': ['▁', '5'],\n",
" '5,000': ['▁', '5', ',', '0', '0', '0'],\n",
" '5.30': ['▁', '5', '.', '3', '0'],\n",
" '5/8': ['▁', '5', '/', '8'],\n",
" '50': ['▁', '5', '0'],\n",
" '50,000': ['▁', '5', '0', ',', '0', '0', '0'],\n",
" '500': ['▁', '5', '0', '0'],\n",
" '53-year-old': ['▁', '5', '3', '-', 'year', '-', 'old'],\n",
" '55': ['▁', '5', '5'],\n",
" '550,000': ['▁', '5', '5', '0', ',', '0', '0', '0'],\n",
" '58': ['▁', '5', '8'],\n",
" '6': ['▁', '6'],\n",
" '6,000': ['▁', '6', ',', '0', '0', '0'],\n",
" '60': ['▁', '6', '0'],\n",
" '600': ['▁', '6', '0', '0'],\n",
" '600,000': ['▁', '6', '00,000'],\n",
" '61-year-old': ['▁', '6', '1', '-', 'year', '-', 'old'],\n",
" '68': ['▁', '6', '8'],\n",
" '6al': ['▁', '6', 'al'],\n",
" '6tic': ['▁', '6', 'tic'],\n",
" '7.30': ['▁', '7', '.', '3', '0'],\n",
" '7.42': ['▁', '7', '.', '4', '2'],\n",
" '70': ['▁', '7', '0'],\n",
" '70,000,000': ['▁', '7', '0', ',', '0', '00,000'],\n",
" '707': ['▁', '7', '0', '7'],\n",
" '73': ['▁', '7', '3'],\n",
" '750': ['▁', '7', '5', '0'],\n",
" '8': ['▁', '8'],\n",
" '8,000,000': ['▁', '8', ',', '0', '00,000'],\n",
" '8.25': ['▁', '8', '.', '2', '5'],\n",
" '8.4': ['▁', '8', '.', '4'],\n",
" '80': ['▁', '8', '0'],\n",
" '800': ['▁', '8', '0', '0'],\n",
" '800,000': ['▁', '8', '00,000'],\n",
" '86': ['▁', '8', '6'],\n",
" '88': ['▁', '8', '8'],\n",
" '88-year-old': ['▁', '8', '8', '-', 'year', '-', 'old'],\n",
" '89': ['▁', '8', '9'],\n",
" '89-year-old': ['▁', '8', '9', '-', 'year', '-', 'old'],\n",
" '9.30': ['▁', '9', '.', '3', '0'],\n",
" '9.40': ['▁', '9', '.', '4', '0'],\n",
" '90-day': ['▁', '9', '0', '-', 'day'],\n",
" '90-minute': ['▁', '9', '0', '-', 'minute'],\n",
" '91': ['▁', '9', '1'],\n",
" '950': ['▁', '9', '5', '0'],\n",
" '97.5': ['▁', '9', '7', '.', '5'],\n",
" ':': ['▁', ':'],\n",
" ';': ['▁', ';'],\n",
" '?': ['▁', '?'],\n",
" 'a': ['▁', 'a'],\n",
" 'abandon': ['▁', 'a', 'b', 'and', 'on'],\n",
" 'abandoned': ['▁', 'a', 'b', 'and', 'on', 'ed'],\n",
" 'abandoning': ['▁', 'a', 'b', 'and', 'on', 'ing'],\n",
" 'abashed': ['▁', 'a', 'bas', 'he', 'd'],\n",
" 'ability': ['▁', 'a', 'b', 'il', 'ity'],\n",
" 'able': ['▁', 'able'],\n",
" 'able-bodied': ['▁', 'able', '-', 'bo', 'die', 'd'],\n",
" 'abolish': ['▁', 'a', 'bo', 'l', 'ish'],\n",
" 'abolished': ['▁', 'a', 'bo', 'l', 'ish', 'ed'],\n",
" 'abolition': ['▁', 'a', 'bo', 'li', 'tion'],\n",
" 'abortion': ['▁', 'a', 'b', 'or', 'tion'],\n",
" 'abou': ['▁', 'a', 'bo', 'u'],\n",
" 'about': ['▁', 'about'],\n",
" 'about-': ['▁', 'about', '-'],\n",
" 'above': ['▁', 'a', 'bo', 've'],\n",
" 'abreast': ['▁', 'a', 'br', 'east'],\n",
" 'abroad': ['▁', 'a', 'b', 'ro', 'ad'],\n",
" 'absence': ['▁', 'a', 'b', 's', 'ence'],\n",
" 'absent': ['▁', 'a', 'b', 's', 'ent'],\n",
" 'absolutely': ['▁', 'a', 'b', 'solut', 'e', 'ly'],\n",
" 'abstraction': ['▁', 'a', 'b', 's', 'tr', 'action'],\n",
" 'abundance': ['▁', 'a', 'b', 'un', 'd', 'ance'],\n",
" 'ac-': ['▁', 'ac', '-'],\n",
" 'academic': ['▁', 'ac', 'a', 'de', 'm', 'ic'],\n",
" 'accent': ['▁', 'ac', 'cent'],\n",
" 'accents': ['▁', 'ac', 'cent', 's'],\n",
" 'accept': ['▁', 'accept'],\n",
" 'acceptable': ['▁', 'accept', 'able'],\n",
" 'accepted': ['▁', 'accept', 'ed'],\n",
" 'accepting': ['▁', 'accept', 'ing'],\n",
" 'accessories': ['▁', 'ac', 'ce', 's', 'so', 'ries'],\n",
" 'accident': ['▁', 'ac', 'c', 'id', 'ent'],\n",
" 'accidental': ['▁', 'ac', 'c', 'id', 'ent', 'al'],\n",
" 'accommodate': ['▁', 'ac', 'com', 'mo', 'date'],\n",
" 'accommodation': ['▁', 'ac', 'com', 'mo', 'd', 'ation'],\n",
" 'accompanied': ['▁', 'ac', 'com', 'pan', 'i', 'ed'],\n",
" 'accompanist': ['▁', 'ac', 'com', 'pan', 'is', 't'],\n",
" 'accompany': ['▁', 'ac', 'com', 'p', 'any'],\n",
" 'accomplished': ['▁', 'ac', 'com', 'p', 'l', 'ish', 'ed'],\n",
" 'accomplishments': ['▁', 'ac', 'com', 'p', 'l', 'ish', 'ment', 's'],\n",
" 'according': ['▁', 'ac', 'c', 'or', 'd', 'ing'],\n",
" 'account': ['▁', 'ac', 'count'],\n",
" 'accountancy': ['▁', 'ac', 'count', 'an', 'c', 'y'],\n",
" 'accra': ['▁', 'ac', 'c', 'ra'],\n",
" \"accra's\": ['▁', 'ac', 'c', 'ra', \"'\", 's'],\n",
" 'accuracy': ['▁', 'ac', 'cur', 'ac', 'y'],\n",
" 'accurate': ['▁', 'ac', 'cur', 'ate'],\n",
" 'accurately': ['▁', 'ac', 'cur', 'ate', 'ly'],\n",
" 'accused': ['▁', 'ac', 'c', 'used'],\n",
" 'achieved': ['▁', 'a', 'ch', 'i', 'e', 'v', 'ed'],\n",
" 'achievement': ['▁', 'a', 'ch', 'i', 'e', 've', 'ment'],\n",
" 'acquaintance': ['▁', 'ac', 'q', 'u', 'a', 'in', 't', 'ance'],\n",
" 'acquaintances': ['▁', 'ac', 'q', 'u', 'a', 'in', 't', 'ance', 's'],\n",
" 'acres': ['▁', 'ac', 're', 's'],\n",
" 'across': ['▁', 'a', 'cross'],\n",
" 'act': ['▁', 'act'],\n",
" 'acting': ['▁', 'act', 'ing'],\n",
" 'action': ['▁', 'action'],\n",
" 'actions': ['▁', 'action', 's'],\n",
" 'active': ['▁', 'act', 'ive'],\n",
" 'activists': ['▁', 'act', 'i', 'vi', 'st', 's'],\n",
" 'activities': ['▁', 'act', 'i', 'v', 'it', 'ies'],\n",
" 'activity': ['▁', 'act', 'i', 'v', 'ity'],\n",
" 'acton': ['▁', 'act', 'on'],\n",
" 'actor': ['▁', 'act', 'or'],\n",
" 'actress': ['▁', 'act', 're', 's', 's'],\n",
" 'acts': ['▁', 'act', 's'],\n",
" 'actual': ['▁', 'act', 'ual'],\n",
" 'actually': ['▁', 'act', 'ual', 'ly'],\n",
" 'adamafio': ['▁', 'ad', 'a', 'ma', 'f', 'i', 'o'],\n",
" 'adaptation': ['▁', 'ad', 'ap', 't', 'ation'],\n",
" 'adapted': ['▁', 'ad', 'ap', 'ted'],\n",
" 'adapting': ['▁', 'ad', 'ap', 't', 'ing'],\n",
" 'add': ['▁', 'ad', 'd'],\n",
" 'added': ['▁', 'ad', 'd', 'ed'],\n",
" 'adding': ['▁', 'adding'],\n",
" 'addition': ['▁', 'ad', 'd', 'it', 'ion'],\n",
" 'additions': ['▁', 'ad', 'd', 'it', 'ion', 's'],\n",
" 'address': ['▁', 'ad', 'dr', 'es', 's'],\n",
" 'addressed': ['▁', 'ad', 'dr', 'es', 's', 'ed'],\n",
" 'addresses': ['▁', 'ad', 'dr', 'es', 'se', 's'],\n",
" 'addressing': ['▁', 'ad', 'dr', 'es', 's', 'ing'],\n",
" 'adenauer': ['▁', 'adenauer'],\n",
" \"adenauer's\": ['▁', 'adenauer', \"'\", 's'],\n",
" 'adequate': ['▁', 'ad', 'equa', 'te'],\n",
" 'adhem': ['▁', 'ad', 'he', 'm'],\n",
" 'adjust': ['▁', 'ad', 'just'],\n",
" 'adjustment': ['▁', 'ad', 'just', 'ment'],\n",
" 'administration': ['▁', 'ad', 'ministr', 'ation'],\n",
" \"administration's\": ['▁', 'ad', 'ministr', 'ation', \"'\", 's'],\n",
" 'administrative': ['▁', 'ad', 'ministr', 'at', 'ive'],\n",
" 'admiralty': ['▁', 'ad', 'm', 'i', 'r', 'al', 'ty'],\n",
" 'admire': ['▁', 'ad', 'm', 'i', 're'],\n",
" 'admit': ['▁', 'ad', 'm', 'it'],\n",
" 'admitted': ['▁', 'ad', 'm', 'it', 'ted'],\n",
" 'admitting': ['▁', 'ad', 'm', 'it', 't', 'ing'],\n",
" 'adopted': ['▁', 'a', 'do', 'p', 'ted'],\n",
" 'adopting': ['▁', 'a', 'do', 'p', 't', 'ing'],\n",
" 'adoption': ['▁', 'a', 'do', 'p', 'tion'],\n",
" 'adult': ['▁', 'ad', 'ul', 't'],\n",
" 'advance': ['▁', 'ad', 'v', 'ance'],\n",
" 'advanced': ['▁', 'ad', 'v', 'ance', 'd'],\n",
" 'advancing': ['▁', 'ad', 'v', 'an', 'c', 'ing'],\n",
" 'advantage': ['▁', 'advantage'],\n",
" 'advantages': ['▁', 'advantage', 's'],\n",
" 'advertisement': ['▁', 'ad', 'ver', 't', 'is', 'e', 'ment'],\n",
" 'advertisements': ['▁', 'ad', 'ver', 't', 'is', 'ements'],\n",
" 'advice': ['▁', 'advi', 'ce'],\n",
" 'advisability': ['▁', 'advi', 's', 'a', 'b', 'il', 'ity'],\n",
" 'advise': ['▁', 'advise'],\n",
" 'advised': ['▁', 'advise', 'd'],\n",
" 'advisers': ['▁', 'advise', 'r', 's'],\n",
" 'advocate': ['▁', 'ad', 'v', 'o', 'c', 'ate'],\n",
" 'af-': ['▁', 'a', 'f', '-'],\n",
" 'affairs': ['▁', 'a', 'f', 'f', 'air', 's'],\n",
" 'affected': ['▁', 'a', 'f', 'fe', 'c', 'ted'],\n",
" 'affection': ['▁', 'a', 'f', 'fe', 'c', 'tion'],\n",
" 'affilia-': ['▁', 'a', 'f', 'f', 'il', 'i', 'a', '-'],\n",
" 'affiliations': ['▁', 'a', 'f', 'f', 'il', 'i', 'ation', 's'],\n",
" 'affluence': ['▁', 'a', 'f', 'f', 'l', 'u', 'ence'],\n",
" 'affluent': ['▁', 'a', 'f', 'f', 'l', 'u', 'ent'],\n",
" 'afford': ['▁', 'a', 'f', 'for', 'd'],\n",
" 'afraid': ['▁', 'a', 'fr', 'a', 'id'],\n",
" 'africa': ['▁', 'africa'],\n",
" \"africa's\": ['▁', 'africa', \"'\", 's'],\n",
" 'african': ['▁', 'african'],\n",
" 'africans': ['▁', 'african', 's'],\n",
" 'after': ['▁', 'after'],\n",
" 'afternoon': ['▁', 'after', 'no', 'on'],\n",
" 'afterwards': ['▁', 'after', 'ward', 's'],\n",
" 'again': ['▁', 'again'],\n",
" 'against': ['▁', 'against'],\n",
" 'age': ['▁', 'age'],\n",
" 'age-structure': ['▁', 'age', '-', 's', 'tru', 'c', 'ture'],\n",
" 'aged': ['▁', 'aged'],\n",
" 'ageing': ['▁', 'age', 'ing'],\n",
" 'agent': ['▁', 'a', 'g', 'ent'],\n",
" 'agents': ['▁', 'a', 'g', 'ent', 's'],\n",
" 'ages': ['▁', 'age', 's'],\n",
" 'agitation': ['▁', 'a', 'g', 'it', 'ation'],\n",
" 'ago': ['▁', 'a', 'go'],\n",
" 'agree': ['▁', 'agree'],\n",
" 'agreed': ['▁', 'agree', 'd'],\n",
" 'agreement': ['▁', 'agree', 'ment'],\n",
" 'agreements': ['▁', 'agree', 'ment', 's'],\n",
" 'agriculture': ['▁', 'a', 'gr', 'ic', 'ul', 'ture'],\n",
" 'ahead': ['▁', 'a', 'head'],\n",
" 'aid': ['▁', 'a', 'id'],\n",
" 'aide': ['▁', 'a', 'i', 'de'],\n",
" 'aided': ['▁', 'a', 'id', 'ed'],\n",
" 'aides': ['▁', 'a', 'id', 'es'],\n",
" 'aim': ['▁', 'a', 'im'],\n",
" 'aimed': ['▁', 'a', 'im', 'ed'],\n",
" 'aiming': ['▁', 'a', 'im', 'ing'],\n",
" 'air': ['▁', 'air'],\n",
" 'aircraft': ['▁', 'air', 'craft'],\n",
" 'aired': ['▁', 'air', 'ed'],\n",
" \"airliner's\": ['▁', 'air', 'line', 'r', \"'\", 's'],\n",
" 'airmen': ['▁', 'air', 'men'],\n",
" 'airport': ['▁', 'air', 'port'],\n",
" 'akin': ['▁', 'a', 'k', 'in'],\n",
" \"aladdin's\": ['▁', 'al', 'ad', 'd', 'in', \"'\", 's'],\n",
" 'alan': ['▁', 'al', 'an'],\n",
" 'alarm': ['▁', 'al', 'arm'],\n",
" 'alarmed': ['▁', 'al', 'arm', 'ed'],\n",
" 'alas': ['▁', 'al', 'as'],\n",
" 'alcoholic': ['▁', 'al', 'co', 'ho', 'li', 'c'],\n",
" 'algeria': ['▁', 'al', 'g', 'er', 'i', 'a'],\n",
" 'alike': ['▁', 'a', 'like'],\n",
" 'alive': ['▁', 'a', 'live'],\n",
" 'all': ['▁', 'all'],\n",
" 'all-regular': ['▁', 'all', '-', 'regular'],\n",
" 'alleged': ['▁', 'al', 'leg', 'ed'],\n",
" 'allen': ['▁', 'all', 'en'],\n",
" 'alleviation': ['▁', 'alleviation'],\n",
" 'alley': ['▁', 'al', 'le', 'y'],\n",
" 'alliance': ['▁', 'all', 'i', 'ance'],\n",
" 'alliances': ['▁', 'all', 'i', 'ance', 's'],\n",
" 'allied': ['▁', 'all', 'i', 'ed'],\n",
" 'allies': ['▁', 'all', 'ies'],\n",
" 'allow': ['▁', 'allow'],\n",
" 'allowance': ['▁', 'allow', 'ance'],\n",
" 'allowances': ['▁', 'allow', 'ance', 's'],\n",
" 'allowed': ['▁', 'allow', 'ed'],\n",
" 'allowing': ['▁', 'allow', 'ing'],\n",
" 'ally': ['▁', 'al', 'ly'],\n",
" 'almost': ['▁', 'al', 'most'],\n",
" 'alone': ['▁', 'al', 'one'],\n",
" 'along': ['▁', 'a', 'long'],\n",
" 'alongside': ['▁', 'a', 'long', 'side'],\n",
" 'aloud': ['▁', 'a', 'lo', 'ud'],\n",
" 'already': ['▁', 'al', 'read', 'y'],\n",
" 'also': ['▁', 'also'],\n",
" 'alter': ['▁', 'al', 'ter'],\n",
" 'alternative': ['▁', 'al', 'ter', 'n', 'at', 'ive'],\n",
" 'alternatively': ['▁', 'al', 'ter', 'n', 'at', 'ive', 'ly'],\n",
" 'alternatives': ['▁', 'al', 'ter', 'n', 'at', 'ive', 's'],\n",
" 'although': ['▁', 'al', 'though'],\n",
" 'altogether': ['▁', 'al', 'together'],\n",
" 'altos': ['▁', 'al', 'to', 's'],\n",
" 'always': ['▁', 'always'],\n",
" 'am': ['▁', 'am'],\n",
" 'amateur': ['▁', 'am', 'ate', 'ur'],\n",
" 'amazed': ['▁', 'a', 'ma', 'z', 'ed'],\n",
" 'amazing': ['▁', 'a', 'ma', 'z', 'ing'],\n",
" 'ambassador': ['▁', 'am', 'bas', 's', 'ad', 'or'],\n",
" 'amber': ['▁', 'a', 'mber'],\n",
" 'ambition': ['▁', 'am', 'b', 'it', 'ion'],\n",
" 'ambitious': ['▁', 'am', 'b', 'it', 'i', 'ous'],\n",
" 'ambulance': ['▁', 'am', 'b', 'ul', 'ance'],\n",
" 'ambulances': ['▁', 'am', 'b', 'ul', 'ance', 's'],\n",
" 'america': ['▁', 'america'],\n",
" \"america's\": ['▁', 'america', \"'\", 's'],\n",
" 'american': ['▁', 'american'],\n",
" 'american-born': ['▁', 'american', '-', 'b', 'or', 'n'],\n",
" 'americans': ['▁', 'american', 's'],\n",
" 'amid': ['▁', 'am', 'id'],\n",
" 'ammunition': ['▁', 'am', 'm', 'un', 'it', 'ion'],\n",
" 'among': ['▁', 'among'],\n",
" 'amount': ['▁', 'a', 'mo', 'un', 't'],\n",
" 'ample': ['▁', 'amp', 'le'],\n",
" 'amusement': ['▁', 'am', 'use', 'ment'],\n",
" 'amusing': ['▁', 'am', 'us', 'ing'],\n",
" 'an': ['▁', 'an'],\n",
" 'analogy': ['▁', 'an', 'a', 'lo', 'g', 'y'],\n",
" 'analysed': ['▁', 'an', 'a', 'ly', 's', 'ed'],\n",
" 'anchor': ['▁', 'an', 'ch', 'or'],\n",
" 'ancient': ['▁', 'an', 'c', 'i', 'ent'],\n",
" 'and': ['▁', 'and'],\n",
" 'andrei': ['▁', 'and', 're', 'i'],\n",
" 'andrew': ['▁', 'and', 're', 'w'],\n",
" 'anecdotal': ['▁', 'an', 'e', 'c', 'do', 't', 'al'],\n",
" 'angel': ['▁', 'ang', 'el'],\n",
" 'angeles': ['▁', 'ang', 'el', 'es'],\n",
" 'angelo': ['▁', 'ang', 'e', 'lo'],\n",
" 'anger': ['▁', 'ang', 'er'],\n",
" 'anglais': ['▁', 'ang', 'la', 'is'],\n",
" 'angle': ['▁', 'ang', 'le'],\n",
" 'anglesey': ['▁', 'anglesey'],\n",
" \"anglesey's\": ['▁', 'anglesey', \"'\", 's'],\n",
" 'anglesey-road': ['▁', 'anglesey', '-', 'ro', 'ad'],\n",
" 'angola': ['▁', 'an', 'go', 'la'],\n",
" 'angrily': ['▁', 'an', 'gr', 'i', 'ly'],\n",
" 'angry': ['▁', 'ang', 'ry'],\n",
" 'ann': ['▁', 'an', 'n'],\n",
" 'anna': ['▁', 'an', 'n', 'a'],\n",
" 'announced': ['▁', 'an', 'no', 'un', 'c', 'ed'],\n",
" 'announcement': ['▁', 'an', 'no', 'un', 'ce', 'ment'],\n",
" 'announcing': ['▁', 'an', 'no', 'un', 'c', 'ing'],\n",
" 'annoyed': ['▁', 'an', 'no', 'y', 'ed'],\n",
" 'annual': ['▁', 'an', 'n', 'ual'],\n",
" 'another': ['▁', 'another'],\n",
" 'answer': ['▁', 'answer'],\n",
" 'answered': ['▁', 'answer', 'ed'],\n",
" 'answering': ['▁', 'answer', 'ing'],\n",
" 'antagonism': ['▁', 'ant', 'a', 'g', 'on', 'is', 'm'],\n",
" 'anthony': ['▁', 'an', 'th', 'on', 'y'],\n",
" 'anti-apartheid': ['▁', 'ant', 'i', '-', 'a', 'part', 'he', 'id'],\n",
" 'anti-bomb': ['▁', 'ant', 'i', '-', 'bomb'],\n",
" 'anti-german': ['▁', 'ant', 'i', '-', 'german'],\n",
" 'anti-nato': ['▁', 'ant', 'i', '-', 'nato'],\n",
" 'anti-negro': ['▁', 'ant', 'i', '-', 'negro'],\n",
" 'anti-nuclear': ['▁', 'ant', 'i', '-', 'nuclear'],\n",
" 'anti-soviet': ['▁', 'ant', 'i', '-', 'soviet'],\n",
" 'anti-tory': ['▁', 'ant', 'i', '-', 'tory'],\n",
" 'anticipation': ['▁', 'an', 'tic', 'ip', 'ation'],\n",
" 'antonioni': ['▁', 'ant', 'on', 'ion', 'i'],\n",
" \"antonioni's\": ['▁', 'ant', 'on', 'ion', 'i', \"'\", 's'],\n",
" 'any': ['▁', 'any'],\n",
" 'any-': ['▁', 'any', '-'],\n",
" 'anybody': ['▁', 'any', 'body'],\n",
" \"anybody's\": ['▁', 'any', 'body', \"'\", 's'],\n",
" 'anyone': ['▁', 'any', 'one'],\n",
" 'anything': ['▁', 'any', 'thing'],\n",
" 'anyway': ['▁', 'any', 'way'],\n",
" 'apart': ['▁', 'a', 'part'],\n",
" 'apartheid': ['▁', 'a', 'part', 'he', 'id'],\n",
" 'apathetic': ['▁', 'a', 'pa', 'the', 'tic'],\n",
" 'apathy': ['▁', 'a', 'pa', 'th', 'y'],\n",
" 'apex': ['▁', 'ap', 'ex'],\n",
" 'apocalypse': ['▁', 'a', 'po', 'c', 'a', 'ly', 'p', 'se'],\n",
" 'apologising': ['▁', 'a', 'po', 'lo', 'g', 'is', 'ing'],\n",
" 'appalled': ['▁', 'app', 'all', 'ed'],\n",
" 'appalling': ['▁', 'app', 'all', 'ing'],\n",
" 'apparatus': ['▁', 'app', 'ar', 'at', 'us'],\n",
" 'apparent': ['▁', 'app', 'ar', 'ent'],\n",
" 'apparently': ['▁', 'app', 'ar', 'ent', 'ly'],\n",
" 'appeal': ['▁', 'appeal'],\n",
" 'appealing': ['▁', 'appeal', 'ing'],\n",
" 'appeals': ['▁', 'appeal', 's'],\n",
" 'appear': ['▁', 'appear'],\n",
" 'appearance': ['▁', 'appear', 'ance'],\n",
" 'appeared': ['▁', 'appear', 'ed'],\n",
" 'appears': ['▁', 'appear', 's'],\n",
" 'appeasement': ['▁', 'app', 'e', 'a', 'se', 'ment'],\n",
" 'applauding': ['▁', 'app', 'la', 'ud', 'ing'],\n",
" 'appliances': ['▁', 'app', 'li', 'ance', 's'],\n",
" 'application': ['▁', 'app', 'li', 'c', 'ation'],\n",
" 'applications': ['▁', 'app', 'li', 'c', 'ation', 's'],\n",
" 'applied': ['▁', 'app', 'li', 'ed'],\n",
" 'apply': ['▁', 'app', 'ly'],\n",
" 'appointed': ['▁', 'ap', 'point', 'ed'],\n",
" 'appointment': ['▁', 'ap', 'point', 'ment'],\n",
" 'appreciable': ['▁', 'app', 're', 'c', 'i', 'able'],\n",
" 'appreciably': ['▁', 'app', 're', 'c', 'i', 'ably'],\n",
" 'appreciated': ['▁', 'app', 're', 'c', 'i', 'at', 'ed'],\n",
" 'appreciation': ['▁', 'app', 're', 'c', 'i', 'ation'],\n",
" 'apprenticeships': ['▁', 'app', 'r', 'ent', 'i', 'ce', 'ship', 's'],\n",
" 'approach': ['▁', 'ap', 'pro', 'a', 'ch'],\n",
" 'approached': ['▁', 'ap', 'pro', 'a', 'ch', 'ed'],\n",
" 'approaches': ['▁', 'ap', 'pro', 'a', 'che', 's'],\n",
" 'appropriate': ['▁', 'ap', 'pro', 'pri', 'ate'],\n",
" 'appropriated': ['▁', 'ap', 'pro', 'pri', 'at', 'ed'],\n",
" 'approval': ['▁', 'ap', 'pro', 'val'],\n",
" 'approximately': ['▁', 'ap', 'pro', 'x', 'im', 'ate', 'ly'],\n",
" 'april': ['▁', 'a', 'pri', 'l'],\n",
" 'archbishop': ['▁', 'ar', 'ch', 'b', 'is', 'hop'],\n",
" 'arches': ['▁', 'ar', 'che', 's'],\n",
" 'archipelago': ['▁', 'ar', 'ch', 'i', 'pe', 'la', 'go'],\n",
" 'architect': ['▁', 'ar', 'ch', 'it', 'e', 'c', 't'],\n",
" 'architecture': ['▁', 'ar', 'ch', 'it', 'e', 'c', 'ture'],\n",
" 'are': ['▁', 'are'],\n",
" 'area': ['▁', 'are', 'a'],\n",
" 'areas': ['▁', 'are', 'as'],\n",
" \"aren't\": ['▁', 'are', 'n', \"'\", 't'],\n",
" 'arguably': ['▁', 'ar', 'gu', 'ably'],\n",
" 'argued': ['▁', 'ar', 'gu', 'ed'],\n",
" 'argues': ['▁', 'ar', 'gu', 'es'],\n",
" 'arguing': ['▁', 'ar', 'gu', 'ing'],\n",
" 'argument': ['▁', 'ar', 'gu', 'ment'],\n",
" 'arguments': ['▁', 'ar', 'gu', 'ment', 's'],\n",
" 'arise': ['▁', 'a', 'rise'],\n",
" 'arises': ['▁', 'a', 'rise', 's'],\n",
" 'arm': ['▁', 'arm'],\n",
" 'armament': ['▁', 'arm', 'a', 'ment'],\n",
" 'armaments': ['▁', 'arm', 'a', 'ment', 's'],\n",
" 'armed': ['▁', 'arm', 'ed'],\n",
" 'armoured': ['▁', 'arm', 'our', 'ed'],\n",
" 'arms': ['▁', 'arm', 's'],\n",
" \"arms'\": ['▁', 'arm', 's', \"'\"],\n",
" 'army': ['▁', 'arm', 'y'],\n",
" 'arnold': ['▁', 'ar', 'n', 'old'],\n",
" 'arose': ['▁', 'a', 'ro', 'se'],\n",
" 'around': ['▁', 'a', 'round'],\n",
" 'aroused': ['▁', 'ar', 'ous', 'ed'],\n",
" 'arrange': ['▁', 'ar', 'range'],\n",
" 'arranged': ['▁', 'ar', 'range', 'd'],\n",
" 'arrangement': ['▁', 'ar', 'range', 'ment'],\n",
" 'arrangements': ['▁', 'ar', 'range', 'ment', 's'],\n",
" 'arranging': ['▁', 'ar', 'r', 'ang', 'ing'],\n",
" 'arrears': ['▁', 'ar', 're', 'ar', 's'],\n",
" 'arrested': ['▁', 'ar', 'rest', 'ed'],\n",
" 'arrival': ['▁', 'ar', 'r', 'i', 'val'],\n",
" 'arrive': ['▁', 'ar', 'r', 'ive'],\n",
" 'arrived': ['▁', 'arrived'],\n",
" 'arrives': ['▁', 'ar', 'r', 'ive', 's'],\n",
" 'arrogant': ['▁', 'ar', 'ro', 'g', 'ant'],\n",
" 'art': ['▁', 'ar', 't'],\n",
" 'arthur': ['▁', 'ar', 'th', 'ur'],\n",
" 'article': ['▁', 'ar', 'tic', 'le'],\n",
" 'articles': ['▁', 'ar', 'tic', 'le', 's'],\n",
" 'articulation': ['▁', 'ar', 'tic', 'ul', 'ation'],\n",
" 'artistic': ['▁', 'ar', 'tist', 'ic'],\n",
" 'artistically': ['▁', 'ar', 'tist', 'ical', 'ly'],\n",
" 'artistry': ['▁', 'ar', 'tist', 'ry'],\n",
" 'artists': ['▁', 'ar', 'tist', 's'],\n",
" 'as': ['▁', 'as'],\n",
" 'ascents': ['▁', 'as', 'cent', 's'],\n",
" 'ash': ['▁', 'as', 'h'],\n",
" 'ashen': ['▁', 'as', 'he', 'n'],\n",
" 'ask': ['▁', 'as', 'k'],\n",
" 'asked': ['▁', 'asked'],\n",
" 'asking': ['▁', 'asking'],\n",
" 'aspect': ['▁', 'a', 'spect'],\n",
" 'aspects': ['▁', 'a', 'spect', 's'],\n",
" 'aspiring': ['▁', 'as', 'p', 'i', 'r', 'ing'],\n",
" 'assault': ['▁', 'as', 's', 'a', 'ul', 't'],\n",
" 'assembler': ['▁', 'as', 'se', 'm', 'bl', 'er'],\n",
" 'assembly': ['▁', 'as', 'se', 'm', 'b', 'ly'],\n",
" 'assess': ['▁', 'as', 'se', 's', 's'],\n",
" 'assessment': ['▁', 'as', 'se', 's', 's', 'ment'],\n",
" 'assistance': ['▁', 'as', 's', 'istance'],\n",
" 'assistant': ['▁', 'as', 's', 'is', 't', 'ant'],\n",
" 'assistants': ['▁', 'as', 's', 'is', 't', 'ant', 's'],\n",
" 'associate': ['▁', 'associat', 'e'],\n",
" 'associated': ['▁', 'associat', 'ed'],\n",
" 'associates': ['▁', 'associat', 'es'],\n",
" 'association': ['▁', 'associat', 'ion'],\n",
" 'assortment': ['▁', 'as', 's', 'or', 't', 'ment'],\n",
" 'assumption': ['▁', 'assumption'],\n",
" 'assurance': ['▁', 'as', 's', 'ur', 'ance'],\n",
" 'astronaut': ['▁', 'as', 'tr', 'on', 'a', 'u', 't'],\n",
" 'astute': ['▁', 'a', 'st', 'u', 'te'],\n",
" 'at': ['▁', 'at'],\n",
" 'ately': ['▁', 'ate', 'ly'],\n",
" 'atkinson': ['▁', 'at', 'k', 'in', 's', 'on'],\n",
" 'atlantic': ['▁', 'at', 'l', 'an', 'tic'],\n",
" 'atmosphere': ['▁', 'atmospher', 'e'],\n",
" 'atmospheric': ['▁', 'atmospher', 'ic'],\n",
" 'atomic': ['▁', 'a', 'to', 'm', 'ic'],\n",
" 'atoms': ['▁', 'a', 'to', 'm', 's'],\n",
" 'attach': ['▁', 'at', 't', 'a', 'ch'],\n",
" 'attached': ['▁', 'at', 't', 'a', 'ch', 'ed'],\n",
" 'attack': ['▁', 'at', 't', 'a', 'ck'],\n",
" 'attacked': ['▁', 'at', 't', 'a', 'ck', 'ed'],\n",
" 'attacks': ['▁', 'at', 't', 'a', 'ck', 's'],\n",
" 'attainable': ['▁', 'at', 'tain', 'able'],\n",
" 'attempt': ['▁', 'attempt'],\n",
" 'attempted': ['▁', 'attempt', 'ed'],\n",
" 'attempting': ['▁', 'attempt', 'ing'],\n",
" 'attempts': ['▁', 'attempt', 's'],\n",
" 'atten-': ['▁', 'at', 'ten', '-'],\n",
" 'attend': ['▁', 'at', 't', 'end'],\n",
" 'attendance': ['▁', 'at', 't', 'end', 'ance'],\n",
" 'attended': ['▁', 'at', 't', 'end', 'ed'],\n",
" 'attending': ['▁', 'at', 't', 'end', 'ing'],\n",
" 'attention': ['▁', 'at', 'ten', 'tion'],\n",
" 'attitude': ['▁', 'at', 't', 'it', 'u', 'de'],\n",
" 'attitudes': ['▁', 'at', 't', 'it', 'ud', 'es'],\n",
" 'attracted': ['▁', 'at', 'tr', 'act', 'ed'],\n",
" 'attractive': ['▁', 'at', 'tr', 'act', 'ive'],\n",
" 'aubrey': ['▁', 'a', 'u', 'b', 're', 'y'],\n",
" 'audacity': ['▁', 'a', 'ud', 'ac', 'ity'],\n",
" 'auden': ['▁', 'a', 'ud', 'en'],\n",
" 'audience': ['▁', 'a', 'ud', 'i', 'ence'],\n",
" 'audio-tv': ['▁', 'a', 'ud', 'i', 'o', '-', 't', 'v'],\n",
" 'audited': ['▁', 'a', 'ud', 'it', 'ed'],\n",
" 'august': ['▁', 'a', 'ug', 'u', 'st'],\n",
" 'auntie': ['▁', 'a', 'un', 't', 'i', 'e'],\n",
" 'austerity': ['▁', 'a', 'u', 'ster', 'ity'],\n",
" 'australia': ['▁', 'a', 'us', 'tr', 'al', 'i', 'a'],\n",
" 'austria': ['▁', 'a', 'us', 'tri', 'a'],\n",
" 'austrian': ['▁', 'a', 'us', 'tri', 'an'],\n",
" 'authentic': ['▁', 'a', 'u', 'then', 'tic'],\n",
" 'author': ['▁', 'author'],\n",
" 'authorised': ['▁', 'author', 'is', 'ed'],\n",
" 'authorities': ['▁', 'author', 'it', 'ies'],\n",
" 'authority': ['▁', 'author', 'ity'],\n",
" 'automatically': ['▁', 'a', 'u', 'to', 'm', 'at', 'ical', 'ly'],\n",
" 'automation': ['▁', 'a', 'u', 'to', 'm', 'ation'],\n",
" 'autumn': ['▁', 'a', 'u', 't', 'um', 'n'],\n",
" 'available': ['▁', 'a', 'v', 'a', 'il', 'able'],\n",
" 'avenue': ['▁', 'a', 've', 'n', 'ue'],\n",
" 'average': ['▁', 'a', 'ver', 'age'],\n",
" 'averages': ['▁', 'a', 'ver', 'age', 's'],\n",
" 'avert': ['▁', 'a', 'ver', 't'],\n",
" 'aviation': ['▁', 'a', 'vi', 'ation'],\n",
" 'avoid': ['▁', 'a', 'v', 'o', 'id'],\n",
" 'avoided': ['▁', 'a', 'v', 'o', 'id', 'ed'],\n",
" 'avon': ['▁', 'a', 'v', 'on'],\n",
" 'awake': ['▁', 'a', 'w', 'a', 'ke'],\n",
" 'awarded': ['▁', 'a', 'ward', 'ed'],\n",
" 'awards': ['▁', 'a', 'ward', 's'],\n",
" 'aware': ['▁', 'a', 'w', 'are'],\n",
" 'awareness': ['▁', 'a', 'w', 'are', 'ness'],\n",
" 'away': ['▁', 'a', 'way'],\n",
" 'awful': ['▁', 'a', 'w', 'ful'],\n",
" 'awfully': ['▁', 'a', 'w', 'ful', 'ly'],\n",
" 'b': ['▁', 'b'],\n",
" 'b.': ['▁', 'b', '.'],\n",
" 'b.b.c.': ['▁', 'b', '.', 'b', '.', 'c', '.'],\n",
" 'babe': ['▁', 'b', 'a', 'be'],\n",
" 'babel': ['▁', 'b', 'a', 'be', 'l'],\n",
" 'bably': ['▁', 'b', 'ably'],\n",
" 'baby': ['▁', 'b', 'a', 'by'],\n",
" \"baby's\": ['▁', 'b', 'a', 'by', \"'\", 's'],\n",
" 'back': ['▁', 'back'],\n",
" 'backbone': ['▁', 'back', 'b', 'one'],\n",
" 'backed': ['▁', 'back', 'ed'],\n",
" 'backers': ['▁', 'back', 'ers'],\n",
" 'background': ['▁', 'back', 'ground'],\n",
" 'backing': ['▁', 'back', 'ing'],\n",
" 'backstage': ['▁', 'back', 'st', 'age'],\n",
" 'backward': ['▁', 'back', 'ward'],\n",
" 'bad': ['▁', 'b', 'ad'],\n",
" 'badly': ['▁', 'b', 'ad', 'ly'],\n",
" 'baffled': ['▁', 'b', 'a', 'f', 'f', 'led'],\n",
" 'bag': ['▁', 'b', 'a', 'g'],\n",
" 'bagaya': ['▁', 'b', 'a', 'gay', 'a'],\n",
" 'baker': ['▁', 'b', 'a', 'k', 'er'],\n",
" 'balance': ['▁', 'b', 'al', 'ance'],\n",
" 'balance-sheet': ['▁', 'b', 'al', 'ance', '-', 'she', 'e', 't'],\n",
" 'balances': ['▁', 'b', 'al', 'ance', 's'],\n",
" 'bald': ['▁', 'b', 'al', 'd'],\n",
" 'ball': ['▁', 'b', 'all'],\n",
" 'balloon': ['▁', 'b', 'all', 'o', 'on'],\n",
" 'ballyhoo': ['▁', 'b', 'al', 'ly', 'ho', 'o'],\n",
" 'baltic': ['▁', 'b', 'al', 'tic'],\n",
" 'ban': ['▁', 'b', 'an'],\n",
" 'ban-': ['▁', 'b', 'an', '-'],\n",
" 'ban-the-': ['▁', 'b', 'an', '-', 'the', '-'],\n",
" 'ban-the-bomb': ['▁', 'b', 'an', '-', 'the', '-', 'bomb'],\n",
" 'bank': ['▁', 'bank'],\n",
" \"bank's\": ['▁', 'bank', \"'\", 's'],\n",
" 'banking': ['▁', 'bank', 'ing'],\n",
" 'bankrupt': ['▁', 'bank', 'r', 'up', 't'],\n",
" 'banks': ['▁', 'bank', 's'],\n",
" \"banks'\": ['▁', 'bank', 's', \"'\"],\n",
" 'banned': ['▁', 'b', 'an', 'n', 'ed'],\n",
" 'banzie': ['▁', 'b', 'an', 'z', 'i', 'e'],\n",
" 'bar': ['▁', 'b', 'ar'],\n",
" 'barb': ['▁', 'b', 'ar', 'b'],\n",
" 'barbara': ['▁', 'b', 'ar', 'b', 'ar', 'a'],\n",
" 'barbarously': ['▁', 'b', 'ar', 'b', 'ar', 'ous', 'ly'],\n",
" 'barclay': ['▁', 'b', 'ar', 'clay'],\n",
" 'bare': ['▁', 'b', 'are'],\n",
" 'bargain': ['▁', 'b', 'ar', 'g', 'a', 'in'],\n",
" 'bargaining': ['▁', 'b', 'ar', 'g', 'a', 'in', 'ing'],\n",
" 'bark': ['▁', 'b', 'ar', 'k'],\n",
" 'barrier': ['▁', 'b', 'ar', 'r', 'i', 'er'],\n",
" 'barriers': ['▁', 'b', 'ar', 'r', 'i', 'ers'],\n",
" 'barry': ['▁', 'b', 'a', 'rry'],\n",
" 'base': ['▁', 'base'],\n",
" 'based': ['▁', 'bas', 'ed'],\n",
" 'bases': ['▁', 'base', 's'],\n",
" 'basic': ['▁', 'bas', 'ic'],\n",
" 'basin': ['▁', 'bas', 'in'],\n",
" 'basing': ['▁', 'bas', 'ing'],\n",
" 'basis': ['▁', 'bas', 'is'],\n",
" 'baskerville': ['▁', 'bas', 'k', 'er', 'v', 'il', 'le'],\n",
" 'basses': ['▁', 'bas', 'se', 's'],\n",
" 'basting': ['▁', 'bas', 't', 'ing'],\n",
" 'bathing': ['▁', 'b', 'a', 'thing'],\n",
" 'bats': ['▁', 'b', 'at', 's'],\n",
" 'batsman': ['▁', 'b', 'at', 's', 'man'],\n",
" 'battalions': ['▁', 'b', 'at', 't', 'al', 'ion', 's'],\n",
" 'batting': ['▁', 'b', 'at', 't', 'ing'],\n",
" 'battle': ['▁', 'b', 'a', 'ttle'],\n",
" 'bavaria': ['▁', 'b', 'a', 'v', 'ar', 'i', 'a'],\n",
" 'bavarian': ['▁', 'b', 'a', 'v', 'ar', 'i', 'an'],\n",
" 'bavarians': ['▁', 'b', 'a', 'v', 'ar', 'i', 'an', 's'],\n",
" 'bay': ['▁', 'b', 'a', 'y'],\n",
" 'be': ['▁', 'be'],\n",
" 'beach': ['▁', 'b', 'each'],\n",
" 'beaches': ['▁', 'b', 'each', 'es'],\n",
" 'beacon': ['▁', 'be', 'a', 'con'],\n",
" 'beaks': ['▁', 'be', 'a', 'k', 's'],\n",
" 'bean': ['▁', 'be', 'an'],\n",
" 'bear': ['▁', 'be', 'ar'],\n",
" 'bearer': ['▁', 'be', 'are', 'r'],\n",
" 'bears': ['▁', 'be', 'ar', 's'],\n",
" 'beastly': ['▁', 'b', 'east', 'ly'],\n",
" 'beasts': ['▁', 'b', 'east', 's'],\n",
" 'beaten': ['▁', 'be', 'a', 'ten'],\n",
" 'beautiful': ['▁', 'be', 'a', 'u', 't', 'i', 'ful'],\n",
" 'beautifully': ['▁', 'be', 'a', 'u', 't', 'i', 'ful', 'ly'],\n",
" 'beauty': ['▁', 'be', 'a', 'u', 'ty'],\n",
" 'became': ['▁', 'be', 'came'],\n",
" 'because': ['▁', 'because'],\n",
" 'beckoning': ['▁', 'be', 'ck', 'on', 'ing'],\n",
" 'become': ['▁', 'be', 'come'],\n",
" 'becomes': ['▁', 'be', 'come', 's'],\n",
" 'becoming': ['▁', 'be', 'com', 'ing'],\n",
" 'bed': ['▁', 'b', 'ed'],\n",
" 'bedlam': ['▁', 'b', 'ed', 'la', 'm'],\n",
" 'beds': ['▁', 'b', 'ed', 's'],\n",
" 'bedspreads': ['▁', 'b', 'ed', 's', 'p', 'read', 's'],\n",
" 'beech': ['▁', 'be', 'e', 'ch'],\n",
" 'been': ['▁', 'been'],\n",
" 'before': ['▁', 'before'],\n",
" 'befriended': ['▁', 'be', 'friend', 'ed'],\n",
" 'began': ['▁', 'be', 'g', 'an'],\n",
" 'begin': ['▁', 'be', 'g', 'in'],\n",
" 'beginner': ['▁', 'be', 'g', 'in', 'n', 'er'],\n",
" 'beginning': ['▁', 'be', 'g', 'in', 'n', 'ing'],\n",
" 'begins': ['▁', 'be', 'g', 'in', 's'],\n",
" 'begun': ['▁', 'be', 'g', 'un'],\n",
" 'behan': ['▁', 'be', 'h', 'an'],\n",
" 'behave': ['▁', 'be', 'have'],\n",
" 'behaviour': ['▁', 'be', 'h', 'a', 'vi', 'our'],\n",
" 'behind': ['▁', 'behind'],\n",
" 'beier': ['▁', 'be', 'i', 'er'],\n",
" 'being': ['▁', 'being'],\n",
" 'belgian': ['▁', 'be', 'l', 'g', 'i', 'an'],\n",
" 'belgium': ['▁', 'be', 'l', 'giu', 'm'],\n",
" 'belgrade': ['▁', 'be', 'l', 'gr', 'a', 'de'],\n",
" 'belief': ['▁', 'be', 'li', 'e', 'f'],\n",
" 'believe': ['▁', 'believe'],\n",
" 'believed': ['▁', 'believed'],\n",
" 'believes': ['▁', 'believe', 's'],\n",
" 'bell': ['▁', 'be', 'll'],\n",
" \"bell's\": ['▁', 'be', 'll', \"'\", 's'],\n",
" 'belmondo': ['▁', 'be', 'l', 'mon', 'do'],\n",
" 'belonged': ['▁', 'be', 'long', 'ed'],\n",
" 'belongs': ['▁', 'be', 'long', 's'],\n",
" 'below': ['▁', 'be', 'low'],\n",
" 'belt': ['▁', 'be', 'l', 't'],\n",
" 'ben': ['▁', 'be', 'n'],\n",
" 'bench': ['▁', 'be', 'n', 'ch'],\n",
" 'benches': ['▁', 'be', 'n', 'che', 's'],\n",
" 'bend': ['▁', 'b', 'end'],\n",
" 'bending': ['▁', 'b', 'end', 'ing'],\n",
" 'benefits': ['▁', 'be', 'ne', 'f', 'its'],\n",
" 'bent': ['▁', 'b', 'ent'],\n",
" 'ber': ['▁', 'be', 'r'],\n",
" 'berlin': ['▁', 'berlin'],\n",
" \"berlin's\": ['▁', 'berlin', \"'\", 's'],\n",
" 'bernhard': ['▁', 'be', 'r', 'n', 'hard'],\n",
" 'berry': ['▁', 'be', 'rry'],\n",
" 'bertrand': ['▁', 'bert', 'r', 'and'],\n",
" 'beset': ['▁', 'be', 'set'],\n",
" 'beside': ['▁', 'be', 'side'],\n",
" 'best': ['▁', 'best'],\n",
" 'best-seller': ['▁', 'best', '-', 's', 'ell', 'er'],\n",
" 'bet': ['▁', 'be', 't'],\n",
" 'betjeman': ['▁', 'be', 't', 'je', 'man'],\n",
" 'betrayal': ['▁', 'be', 'tr', 'a', 'y', 'al'],\n",
" 'betrayed': ['▁', 'be', 'tr', 'a', 'y', 'ed'],\n",
" 'better': ['▁', 'better'],\n",
" 'better-': ['▁', 'better', '-'],\n",
" \"betti's\": ['▁', 'be', 't', 't', 'i', \"'\", 's'],\n",
" 'between': ['▁', 'between'],\n",
" 'bevel': ['▁', 'be', 've', 'l'],\n",
" 'bevelled': ['▁', 'be', 'v', 'ell', 'ed'],\n",
" 'beware': ['▁', 'be', 'w', 'are'],\n",
" 'bewildered': ['▁', 'be', 'w', 'il', 'd', 'er', 'ed'],\n",
" 'beyond': ['▁', 'beyond'],\n",
" 'bidet': ['▁', 'b', 'i', 'de', 't'],\n",
" 'big': ['▁', 'big'],\n",
" 'bigger': ['▁', 'big', 'g', 'er'],\n",
" 'biggest': ['▁', 'big', 'g', 'est'],\n",
" 'bill': ['▁', 'b', 'ill'],\n",
" 'bills': ['▁', 'b', 'ill', 's'],\n",
" 'binding': ['▁', 'b', 'in', 'd', 'ing'],\n",
" 'biological': ['▁', 'b', 'i', 'o', 'lo', 'g', 'ical'],\n",
" 'bird': ['▁', 'b', 'i', 'r', 'd'],\n",
" 'birds': ['▁', 'b', 'i', 'r', 'd', 's'],\n",
" 'bishop': ['▁', 'b', 'is', 'hop'],\n",
" 'bit': ['▁', 'b', 'it'],\n",
" 'bite': ['▁', 'b', 'it', 'e'],\n",
" 'bits': ['▁', 'b', 'its'],\n",
" 'bitter-sweet': ['▁', 'b', 'it', 'ter', '-', 's', 'we', 'e', 't'],\n",
" 'bitterest': ['▁', 'b', 'it', 'ter', 'est'],\n",
" 'bitterly': ['▁', 'b', 'it', 'ter', 'ly'],\n",
" 'bituminized': ['▁', 'b', 'it', 'um', 'in', 'i', 'z', 'ed'],\n",
" 'black': ['▁', 'bl', 'a', 'ck'],\n",
" 'black-': ['▁', 'bl', 'a', 'ck', '-'],\n",
" 'black-listed': ['▁', 'bl', 'a', 'ck', '-', 'li', 'st', 'ed'],\n",
" 'blackbird': ['▁', 'bl', 'a', 'ck', 'b', 'i', 'r', 'd'],\n",
" 'blacks': ['▁', 'bl', 'a', 'ck', 's'],\n",
" 'blame': ['▁', 'bl', 'a', 'me'],\n",
" 'blamed': ['▁', 'bl', 'am', 'ed'],\n",
" 'blander': ['▁', 'bl', 'and', 'er'],\n",
" 'blank': ['▁', 'bl', 'an', 'k'],\n",
" 'blend': ['▁', 'bl', 'end'],\n",
" 'blight': ['▁', 'b', 'light'],\n",
" 'blind': ['▁', 'bl', 'in', 'd'],\n",
" 'blinked': ['▁', 'bl', 'in', 'k', 'ed'],\n",
" 'block': ['▁', 'block'],\n",
" 'blocks': ['▁', 'block', 's'],\n",
" 'bloem-': ['▁', 'b', 'lo', 'e', 'm', '-'],\n",
" 'blond': ['▁', 'bl', 'on', 'd'],\n",
" 'blood': ['▁', 'b', 'lo', 'od'],\n",
" 'bloodstained': ['▁', 'b', 'lo', 'od', 's', 'tain', 'ed'],\n",
" 'bloody': ['▁', 'b', 'lo', 'od', 'y'],\n",
" 'blouse': ['▁', 'b', 'lo', 'use'],\n",
" 'blouses': ['▁', 'bl', 'ous', 'es'],\n",
" 'blow': ['▁', 'b', 'low'],\n",
" 'blowflies': ['▁', 'b', 'low', 'f', 'l', 'ies'],\n",
" 'blown': ['▁', 'bl', 'own'],\n",
" 'blue': ['▁', 'bl', 'ue'],\n",
" 'blunt': ['▁', 'bl', 'un', 't'],\n",
" 'bluntly': ['▁', 'bl', 'un', 't', 'ly'],\n",
" 'bluster': ['▁', 'bl', 'u', 'ster'],\n",
" 'board': ['▁', 'board'],\n",
" 'boat': ['▁', 'bo', 'at'],\n",
" 'boat-train': ['▁', 'bo', 'at', '-', 'train'],\n",
" 'bobby': ['▁', 'bo', 'b', 'by'],\n",
" 'bodies': ['▁', 'bo', 'd', 'ies'],\n",
" 'body': ['▁', 'body'],\n",
" 'boeing': ['▁', 'bo', 'e', 'ing'],\n",
" 'bogy': ['▁', 'bo', 'g', 'y'],\n",
" 'boiled': ['▁', 'bo', 'il', 'ed'],\n",
" 'boils': ['▁', 'bo', 'il', 's'],\n",
" 'bold': ['▁', 'b', 'old'],\n",
" 'boldly': ['▁', 'b', 'old', 'ly'],\n",
" 'bolt': ['▁', 'bo', 'l', 't'],\n",
" 'bolted': ['▁', 'bo', 'l', 'ted'],\n",
" 'bomb': ['▁', 'bomb'],\n",
" 'bombay': ['▁', 'bomb', 'a', 'y'],\n",
" 'bombed': ['▁', 'bomb', 'ed'],\n",
" 'bombers': ['▁', 'bomb', 'ers'],\n",
" 'bonded': ['▁', 'b', 'on', 'd', 'ed'],\n",
" 'bone': ['▁', 'b', 'one'],\n",
" 'bones': ['▁', 'b', 'one', 's'],\n",
" 'bonn': ['▁', 'b', 'on', 'n'],\n",
" \"bonn's\": ['▁', 'b', 'on', 'n', \"'\", 's'],\n",
" 'book': ['▁', 'book'],\n",
" 'booklet': ['▁', 'book', 'le', 't'],\n",
" 'books': ['▁', 'book', 's'],\n",
" 'booming': ['▁', 'bo', 'o', 'm', 'ing'],\n",
" 'border': ['▁', 'b', 'order'],\n",
" 'bore': ['▁', 'bo', 're'],\n",
" 'bored': ['▁', 'b', 'or', 'ed'],\n",
" 'boredom': ['▁', 'bo', 're', 'do', 'm'],\n",
" 'bores': ['▁', 'bo', 're', 's'],\n",
" 'born': ['▁', 'b', 'or', 'n'],\n",
" 'borough': ['▁', 'bo', 'rough'],\n",
" 'borrow': ['▁', 'b', 'or', 'ro', 'w'],\n",
" 'borstal': ['▁', 'b', 'or', 'st', 'al'],\n",
" 'bosoms': ['▁', 'bo', 'so', 'm', 's'],\n",
" 'bossed': ['▁', 'bo', 's', 's', 'ed'],\n",
" 'bosses': ['▁', 'bo', 's', 'se', 's'],\n",
" 'both': ['▁', 'both'],\n",
" 'bottle': ['▁', 'bo', 'ttle'],\n",
" 'bottom': ['▁', 'bo', 't', 'to', 'm'],\n",
" 'bought': ['▁', 'bo', 'ug', 'h', 't'],\n",
" 'boun': ['▁', 'bo', 'un'],\n",
" 'bound': ['▁', 'b', 'ound'],\n",
" 'boutiques': ['▁', 'b', 'out', 'i', 'q', 'ue', 's'],\n",
" 'bow': ['▁', 'bo', 'w'],\n",
" 'bow-street': ['▁', 'bo', 'w', '-', 'st', 're', 'e', 't'],\n",
" 'bowed': ['▁', 'bo', 'w', 'ed'],\n",
" 'bowing': ['▁', 'bo', 'w', 'ing'],\n",
" 'bows': ['▁', 'bo', 'w', 's'],\n",
" 'box': ['▁', 'bo', 'x'],\n",
" 'boxes': ['▁', 'bo', 'x', 'es'],\n",
" 'boxing': ['▁', 'bo', 'x', 'ing'],\n",
" 'boy': ['▁', 'bo', 'y'],\n",
" 'boycotted': ['▁', 'bo', 'y', 'cott', 'ed'],\n",
" 'boycotting': ['▁', 'bo', 'y', 'cott', 'ing'],\n",
" 'boyd-orr': ['▁', 'bo', 'y', 'd', '-', 'or', 'r'],\n",
" 'boyle': ['▁', 'bo', 'y', 'le'],\n",
" 'boys': ['▁', 'bo', 'y', 's'],\n",
" 'braces': ['▁', 'br', 'a', 'ce', 's'],\n",
" 'brain': ['▁', 'b', 'rain'],\n",
" 'brain-activity': ['▁', 'b', 'rain', '-', 'act', 'i', 'v', 'ity'],\n",
" 'brain-children': ['▁', 'b', 'rain', '-', 'children'],\n",
" 'brains': ['▁', 'b', 'rain', 's'],\n",
" 'brandy': ['▁', 'br', 'and', 'y'],\n",
" 'brash': ['▁', 'br', 'as', 'h'],\n",
" 'brass': ['▁', 'br', 'as', 's'],\n",
" 'brauchitsch': ['▁', 'br', 'a', 'u', 'ch', 'its', 'ch'],\n",
" 'breach': ['▁', 'br', 'each'],\n",
" 'bread-and-butter': ['▁', 'b', 'read', '-', 'and', '-', 'but', 'ter'],\n",
" 'break': ['▁', 'b', 're', 'a', 'k'],\n",
" 'breaking': ['▁', 'b', 're', 'a', 'k', 'ing'],\n",
" 'breaks': ['▁', 'b', 're', 'a', 'k', 's'],\n",
" 'breath': ['▁', 'b', 're', 'a', 'th'],\n",
" 'breathing': ['▁', 'b', 're', 'a', 'thing'],\n",
" 'breathless': ['▁', 'b', 're', 'a', 'th', 'less'],\n",
" 'breeding': ['▁', 'b', 're', 'ed', 'ing'],\n",
" 'breezily': ['▁', 'b', 're', 'e', 'z', 'i', 'ly'],\n",
" 'brehm': ['▁', 'b', 're', 'h', 'm'],\n",
" 'brella': ['▁', 'br', 'ell', 'a'],\n",
" 'brenda': ['▁', 'br', 'end', 'a'],\n",
" 'brendan': ['▁', 'br', 'end', 'an'],\n",
" \"brendan's\": ['▁', 'br', 'end', 'an', \"'\", 's'],\n",
" 'brentano': ['▁', 'br', 'ent', 'a', 'no'],\n",
" 'brezhnev': ['▁', 'b', 're', 'z', 'h', 'ne', 'v'],\n",
" 'brian': ['▁', 'br', 'i', 'an'],\n",
" 'bridal': ['▁', 'br', 'id', 'al'],\n",
" 'bride': ['▁', 'br', 'i', 'de'],\n",
" 'brief': ['▁', 'brief'],\n",
" 'brief-': ['▁', 'brief', '-'],\n",
" 'briefcase': ['▁', 'brief', 'case'],\n",
" 'briefing': ['▁', 'brief', 'ing'],\n",
" 'brigadiers': ['▁', 'br', 'i', 'g', 'ad', 'i', 'ers'],\n",
" 'bright': ['▁', 'b', 'right'],\n",
" 'brighter': ['▁', 'b', 'right', 'er'],\n",
" 'brightly': ['▁', 'b', 'right', 'ly'],\n",
" \"brighton's\": ['▁', 'b', 'right', 'on', \"'\", 's'],\n",
" 'brilliant': ['▁', 'br', 'ill', 'i', 'ant'],\n",
" 'brilliantly': ['▁', 'br', 'ill', 'i', 'ant', 'ly'],\n",
" 'bring': ['▁', 'br', 'ing'],\n",
" 'brings': ['▁', 'br', 'ing', 's'],\n",
" 'bristled': ['▁', 'br', 'is', 't', 'led'],\n",
" 'bristol': ['▁', 'br', 'is', 'to', 'l'],\n",
" 'britain': ['▁', 'britain'],\n",
" \"britain's\": ['▁', 'britain', \"'\", 's'],\n",
" 'british': ['▁', 'british'],\n",
" 'british-owned': ['▁', 'british', '-', 'own', 'ed'],\n",
" 'britishers': ['▁', 'british', 'ers'],\n",
" 'brittle': ['▁', 'br', 'i', 'ttle'],\n",
" 'broad': ['▁', 'b', 'ro', 'ad'],\n",
" 'broadcast': ['▁', 'b', 'ro', 'ad', 'c', 'a', 'st'],\n",
" 'broadcasting': ['▁', 'b', 'ro', 'ad', 'c', 'a', 'st', 'ing'],\n",
" 'broke': ['▁', 'b', 'ro', 'ke'],\n",
" 'broken': ['▁', 'b', 'ro', 'k', 'en'],\n",
" 'bronx': ['▁', 'br', 'on', 'x'],\n",
" \"brook's\": ['▁', 'b', 'ro', 'o', 'k', \"'\", 's'],\n",
" 'brother': ['▁', 'brother'],\n",
" 'brother-': ['▁', 'brother', '-'],\n",
" 'brother-in-law': ['▁', 'brother', '-', 'in', '-', 'law'],\n",
" 'brought': ['▁', 'brought'],\n",
" 'brown': ['▁', 'brown'],\n",
" \"brown's\": ['▁', 'brown', \"'\", 's'],\n",
" 'bru\"cke': ['▁', 'br', 'u', '\"', 'ck', 'e'],\n",
" 'bruce': ['▁', 'br', 'u', 'ce'],\n",
" 'bruno': ['▁', 'br', 'un', 'o'],\n",
" 'brunswick': ['▁', 'br', 'un', 's', 'w', 'i', 'ck'],\n",
" 'brussels': ['▁', 'br', 'us', 's', 'el', 's'],\n",
" 'brutal': ['▁', 'br', 'u', 't', 'al'],\n",
" 'bryan': ['▁', 'br', 'y', 'an'],\n",
" 'bu\"ckerei': ['▁', 'b', 'u', '\"', 'ck', 'e', 're', 'i'],\n",
" 'buck': ['▁', 'b', 'u', 'ck'],\n",
" 'buckingham': ['▁', 'b', 'u', 'ck', 'ing', 'h', 'am'],\n",
" 'buckley': ['▁', 'b', 'u', 'ck', 'le', 'y'],\n",
" 'budge': ['▁', 'b', 'ud', 'g', 'e'],\n",
" 'budgerigar': ['▁', 'b', 'ud', 'g', 'er', 'i', 'g', 'ar'],\n",
" 'budget': ['▁', 'budget'],\n",
" 'budgetary': ['▁', 'budget', 'ary'],\n",
" 'budgette': ['▁', 'budget', 'te'],\n",
" 'buganda': ['▁', 'b', 'ug', 'and', 'a'],\n",
" 'build': ['▁', 'b', 'u', 'il', 'd'],\n",
" 'building': ['▁', 'building'],\n",
" ...}"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"lex"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.2"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
|