author     Gustaf Rydholm <gustaf.rydholm@gmail.com>   2022-09-13 18:43:21 +0200
committer  Gustaf Rydholm <gustaf.rydholm@gmail.com>   2022-09-13 18:43:21 +0200
commit     bec6649f8202dd1a12209ed2e489ea4d22fc69e0 (patch)
tree       332534ec203cd0d7eb2b4aba7f2b1384c4d9c0d4 /notebooks/04-conv-transformer.ipynb
parent     7440994bede38a6c6743110ec4215524665733e1 (diff)
Update notebooks
Diffstat (limited to 'notebooks/04-conv-transformer.ipynb')
-rw-r--r--  notebooks/04-conv-transformer.ipynb | 237
1 file changed, 140 insertions, 97 deletions
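The diff below touches two things in the notebook: the ConvTransformer config cell (hidden_dim 128 -> 144, a new AxialPositionalEmbeddingImage pixel embedding plus a Fourier token_pos_embedding) and the torchinfo summary probe (a 56x1024 line crop with an 89-token context instead of the old 576x640 page with a 682-token context). The cells that actually build `net` are not part of this diff, so the sketch below is only an illustration of how a Hydra-style config like the one printed further down is typically instantiated and summarized; the YAML path is hypothetical.

    # Minimal sketch, not from the notebook: assumes the config is loaded with OmegaConf
    # and instantiated through Hydra, as the `_target_` keys in the printed config suggest.
    from hydra.utils import instantiate
    from omegaconf import OmegaConf
    from torchinfo import summary

    cfg = OmegaConf.load("conf/network/conv_transformer.yaml")  # hypothetical path
    net = instantiate(cfg)  # recursively builds text_recognizer.networks.ConvTransformer

    # The updated probe from the new notebook cell: a (1, 1, 56, 1024) line image
    # together with a (1, 89) token context.
    summary(net, ((1, 1, 56, 1024), (1, 89)), device="cpu", depth=4)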
diff --git a/notebooks/04-conv-transformer.ipynb b/notebooks/04-conv-transformer.ipynb index 3303b63..8ded6b6 100644 --- a/notebooks/04-conv-transformer.ipynb +++ b/notebooks/04-conv-transformer.ipynb @@ -40,7 +40,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 7, "id": "3cf50475-39f2-4642-a7d1-5bcbc0a036f7", "metadata": {}, "outputs": [], @@ -50,7 +50,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 35, "id": "e52ecb01-c975-4e55-925d-1182c7aea473", "metadata": {}, "outputs": [], @@ -61,17 +61,17 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 36, "id": "f939aa37-7b1d-45cc-885c-323c4540bda1", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "{'_target_': 'text_recognizer.networks.ConvTransformer', 'input_dims': [1, 1, 576, 640], 'hidden_dim': 128, 'num_classes': 58, 'pad_index': 3, 'encoder': {'_target_': 'text_recognizer.networks.EfficientNet', 'arch': 'b0', 'stochastic_dropout_rate': 0.2, 'bn_momentum': 0.99, 'bn_eps': 0.001, 'depth': 5, 'out_channels': 128, 'stride': [2, 1]}, 'decoder': {'_target_': 'text_recognizer.networks.transformer.Decoder', 'depth': 6, 'block': {'_target_': 'text_recognizer.networks.transformer.DecoderBlock', 'self_attn': {'_target_': 'text_recognizer.networks.transformer.Attention', 'dim': 128, 'num_heads': 8, 'dim_head': 64, 'dropout_rate': 0.4, 'causal': True, 'rotary_embedding': {'_target_': 'text_recognizer.networks.transformer.RotaryEmbedding', 'dim': 64}}, 'cross_attn': {'_target_': 'text_recognizer.networks.transformer.Attention', 'dim': 128, 'num_heads': 8, 'dim_head': 64, 'dropout_rate': 0.4, 'causal': False}, 'norm': {'_target_': 'text_recognizer.networks.transformer.RMSNorm', 'dim': 128}, 'ff': {'_target_': 'text_recognizer.networks.transformer.FeedForward', 'dim': 128, 'dim_out': None, 'expansion_factor': 2, 'glu': True, 'dropout_rate': 0.4}}}, 'pixel_embedding': {'_target_': 'text_recognizer.networks.transformer.AxialPositionalEmbedding', 'dim': 128, 'shape': [18, 80]}}" + "{'_target_': 'text_recognizer.networks.ConvTransformer', 'input_dims': [1, 1, 576, 640], 'hidden_dim': 144, 'num_classes': 58, 'pad_index': 3, 'encoder': {'_target_': 'text_recognizer.networks.EfficientNet', 'arch': 'b0', 'stochastic_dropout_rate': 0.2, 'bn_momentum': 0.99, 'bn_eps': 0.001, 'depth': 5, 'out_channels': 144}, 'decoder': {'_target_': 'text_recognizer.networks.transformer.Decoder', 'depth': 6, 'block': {'_target_': 'text_recognizer.networks.transformer.DecoderBlock', 'self_attn': {'_target_': 'text_recognizer.networks.transformer.Attention', 'dim': 144, 'num_heads': 12, 'dim_head': 64, 'dropout_rate': 0.2, 'causal': True, 'rotary_embedding': {'_target_': 'text_recognizer.networks.transformer.RotaryEmbedding', 'dim': 64}}, 'cross_attn': {'_target_': 'text_recognizer.networks.transformer.Attention', 'dim': 144, 'num_heads': 8, 'dim_head': 64, 'dropout_rate': 0.2, 'causal': False}, 'norm': {'_target_': 'text_recognizer.networks.transformer.RMSNorm', 'dim': 144}, 'ff': {'_target_': 'text_recognizer.networks.transformer.FeedForward', 'dim': 144, 'dim_out': None, 'expansion_factor': 2, 'glu': True, 'dropout_rate': 0.2}}}, 'pixel_embedding': {'_target_': 'text_recognizer.networks.transformer.embeddings.axial.AxialPositionalEmbeddingImage', 'dim': 144, 'axial_shape': [3, 63], 'axial_dims': [72, 72]}, 'token_pos_embedding': {'_target_': 'text_recognizer.networks.transformer.embeddings.fourier.PositionalEncoding', 'dim': 144, 'dropout_rate': 0.1, 'max_len': 89}}" ] }, - 
"execution_count": 10, + "execution_count": 36, "metadata": {}, "output_type": "execute_result" } @@ -82,7 +82,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 37, "id": "aaeab329-aeb0-4a1b-aa35-5a2aab81b1d0", "metadata": { "scrolled": false @@ -94,7 +94,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 38, "id": "618b997c-e6a6-4487-b70c-9d260cb556d3", "metadata": {}, "outputs": [], @@ -104,122 +104,165 @@ }, { "cell_type": "code", - "execution_count": 13, - "id": "25759b7b-8deb-4163-b75d-a1357c9fe88f", - "metadata": { - "scrolled": false - }, + "execution_count": 39, + "id": "7daf1f49", + "metadata": {}, "outputs": [ { + "name": "stdout", + "output_type": "stream", + "text": [ + "torch.Size([1, 144, 3, 63])\n" + ] + }, + { "data": { "text/plain": [ "==============================================================================================================\n", "Layer (type:depth-idx) Output Shape Param #\n", "==============================================================================================================\n", - "ConvTransformer [1, 58, 682] --\n", - "├─EfficientNet: 1-1 [1, 128, 18, 80] 850,044\n", - "│ └─Sequential: 2-1 [1, 16, 288, 320] --\n", - "│ │ └─ZeroPad2d: 3-1 [1, 1, 577, 641] --\n", - "│ │ └─Conv2d: 3-2 [1, 16, 288, 320] 144\n", - "│ │ └─BatchNorm2d: 3-3 [1, 16, 288, 320] 32\n", - "│ │ └─Mish: 3-4 [1, 16, 288, 320] --\n", + "ConvTransformer [1, 58, 89] --\n", + "├─EfficientNet: 1-1 [1, 144, 3, 63] 850,880\n", + "│ └─Sequential: 2-1 [1, 32, 26, 510] --\n", + "│ │ └─ZeroPad2d: 3-1 [1, 1, 57, 1025] --\n", + "│ │ └─Conv2d: 3-2 [1, 32, 26, 510] 1,568\n", + "│ │ └─BatchNorm2d: 3-3 [1, 32, 26, 510] 64\n", + "│ │ └─Mish: 3-4 [1, 32, 26, 510] --\n", "│ └─ModuleList: 2 -- --\n", - "│ │ └─MBConvBlock: 3-5 [1, 16, 288, 320] --\n", - "│ │ │ └─Depthwise: 4-1 [1, 16, 288, 320] 176\n", - "│ │ │ └─SqueezeAndExcite: 4-2 [1, 16, 288, 320] 148\n", - "│ │ │ └─Pointwise: 4-3 [1, 16, 288, 320] 288\n", - "│ │ └─MBConvBlock: 3-6 [1, 24, 144, 160] --\n", - "│ │ │ └─InvertedBottleneck: 4-4 [1, 96, 288, 320] 1,728\n", - "│ │ │ └─Depthwise: 4-5 [1, 96, 144, 160] 1,056\n", - "│ │ │ └─SqueezeAndExcite: 4-6 [1, 96, 144, 160] 868\n", - "│ │ │ └─Pointwise: 4-7 [1, 24, 144, 160] 2,352\n", - "│ │ └─MBConvBlock: 3-7 [1, 24, 144, 160] --\n", - "│ │ │ └─InvertedBottleneck: 4-8 [1, 144, 144, 160] 3,744\n", - "│ │ │ └─Depthwise: 4-9 [1, 144, 144, 160] 1,584\n", - "│ │ │ └─SqueezeAndExcite: 4-10 [1, 144, 144, 160] 1,878\n", - "│ │ │ └─Pointwise: 4-11 [1, 24, 144, 160] 3,504\n", - "│ │ └─MBConvBlock: 3-8 [1, 40, 72, 80] --\n", - "│ │ │ └─InvertedBottleneck: 4-12 [1, 144, 144, 160] 3,744\n", - "│ │ │ └─Depthwise: 4-13 [1, 144, 72, 80] 3,888\n", - "│ │ │ └─SqueezeAndExcite: 4-14 [1, 144, 72, 80] 1,878\n", - "│ │ │ └─Pointwise: 4-15 [1, 40, 72, 80] 5,840\n", - "│ │ └─MBConvBlock: 3-9 [1, 40, 72, 80] --\n", - "│ │ │ └─InvertedBottleneck: 4-16 [1, 240, 72, 80] 10,080\n", - "│ │ │ └─Depthwise: 4-17 [1, 240, 72, 80] 6,480\n", - "│ │ │ └─SqueezeAndExcite: 4-18 [1, 240, 72, 80] 5,050\n", - "│ │ │ └─Pointwise: 4-19 [1, 40, 72, 80] 9,680\n", - "│ │ └─MBConvBlock: 3-10 [1, 80, 36, 80] --\n", - "│ │ │ └─InvertedBottleneck: 4-20 [1, 240, 72, 80] 10,080\n", - "│ │ │ └─Depthwise: 4-21 [1, 240, 36, 80] 2,640\n", - "│ │ │ └─SqueezeAndExcite: 4-22 [1, 240, 36, 80] 5,050\n", - "│ │ │ └─Pointwise: 4-23 [1, 80, 36, 80] 19,360\n", - "│ │ └─MBConvBlock: 3-11 [1, 80, 36, 80] --\n", - "│ │ │ └─InvertedBottleneck: 4-24 [1, 480, 36, 80] 39,360\n", - "│ │ │ └─Depthwise: 4-25 [1, 
480, 36, 80] 5,280\n", - "│ │ │ └─SqueezeAndExcite: 4-26 [1, 480, 36, 80] 19,700\n", - "│ │ │ └─Pointwise: 4-27 [1, 80, 36, 80] 38,560\n", - "│ │ └─MBConvBlock: 3-12 [1, 80, 36, 80] --\n", - "│ │ │ └─InvertedBottleneck: 4-28 [1, 480, 36, 80] 39,360\n", - "│ │ │ └─Depthwise: 4-29 [1, 480, 36, 80] 5,280\n", - "│ │ │ └─SqueezeAndExcite: 4-30 [1, 480, 36, 80] 19,700\n", - "│ │ │ └─Pointwise: 4-31 [1, 80, 36, 80] 38,560\n", - "│ │ └─MBConvBlock: 3-13 [1, 112, 18, 80] --\n", - "│ │ │ └─InvertedBottleneck: 4-32 [1, 480, 36, 80] 39,360\n", - "│ │ │ └─Depthwise: 4-33 [1, 480, 18, 80] 12,960\n", - "│ │ │ └─SqueezeAndExcite: 4-34 [1, 480, 18, 80] 19,700\n", - "│ │ │ └─Pointwise: 4-35 [1, 112, 18, 80] 53,984\n", - "│ │ └─MBConvBlock: 3-14 [1, 112, 18, 80] --\n", - "│ │ │ └─InvertedBottleneck: 4-36 [1, 672, 18, 80] 76,608\n", - "│ │ │ └─Depthwise: 4-37 [1, 672, 18, 80] 18,144\n", - "│ │ │ └─SqueezeAndExcite: 4-38 [1, 672, 18, 80] 38,332\n", - "│ │ │ └─Pointwise: 4-39 [1, 112, 18, 80] 75,488\n", - "│ │ └─MBConvBlock: 3-15 [1, 112, 18, 80] --\n", - "│ │ │ └─InvertedBottleneck: 4-40 [1, 672, 18, 80] 76,608\n", - "│ │ │ └─Depthwise: 4-41 [1, 672, 18, 80] 18,144\n", - "│ │ │ └─SqueezeAndExcite: 4-42 [1, 672, 18, 80] 38,332\n", - "│ │ │ └─Pointwise: 4-43 [1, 112, 18, 80] 75,488\n", - "│ └─Sequential: 2-2 [1, 128, 18, 80] --\n", - "│ │ └─Conv2d: 3-16 [1, 128, 18, 80] 14,336\n", - "│ │ └─BatchNorm2d: 3-17 [1, 128, 18, 80] 256\n", - "│ │ └─Dropout: 3-18 [1, 128, 18, 80] --\n", - "├─Conv2d: 1-2 [1, 128, 18, 80] 16,512\n", - "├─AxialPositionalEmbedding: 1-3 [1, 128, 18, 80] 12,544\n", - "├─Embedding: 1-4 [1, 682, 128] 7,424\n", - "├─Decoder: 1-5 [1, 682, 128] --\n", + "│ │ └─MBConvBlock: 3-5 [1, 16, 26, 510] --\n", + "│ │ │ └─Depthwise: 4-1 [1, 32, 26, 510] 352\n", + "│ │ │ └─SqueezeAndExcite: 4-2 [1, 32, 26, 510] 552\n", + "│ │ │ └─Pointwise: 4-3 [1, 16, 26, 510] 544\n", + "│ │ └─MBConvBlock: 3-6 [1, 24, 13, 255] --\n", + "│ │ │ └─InvertedBottleneck: 4-4 [1, 96, 26, 510] 1,728\n", + "│ │ │ └─Depthwise: 4-5 [1, 96, 13, 255] 1,056\n", + "│ │ │ └─SqueezeAndExcite: 4-6 [1, 96, 13, 255] 868\n", + "│ │ │ └─Pointwise: 4-7 [1, 24, 13, 255] 2,352\n", + "│ │ └─MBConvBlock: 3-7 [1, 24, 13, 255] --\n", + "│ │ │ └─InvertedBottleneck: 4-8 [1, 144, 13, 255] 3,744\n", + "│ │ │ └─Depthwise: 4-9 [1, 144, 13, 255] 1,584\n", + "│ │ │ └─SqueezeAndExcite: 4-10 [1, 144, 13, 255] 1,878\n", + "│ │ │ └─Pointwise: 4-11 [1, 24, 13, 255] 3,504\n", + "│ │ └─MBConvBlock: 3-8 [1, 40, 6, 127] --\n", + "│ │ │ └─InvertedBottleneck: 4-12 [1, 144, 13, 255] 3,744\n", + "│ │ │ └─Depthwise: 4-13 [1, 144, 6, 127] 3,888\n", + "│ │ │ └─SqueezeAndExcite: 4-14 [1, 144, 6, 127] 1,878\n", + "│ │ │ └─Pointwise: 4-15 [1, 40, 6, 127] 5,840\n", + "│ │ └─MBConvBlock: 3-9 [1, 40, 6, 127] --\n", + "│ │ │ └─InvertedBottleneck: 4-16 [1, 240, 6, 127] 10,080\n", + "│ │ │ └─Depthwise: 4-17 [1, 240, 6, 127] 6,480\n", + "│ │ │ └─SqueezeAndExcite: 4-18 [1, 240, 6, 127] 5,050\n", + "│ │ │ └─Pointwise: 4-19 [1, 40, 6, 127] 9,680\n", + "│ │ └─MBConvBlock: 3-10 [1, 80, 3, 63] --\n", + "│ │ │ └─InvertedBottleneck: 4-20 [1, 240, 6, 127] 10,080\n", + "│ │ │ └─Depthwise: 4-21 [1, 240, 3, 63] 2,640\n", + "│ │ │ └─SqueezeAndExcite: 4-22 [1, 240, 3, 63] 5,050\n", + "│ │ │ └─Pointwise: 4-23 [1, 80, 3, 63] 19,360\n", + "│ │ └─MBConvBlock: 3-11 [1, 80, 3, 63] --\n", + "│ │ │ └─InvertedBottleneck: 4-24 [1, 480, 3, 63] 39,360\n", + "│ │ │ └─Depthwise: 4-25 [1, 480, 3, 63] 5,280\n", + "│ │ │ └─SqueezeAndExcite: 4-26 [1, 480, 3, 63] 19,700\n", + "│ │ │ └─Pointwise: 4-27 [1, 80, 3, 63] 
38,560\n", + "│ │ └─MBConvBlock: 3-12 [1, 80, 3, 63] --\n", + "│ │ │ └─InvertedBottleneck: 4-28 [1, 480, 3, 63] 39,360\n", + "│ │ │ └─Depthwise: 4-29 [1, 480, 3, 63] 5,280\n", + "│ │ │ └─SqueezeAndExcite: 4-30 [1, 480, 3, 63] 19,700\n", + "│ │ │ └─Pointwise: 4-31 [1, 80, 3, 63] 38,560\n", + "│ │ └─MBConvBlock: 3-13 [1, 112, 3, 63] --\n", + "│ │ │ └─InvertedBottleneck: 4-32 [1, 480, 3, 63] 39,360\n", + "│ │ │ └─Depthwise: 4-33 [1, 480, 3, 63] 12,960\n", + "│ │ │ └─SqueezeAndExcite: 4-34 [1, 480, 3, 63] 19,700\n", + "│ │ │ └─Pointwise: 4-35 [1, 112, 3, 63] 53,984\n", + "│ │ └─MBConvBlock: 3-14 [1, 112, 3, 63] --\n", + "│ │ │ └─InvertedBottleneck: 4-36 [1, 672, 3, 63] 76,608\n", + "│ │ │ └─Depthwise: 4-37 [1, 672, 3, 63] 18,144\n", + "│ │ │ └─SqueezeAndExcite: 4-38 [1, 672, 3, 63] 38,332\n", + "│ │ │ └─Pointwise: 4-39 [1, 112, 3, 63] 75,488\n", + "│ │ └─MBConvBlock: 3-15 [1, 112, 3, 63] --\n", + "│ │ │ └─InvertedBottleneck: 4-40 [1, 672, 3, 63] 76,608\n", + "│ │ │ └─Depthwise: 4-41 [1, 672, 3, 63] 18,144\n", + "│ │ │ └─SqueezeAndExcite: 4-42 [1, 672, 3, 63] 38,332\n", + "│ │ │ └─Pointwise: 4-43 [1, 112, 3, 63] 75,488\n", + "│ └─Sequential: 2-2 [1, 144, 3, 63] --\n", + "│ │ └─Conv2d: 3-16 [1, 144, 3, 63] 16,128\n", + "│ │ └─BatchNorm2d: 3-17 [1, 144, 3, 63] 288\n", + "│ │ └─Dropout: 3-18 [1, 144, 3, 63] --\n", + "├─Conv2d: 1-2 [1, 144, 3, 63] 20,880\n", + "├─AxialPositionalEmbeddingImage: 1-3 [1, 144, 3, 63] --\n", + "│ └─AxialPositionalEmbedding: 2-3 [1, 189, 144] 4,752\n", + "├─Embedding: 1-4 [1, 89, 144] 8,352\n", + "├─PositionalEncoding: 1-5 [1, 89, 144] --\n", + "│ └─Dropout: 2-4 [1, 89, 144] --\n", + "├─Decoder: 1-6 [1, 89, 144] --\n", "│ └─ModuleList: 2 -- --\n", - "│ │ └─DecoderBlock: 3-19 [1, 682, 128] --\n", - "│ │ └─DecoderBlock: 3-20 [1, 682, 128] --\n", - "│ │ └─DecoderBlock: 3-21 [1, 682, 128] --\n", - "│ │ └─DecoderBlock: 3-22 [1, 682, 128] --\n", - "│ │ └─DecoderBlock: 3-23 [1, 682, 128] --\n", - "│ │ └─DecoderBlock: 3-24 [1, 682, 128] --\n", - "├─Linear: 1-6 [1, 682, 58] 7,482\n", + "│ │ └─DecoderBlock: 3-19 [1, 89, 144] --\n", + "│ │ └─DecoderBlock: 3-20 [1, 89, 144] --\n", + "│ │ └─DecoderBlock: 3-21 [1, 89, 144] --\n", + "│ │ └─DecoderBlock: 3-22 [1, 89, 144] --\n", + "│ │ └─DecoderBlock: 3-23 [1, 89, 144] --\n", + "│ │ └─DecoderBlock: 3-24 [1, 89, 144] --\n", + "├─Linear: 1-7 [1, 89, 58] 8,410\n", "==============================================================================================================\n", - "Total params: 4,652,006\n", - "Trainable params: 4,652,006\n", + "Total params: 6,090,138\n", + "Trainable params: 6,090,138\n", "Non-trainable params: 0\n", - "Total mult-adds (G): 2.44\n", + "Total mult-adds (M): 313.64\n", "==============================================================================================================\n", - "Input size (MB): 1.48\n", - "Forward/backward pass size (MB): 1041.70\n", - "Params size (MB): 18.61\n", - "Estimated Total Size (MB): 1061.78\n", + "Input size (MB): 0.23\n", + "Forward/backward pass size (MB): 145.27\n", + "Params size (MB): 24.36\n", + "Estimated Total Size (MB): 169.86\n", "==============================================================================================================" ] }, - "execution_count": 13, + "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ + "summary(net, ((1, 1, 56, 1024), (1, 89)), device=\"cpu\", depth=4)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "25759b7b-8deb-4163-b75d-a1357c9fe88f", + "metadata": { + 
"scrolled": false + }, + "outputs": [ + { + "ename": "RuntimeError", + "evalue": "Failed to run torchinfo. See above stack traces for more details. Executed layers up to: [EfficientNet: 1, Sequential: 2, ZeroPad2d: 3, Conv2d: 3, BatchNorm2d: 3, Mish: 3, MBConvBlock: 3, Depthwise: 4, Sequential: 5, Conv2d: 6, BatchNorm2d: 6, Mish: 6, SqueezeAndExcite: 4, Sequential: 5, Conv2d: 6, Mish: 6, Conv2d: 6, Pointwise: 4, Sequential: 5, Conv2d: 6, BatchNorm2d: 6, MBConvBlock: 3, InvertedBottleneck: 4, Sequential: 5, Conv2d: 6, BatchNorm2d: 6, Mish: 6, Depthwise: 4, Sequential: 5, Conv2d: 6, BatchNorm2d: 6, Mish: 6, SqueezeAndExcite: 4, Sequential: 5, Conv2d: 6, Mish: 6, Conv2d: 6, Pointwise: 4, Sequential: 5, Conv2d: 6, BatchNorm2d: 6, MBConvBlock: 3, InvertedBottleneck: 4, Sequential: 5, Conv2d: 6, BatchNorm2d: 6, Mish: 6, Depthwise: 4, Sequential: 5, Conv2d: 6, BatchNorm2d: 6, Mish: 6, SqueezeAndExcite: 4, Sequential: 5, Conv2d: 6, Mish: 6, Conv2d: 6, Pointwise: 4, Sequential: 5, Conv2d: 6, BatchNorm2d: 6, MBConvBlock: 3, InvertedBottleneck: 4, Sequential: 5, Conv2d: 6, BatchNorm2d: 6, Mish: 6, Depthwise: 4, Sequential: 5, Conv2d: 6, BatchNorm2d: 6, Mish: 6, SqueezeAndExcite: 4, Sequential: 5, Conv2d: 6, Mish: 6, Conv2d: 6, Pointwise: 4, Sequential: 5, Conv2d: 6, BatchNorm2d: 6, MBConvBlock: 3, InvertedBottleneck: 4, Sequential: 5, Conv2d: 6, BatchNorm2d: 6, Mish: 6, Depthwise: 4, Sequential: 5, Conv2d: 6, BatchNorm2d: 6, Mish: 6, SqueezeAndExcite: 4, Sequential: 5, Conv2d: 6, Mish: 6, Conv2d: 6, Pointwise: 4, Sequential: 5, Conv2d: 6, BatchNorm2d: 6, MBConvBlock: 3, InvertedBottleneck: 4, Sequential: 5, Conv2d: 6, BatchNorm2d: 6, Mish: 6, Depthwise: 4, Sequential: 5, Conv2d: 6, BatchNorm2d: 6, Mish: 6, SqueezeAndExcite: 4, Sequential: 5, Conv2d: 6, Mish: 6, Conv2d: 6, Pointwise: 4, Sequential: 5, Conv2d: 6, BatchNorm2d: 6, MBConvBlock: 3, InvertedBottleneck: 4, Sequential: 5, Conv2d: 6, BatchNorm2d: 6, Mish: 6, Depthwise: 4, Sequential: 5, Conv2d: 6, BatchNorm2d: 6, Mish: 6, SqueezeAndExcite: 4, Sequential: 5, Conv2d: 6, Mish: 6, Conv2d: 6, Pointwise: 4, Sequential: 5, Conv2d: 6, BatchNorm2d: 6, MBConvBlock: 3, InvertedBottleneck: 4, Sequential: 5, Conv2d: 6, BatchNorm2d: 6, Mish: 6, Depthwise: 4, Sequential: 5, Conv2d: 6, BatchNorm2d: 6, Mish: 6, SqueezeAndExcite: 4, Sequential: 5, Conv2d: 6, Mish: 6, Conv2d: 6, Pointwise: 4, Sequential: 5, Conv2d: 6, BatchNorm2d: 6, MBConvBlock: 3, InvertedBottleneck: 4, Sequential: 5, Conv2d: 6, BatchNorm2d: 6, Mish: 6, Depthwise: 4, Sequential: 5, Conv2d: 6, BatchNorm2d: 6, Mish: 6, SqueezeAndExcite: 4, Sequential: 5, Conv2d: 6, Mish: 6, Conv2d: 6, Pointwise: 4, Sequential: 5, Conv2d: 6, BatchNorm2d: 6, MBConvBlock: 3, InvertedBottleneck: 4, Sequential: 5, Conv2d: 6, BatchNorm2d: 6, Mish: 6, Depthwise: 4, Sequential: 5, Conv2d: 6, BatchNorm2d: 6, Mish: 6, SqueezeAndExcite: 4, Sequential: 5, Conv2d: 6, Mish: 6, Conv2d: 6, Pointwise: 4, Sequential: 5, Conv2d: 6, BatchNorm2d: 6, MBConvBlock: 3, InvertedBottleneck: 4, Sequential: 5, Conv2d: 6, BatchNorm2d: 6, Mish: 6, Depthwise: 4, Sequential: 5, Conv2d: 6, BatchNorm2d: 6, Mish: 6, SqueezeAndExcite: 4, Sequential: 5, Conv2d: 6, Mish: 6, Conv2d: 6, Pointwise: 4, Sequential: 5, Conv2d: 6, BatchNorm2d: 6, Sequential: 2, Conv2d: 3, BatchNorm2d: 3, Dropout: 3, Conv2d: 1]", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", + "File 
\u001b[0;32m~/.cache/pypoetry/virtualenvs/text-recognizer-ejNaVa9M-py3.9/lib/python3.9/site-packages/torchinfo/torchinfo.py:290\u001b[0m, in \u001b[0;36mforward_pass\u001b[0;34m(model, x, batch_dim, cache_forward_pass, device, mode, **kwargs)\u001b[0m\n\u001b[1;32m 289\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(x, (\u001b[38;5;28mlist\u001b[39m, \u001b[38;5;28mtuple\u001b[39m)):\n\u001b[0;32m--> 290\u001b[0m _ \u001b[38;5;241m=\u001b[39m \u001b[43mmodel\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mto\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdevice\u001b[49m\u001b[43m)\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mx\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 291\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(x, \u001b[38;5;28mdict\u001b[39m):\n", + "File \u001b[0;32m~/.cache/pypoetry/virtualenvs/text-recognizer-ejNaVa9M-py3.9/lib/python3.9/site-packages/torch/nn/modules/module.py:1148\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *input, **kwargs)\u001b[0m\n\u001b[1;32m 1146\u001b[0m \u001b[38;5;28minput\u001b[39m \u001b[38;5;241m=\u001b[39m bw_hook\u001b[38;5;241m.\u001b[39msetup_input_hook(\u001b[38;5;28minput\u001b[39m)\n\u001b[0;32m-> 1148\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1149\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks:\n", + "File \u001b[0;32m~/projects/text-recognizer/text_recognizer/networks/conv_transformer.py:132\u001b[0m, in \u001b[0;36mConvTransformer.forward\u001b[0;34m(self, x, context)\u001b[0m\n\u001b[1;32m 116\u001b[0m \u001b[38;5;124;03m\"\"\"Encodes images into word piece logtis.\u001b[39;00m\n\u001b[1;32m 117\u001b[0m \n\u001b[1;32m 118\u001b[0m \u001b[38;5;124;03mArgs:\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 130\u001b[0m \u001b[38;5;124;03m Tensor: Sequence of logits.\u001b[39;00m\n\u001b[1;32m 131\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m--> 132\u001b[0m z \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mencode\u001b[49m\u001b[43m(\u001b[49m\u001b[43mx\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 133\u001b[0m logits \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdecode(z, context)\n", + "File \u001b[0;32m~/projects/text-recognizer/text_recognizer/networks/conv_transformer.py:82\u001b[0m, in \u001b[0;36mConvTransformer.encode\u001b[0;34m(self, x)\u001b[0m\n\u001b[1;32m 81\u001b[0m z \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mconv(z)\n\u001b[0;32m---> 82\u001b[0m z \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpixel_embedding\u001b[49m\u001b[43m(\u001b[49m\u001b[43mz\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 83\u001b[0m z \u001b[38;5;241m=\u001b[39m 
z\u001b[38;5;241m.\u001b[39mflatten(start_dim\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m2\u001b[39m)\n", + "File \u001b[0;32m~/.cache/pypoetry/virtualenvs/text-recognizer-ejNaVa9M-py3.9/lib/python3.9/site-packages/torch/nn/modules/module.py:1148\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *input, **kwargs)\u001b[0m\n\u001b[1;32m 1146\u001b[0m \u001b[38;5;28minput\u001b[39m \u001b[38;5;241m=\u001b[39m bw_hook\u001b[38;5;241m.\u001b[39msetup_input_hook(\u001b[38;5;28minput\u001b[39m)\n\u001b[0;32m-> 1148\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1149\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks:\n", + "File \u001b[0;32m~/projects/text-recognizer/text_recognizer/networks/transformer/embeddings/axial.py:40\u001b[0m, in \u001b[0;36mAxialPositionalEmbedding.forward\u001b[0;34m(self, x)\u001b[0m\n\u001b[1;32m 39\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mforward\u001b[39m(\u001b[38;5;28mself\u001b[39m, x):\n\u001b[0;32m---> 40\u001b[0m b, t, _ \u001b[38;5;241m=\u001b[39m x\u001b[38;5;241m.\u001b[39mshape\n\u001b[1;32m 41\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m (\n\u001b[1;32m 42\u001b[0m t \u001b[38;5;241m<\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmax_seq_len\n\u001b[1;32m 43\u001b[0m ), \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mSequence length (\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mt\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m) must be less than the maximum sequence length allowed (\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmax_seq_len\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m)\u001b[39m\u001b[38;5;124m\"\u001b[39m\n", + "\u001b[0;31mValueError\u001b[0m: too many values to unpack (expected 3)", + "\nThe above exception was the direct cause of the following exception:\n", + "\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)", + "Input \u001b[0;32mIn [22]\u001b[0m, in \u001b[0;36m<cell line: 1>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43msummary\u001b[49m\u001b[43m(\u001b[49m\u001b[43mnet\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m(\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m576\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m640\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m682\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdevice\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mcpu\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdepth\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m4\u001b[39;49m\u001b[43m)\u001b[49m\n", + "File 
\u001b[0;32m~/.cache/pypoetry/virtualenvs/text-recognizer-ejNaVa9M-py3.9/lib/python3.9/site-packages/torchinfo/torchinfo.py:218\u001b[0m, in \u001b[0;36msummary\u001b[0;34m(model, input_size, input_data, batch_dim, cache_forward_pass, col_names, col_width, depth, device, dtypes, mode, row_settings, verbose, **kwargs)\u001b[0m\n\u001b[1;32m 211\u001b[0m validate_user_params(\n\u001b[1;32m 212\u001b[0m input_data, input_size, columns, col_width, device, dtypes, verbose\n\u001b[1;32m 213\u001b[0m )\n\u001b[1;32m 215\u001b[0m x, correct_input_size \u001b[38;5;241m=\u001b[39m process_input(\n\u001b[1;32m 216\u001b[0m input_data, input_size, batch_dim, device, dtypes\n\u001b[1;32m 217\u001b[0m )\n\u001b[0;32m--> 218\u001b[0m summary_list \u001b[38;5;241m=\u001b[39m \u001b[43mforward_pass\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 219\u001b[0m \u001b[43m \u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mx\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mbatch_dim\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcache_forward_pass\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdevice\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmodel_mode\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\n\u001b[1;32m 220\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 221\u001b[0m formatting \u001b[38;5;241m=\u001b[39m FormattingOptions(depth, verbose, columns, col_width, rows)\n\u001b[1;32m 222\u001b[0m results \u001b[38;5;241m=\u001b[39m ModelStatistics(\n\u001b[1;32m 223\u001b[0m summary_list, correct_input_size, get_total_memory_used(x), formatting\n\u001b[1;32m 224\u001b[0m )\n", + "File \u001b[0;32m~/.cache/pypoetry/virtualenvs/text-recognizer-ejNaVa9M-py3.9/lib/python3.9/site-packages/torchinfo/torchinfo.py:299\u001b[0m, in \u001b[0;36mforward_pass\u001b[0;34m(model, x, batch_dim, cache_forward_pass, device, mode, **kwargs)\u001b[0m\n\u001b[1;32m 297\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 298\u001b[0m executed_layers \u001b[38;5;241m=\u001b[39m [layer \u001b[38;5;28;01mfor\u001b[39;00m layer \u001b[38;5;129;01min\u001b[39;00m summary_list \u001b[38;5;28;01mif\u001b[39;00m layer\u001b[38;5;241m.\u001b[39mexecuted]\n\u001b[0;32m--> 299\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mRuntimeError\u001b[39;00m(\n\u001b[1;32m 300\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mFailed to run torchinfo. See above stack traces for more details. \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 301\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mExecuted layers up to: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mexecuted_layers\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 302\u001b[0m ) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01me\u001b[39;00m\n\u001b[1;32m 303\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m 304\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m hooks:\n", + "\u001b[0;31mRuntimeError\u001b[0m: Failed to run torchinfo. See above stack traces for more details. 
Executed layers up to: [EfficientNet: 1, Sequential: 2, ZeroPad2d: 3, Conv2d: 3, BatchNorm2d: 3, Mish: 3, MBConvBlock: 3, Depthwise: 4, Sequential: 5, Conv2d: 6, BatchNorm2d: 6, Mish: 6, SqueezeAndExcite: 4, Sequential: 5, Conv2d: 6, Mish: 6, Conv2d: 6, Pointwise: 4, Sequential: 5, Conv2d: 6, BatchNorm2d: 6, MBConvBlock: 3, InvertedBottleneck: 4, Sequential: 5, Conv2d: 6, BatchNorm2d: 6, Mish: 6, Depthwise: 4, Sequential: 5, Conv2d: 6, BatchNorm2d: 6, Mish: 6, SqueezeAndExcite: 4, Sequential: 5, Conv2d: 6, Mish: 6, Conv2d: 6, Pointwise: 4, Sequential: 5, Conv2d: 6, BatchNorm2d: 6, MBConvBlock: 3, InvertedBottleneck: 4, Sequential: 5, Conv2d: 6, BatchNorm2d: 6, Mish: 6, Depthwise: 4, Sequential: 5, Conv2d: 6, BatchNorm2d: 6, Mish: 6, SqueezeAndExcite: 4, Sequential: 5, Conv2d: 6, Mish: 6, Conv2d: 6, Pointwise: 4, Sequential: 5, Conv2d: 6, BatchNorm2d: 6, MBConvBlock: 3, InvertedBottleneck: 4, Sequential: 5, Conv2d: 6, BatchNorm2d: 6, Mish: 6, Depthwise: 4, Sequential: 5, Conv2d: 6, BatchNorm2d: 6, Mish: 6, SqueezeAndExcite: 4, Sequential: 5, Conv2d: 6, Mish: 6, Conv2d: 6, Pointwise: 4, Sequential: 5, Conv2d: 6, BatchNorm2d: 6, MBConvBlock: 3, InvertedBottleneck: 4, Sequential: 5, Conv2d: 6, BatchNorm2d: 6, Mish: 6, Depthwise: 4, Sequential: 5, Conv2d: 6, BatchNorm2d: 6, Mish: 6, SqueezeAndExcite: 4, Sequential: 5, Conv2d: 6, Mish: 6, Conv2d: 6, Pointwise: 4, Sequential: 5, Conv2d: 6, BatchNorm2d: 6, MBConvBlock: 3, InvertedBottleneck: 4, Sequential: 5, Conv2d: 6, BatchNorm2d: 6, Mish: 6, Depthwise: 4, Sequential: 5, Conv2d: 6, BatchNorm2d: 6, Mish: 6, SqueezeAndExcite: 4, Sequential: 5, Conv2d: 6, Mish: 6, Conv2d: 6, Pointwise: 4, Sequential: 5, Conv2d: 6, BatchNorm2d: 6, MBConvBlock: 3, InvertedBottleneck: 4, Sequential: 5, Conv2d: 6, BatchNorm2d: 6, Mish: 6, Depthwise: 4, Sequential: 5, Conv2d: 6, BatchNorm2d: 6, Mish: 6, SqueezeAndExcite: 4, Sequential: 5, Conv2d: 6, Mish: 6, Conv2d: 6, Pointwise: 4, Sequential: 5, Conv2d: 6, BatchNorm2d: 6, MBConvBlock: 3, InvertedBottleneck: 4, Sequential: 5, Conv2d: 6, BatchNorm2d: 6, Mish: 6, Depthwise: 4, Sequential: 5, Conv2d: 6, BatchNorm2d: 6, Mish: 6, SqueezeAndExcite: 4, Sequential: 5, Conv2d: 6, Mish: 6, Conv2d: 6, Pointwise: 4, Sequential: 5, Conv2d: 6, BatchNorm2d: 6, MBConvBlock: 3, InvertedBottleneck: 4, Sequential: 5, Conv2d: 6, BatchNorm2d: 6, Mish: 6, Depthwise: 4, Sequential: 5, Conv2d: 6, BatchNorm2d: 6, Mish: 6, SqueezeAndExcite: 4, Sequential: 5, Conv2d: 6, Mish: 6, Conv2d: 6, Pointwise: 4, Sequential: 5, Conv2d: 6, BatchNorm2d: 6, MBConvBlock: 3, InvertedBottleneck: 4, Sequential: 5, Conv2d: 6, BatchNorm2d: 6, Mish: 6, Depthwise: 4, Sequential: 5, Conv2d: 6, BatchNorm2d: 6, Mish: 6, SqueezeAndExcite: 4, Sequential: 5, Conv2d: 6, Mish: 6, Conv2d: 6, Pointwise: 4, Sequential: 5, Conv2d: 6, BatchNorm2d: 6, MBConvBlock: 3, InvertedBottleneck: 4, Sequential: 5, Conv2d: 6, BatchNorm2d: 6, Mish: 6, Depthwise: 4, Sequential: 5, Conv2d: 6, BatchNorm2d: 6, Mish: 6, SqueezeAndExcite: 4, Sequential: 5, Conv2d: 6, Mish: 6, Conv2d: 6, Pointwise: 4, Sequential: 5, Conv2d: 6, BatchNorm2d: 6, Sequential: 2, Conv2d: 3, BatchNorm2d: 3, Dropout: 3, Conv2d: 1]" + ] + } + ], + "source": [ "summary(net, ((1, 1, 576, 640), (1, 682)), device=\"cpu\", depth=4)" ] }, { "cell_type": "code", "execution_count": null, - "id": "506f01a3", + "id": "248a0cb1", "metadata": {}, "outputs": [], "source": [] |
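The retained error cell (execution count 22) still probes the network with the old shapes, summary(net, ((1, 1, 576, 640), (1, 682)), ...), and torchinfo wraps a ValueError raised inside AxialPositionalEmbedding.forward: ConvTransformer.encode adds the pixel embedding to the 4-D feature map coming out of the 1x1 conv, while that forward unpacks a 3-D (batch, seq, dim) shape. A toy reproduction of the underlying unpacking error, using an illustrative shape rather than one taken from the notebook:

    import torch

    # Hypothetical 4-D encoder output (batch, channels, height, width); the plain
    # AxialPositionalEmbedding expects a flattened (batch, seq, dim) tensor instead.
    z = torch.randn(1, 144, 3, 63)
    try:
        b, t, _ = z.shape
    except ValueError as err:
        print(err)  # too many values to unpack (expected 3)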