diff options
-rw-r--r-- | text_recognizer/networks/conformer/block.py | 4
-rw-r--r-- | text_recognizer/networks/conformer/conformer.py | 5
-rw-r--r-- | text_recognizer/networks/conformer/subsampler.py | 8
3 files changed, 9 insertions, 8 deletions
diff --git a/text_recognizer/networks/conformer/block.py b/text_recognizer/networks/conformer/block.py index 4ea33c0..c53f339 100644 --- a/text_recognizer/networks/conformer/block.py +++ b/text_recognizer/networks/conformer/block.py @@ -26,9 +26,9 @@ class ConformerBlock(nn.Module): self.conv = conv self.post_norm = nn.LayerNorm(dim) - def forward(self, x: Tensor, mask: Optional[Tensor] = None) -> Tensor: + def forward(self, x: Tensor) -> Tensor: x = self.ff_1(x) + x - x = self.attn(x, input_mask=mask) + x + x = self.attn(x) + x x = self.conv(x) + x x = self.ff_2(x) + x return self.post_norm(x) diff --git a/text_recognizer/networks/conformer/conformer.py b/text_recognizer/networks/conformer/conformer.py index 8d0e98e..e2dce27 100644 --- a/text_recognizer/networks/conformer/conformer.py +++ b/text_recognizer/networks/conformer/conformer.py @@ -10,6 +10,8 @@ from text_recognizer.networks.conformer.block import ConformerBlock class Conformer(nn.Module): def __init__( self, + dim: int, + num_classes: int, subsampler: Type[nn.Module], block: ConformerBlock, depth: int, @@ -17,9 +19,10 @@ class Conformer(nn.Module): super().__init__() self.subsampler = subsampler self.blocks = nn.ModuleList([deepcopy(block) for _ in range(depth)]) + self.fc = nn.Linear(dim, num_classes, bias=False) def forward(self, x: Tensor) -> Tensor: x = self.subsampler(x) for fn in self.blocks: x = fn(x) - return x + return self.fc(x) diff --git a/text_recognizer/networks/conformer/subsampler.py b/text_recognizer/networks/conformer/subsampler.py index 2bc0445..53928f1 100644 --- a/text_recognizer/networks/conformer/subsampler.py +++ b/text_recognizer/networks/conformer/subsampler.py @@ -34,13 +34,11 @@ class Subsampler(nn.Module): ) ) subsampler.append(nn.Mish(inplace=True)) - projector = nn.Sequential( - nn.Flatten(start_dim=2), nn.Linear(channels, channels), nn.Dropout(dropout) - ) + projector = nn.Sequential(nn.Linear(channels, channels), nn.Dropout(dropout)) return nn.Sequential(*subsampler), 
projector def forward(self, x: Tensor) -> Tensor: x = self.subsampler(x) x = self.pixel_pos_embedding(x) - x = self.projector(x) - return x.permute(0, 2, 1) + x = x.flatten(start_dim=2).permute(0, 2, 1) + return self.projector(x)