1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
|
"""Efficient self attention."""
from einops import rearrange
import torch
import torch.nn.functional as F
from torch import einsum, nn, Tensor
class LayerNorm(nn.Module):
    """Layer normalization over the last dimension with a scale but no trained bias.

    ``gamma`` is a learnable per-feature scale initialised to ones. ``beta`` is
    registered as a buffer of zeros, so it is part of the module state but is
    not returned by ``parameters()`` and therefore not updated by an optimizer.
    """

    def __init__(self, dim: int) -> None:
        """Create the scale parameter and the zero bias buffer.

        Args:
            dim: Size of the (last) feature dimension being normalized.
        """
        super().__init__()
        scale_init = torch.ones(dim)
        bias_init = torch.zeros(dim)
        self.gamma = nn.Parameter(scale_init)
        self.register_buffer("beta", bias_init)

    def forward(self, x: Tensor) -> Tensor:
        """Normalize ``x`` across its last dimension.

        Args:
            x: Input tensor; its last dimension must match ``dim``.

        Returns:
            A tensor of the same shape as ``x``.
        """
        # The normalized shape is taken from the input itself (last axis only).
        feature_shape = x.shape[-1:]
        return F.layer_norm(x, feature_shape, weight=self.gamma, bias=self.beta)
class SwiGLU(nn.Module):
    """Gated activation: one half of the features scaled by SiLU of the other half."""

    def forward(self, x: Tensor) -> Tensor:
        """Split the last dimension in two and gate the first half.

        Args:
            x: Input tensor whose last dimension holds the value half followed
                by the gate half.

        Returns:
            ``silu(gate) * value``, with the last dimension half the size of
            the input's.
        """
        value, gate = torch.chunk(x, 2, dim=-1)
        activated_gate = F.silu(gate)
        return activated_gate * value
class Attention(nn.Module):
    """Attention and feed-forward branches computed in parallel from one projection.

    The input is normalized once, then a single bias-free linear layer
    produces the multi-head queries, one key head, one value head and the
    feed-forward hidden activations. The key/value head is shared by every
    query head. The module output is the sum of the attention branch and the
    SwiGLU feed-forward branch; no residual connection is added here.
    """

    def __init__(
        self, dim: int, dim_head: int = 64, heads: int = 8, mult: int = 4
    ) -> None:
        """Build the normalization, the fused projection and both output layers.

        Args:
            dim: Model (input and output) feature size.
            dim_head: Feature size of each attention head.
            heads: Number of query heads.
            mult: Feed-forward expansion factor relative to ``dim``.
        """
        super().__init__()
        self.norm = LayerNorm(dim)

        query_width = heads * dim_head
        ff_width = mult * dim

        self.heads = heads
        self.scale = dim_head ** -0.5

        # Widths of the chunks carved out of the fused projection, in order:
        # queries (all heads), key (one head), value (one head), and the
        # feed-forward input (doubled because SwiGLU halves it).
        self.fused_dims = (query_width, dim_head, dim_head, (2 * ff_width))
        fused_width = sum(self.fused_dims)
        self.fused_attn_ff_proj = nn.Linear(dim, fused_width, bias=False)

        self.attn_out = nn.Linear(query_width, dim, bias=False)
        self.ff_out = nn.Sequential(
            SwiGLU(),
            nn.Linear(ff_width, dim, bias=False),
        )

    def forward(self, x: Tensor) -> Tensor:
        """Apply the parallel attention + feed-forward block.

        Args:
            x: Input of shape ``(batch, seq, dim)``.

        Returns:
            Tensor of shape ``(batch, seq, dim)``: attention output plus
            feed-forward output.
        """
        normed = self.norm(x)
        fused = self.fused_attn_ff_proj(normed)
        queries, keys, values, ff_hidden = fused.split(self.fused_dims, dim=-1)

        # Only the queries carry a head axis; keys/values stay single-headed.
        queries = rearrange(queries, "b n (h d) -> b h n d", h=self.heads)
        queries = queries * self.scale

        scores = einsum("b h i d, b j d -> b h i j", queries, keys)
        weights = scores.softmax(dim=-1)

        attended = einsum("b h i j, b j d -> b h i d", weights, values)
        merged = rearrange(attended, "b h n d -> b n (h d)")

        attn_branch = self.attn_out(merged)
        ff_branch = self.ff_out(ff_hidden)
        return attn_branch + ff_branch
|