In 2017, the paper "Attention Is All You Need" from researchers at Google reshaped the landscape of natural language processing. As one of the most important foundational architectures in deep learning, the Transformer is not only the backbone of large models such as BERT and GPT, but has also shown remarkable generality in computer vision, speech recognition, and other fields. This article walks through a complete from-scratch reimplementation of the Transformer and dissects the implementation details of every core component.

For NLP tasks the Transformer abandons the traditional RNN structure entirely and processes sequences with pure attention. The architecture has three core advantages: 1) parallel computation greatly improves training efficiency; 2) self-attention directly models dependencies at arbitrary distances; 3) the modular design makes the model easy to extend. Understanding how the Transformer is implemented is a necessary step toward mastering the modern deep learning stack.
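The code snippets throughout this article are written for PyTorch and assume the following imports:

```python
import copy
import math

import torch
import torch.nn as nn
import torch.nn.functional as F
```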
Word embedding is the first processing step of an NLP model: it maps discrete token symbols into a continuous vector space. In the Transformer, the embedding layer performs the following shape transformation:

Input shape: [batch_size, seq_len]
Output shape: [batch_size, seq_len, d_model]

Here d_model is the embedding dimension (512 by default in the paper), and each token is mapped to a d_model-dimensional vector. Concretely, the implementation uses PyTorch's nn.Embedding module:
```python
class Embeddings(nn.Module):
    def __init__(self, vocab_size, d_model):
        super().__init__()
        self.lut = nn.Embedding(vocab_size, d_model)  # lookup table: token id -> vector
        self.d_model = d_model

    def forward(self, x):
        # scale the embeddings by sqrt(d_model), as in the paper
        return self.lut(x) * math.sqrt(self.d_model)
```
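As a quick check (the vocabulary size and batch shape here are arbitrary), the layer performs exactly the [batch_size, seq_len] -> [batch_size, seq_len, d_model] transformation described above:

```python
emb = Embeddings(vocab_size=1000, d_model=512)
tokens = torch.randint(0, 1000, (2, 7))   # [batch_size=2, seq_len=7] token ids
print(emb(tokens).shape)                  # torch.Size([2, 7, 512])
```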
Key details:

- vocab_size sets the vocabulary capacity and must cover every token that can appear in the data.
- The forward pass multiplies the embedding by sqrt(d_model), following the paper, which keeps its magnitude comparable to the positional encoding added in the next step.

Because the Transformer discards the RNN's sequential structure, positional information must be injected explicitly. The paper adopts a sine/cosine positional encoding scheme:
The formulas are:
PE(pos,2i) = sin(pos/10000^(2i/d_model))
PE(pos,2i+1) = cos(pos/10000^(2i/d_model))
Implementation:
```python
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        # precompute the encodings for every position up to max_len
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) *
                             -(math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)  # even dimensions
        pe[:, 1::2] = torch.cos(position * div_term)  # odd dimensions
        pe = pe.unsqueeze(0)                          # [1, max_len, d_model]
        # a buffer is saved with the model but is not a trainable parameter
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:, :x.size(1)]
        return self.dropout(x)
```
Technical details:

- Derivation of div_term: exp(2i · (-ln 10000 / d_model)) = 10000^(-2i/d_model), so position * div_term is exactly pos / 10000^(2i/d_model) from the formulas above; the exp/log form avoids raising 10000 to fractional powers directly and keeps the values numerically well behaved.
- In practice, when sequences are much shorter than max_len you can reduce max_len to save memory, but for tasks that process long documents it is safer to keep 5000 or more.
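A minimal sanity check (arbitrary shapes) confirming that the encoding only adds information and leaves the tensor shape unchanged:

```python
pos_enc = PositionalEncoding(d_model=512, dropout=0.1)
x = torch.zeros(2, 7, 512)            # e.g. the output of the embedding layer
print(pos_enc(x).shape)               # torch.Size([2, 7, 512])
print(pos_enc.pe.requires_grad)       # False -- the table is a buffer, not a parameter
```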
Scaled dot-product attention is the single most important computational unit in the Transformer, expressed as:
Attention(Q,K,V) = softmax(QK^T/√d_k)V
Implementation:
```python
class ScaledDotProductAttention(nn.Module):
    def __init__(self):
        super().__init__()
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, q, k, v, mask=None, dropout=None):
        d_k = q.size(-1)
        # scale by sqrt(d_k) so the softmax stays in a well-behaved range
        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d_k)
        if mask is not None:
            # masked positions get a huge negative score -> ~0 weight after softmax
            scores = scores.masked_fill(mask == 0, -1e9)
        p_attn = self.softmax(scores)
        if dropout is not None:       # optional dropout on the attention weights
            p_attn = dropout(p_attn)
        return torch.matmul(p_attn, v), p_attn
```
Key points:

- The dot products are scaled by 1/sqrt(d_k); without this, a large d_k pushes the softmax into regions with extremely small gradients.
- Positions where the mask is 0 are filled with -1e9 before the softmax, so they receive (almost) zero attention weight.
- The module returns both the weighted values and the attention weights p_attn, which is handy for visualization and debugging.
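A quick sanity check with made-up shapes (a single head with d_k = 64, no mask):

```python
attn = ScaledDotProductAttention()
q = k = v = torch.randn(2, 5, 64)     # [batch, seq_len, d_k]
out, weights = attn(q, k, v)
print(out.shape, weights.shape)       # torch.Size([2, 5, 64]) torch.Size([2, 5, 5])
print(weights.sum(dim=-1))            # each row sums to 1 (up to floating-point error)
```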
The multi-head mechanism lets the model learn features in several different representation subspaces:
```python
class MultiHeadAttention(nn.Module):
    def __init__(self, h, d_model, dropout=0.1):
        super().__init__()
        assert d_model % h == 0
        self.d_k = d_model // h
        self.h = h
        self.linears = clones(nn.Linear(d_model, d_model), 4)
        self.attention = ScaledDotProductAttention()
        self.attn = None
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, query, key, value, mask=None):
        if mask is not None:
            # the same mask is applied to every head
            mask = mask.unsqueeze(1)
        nbatches = query.size(0)
        # 1) project Q, K, V and split into h heads: [batch, h, seq_len, d_k]
        query, key, value = [
            l(x).view(nbatches, -1, self.h, self.d_k).transpose(1, 2)
            for l, x in zip(self.linears, (query, key, value))
        ]
        # 2) run scaled dot-product attention on all heads in parallel
        x, self.attn = self.attention(query, key, value, mask=mask,
                                      dropout=self.dropout)
        # 3) concatenate the heads and apply the final output projection
        x = x.transpose(1, 2).contiguous() \
             .view(nbatches, -1, self.h * self.d_k)
        return self.linears[-1](x)
```
Implementation tips:

- Splitting heads is just view + transpose: [batch, seq_len, d_model] -> [batch, h, seq_len, d_k], so every head is processed in a single batched matmul.
- After transposing the heads back, call .contiguous() before .view(); PyTorch cannot reshape a non-contiguous tensor otherwise.
- Four linear layers are cloned: three project Q, K and V, and the last one mixes the concatenated heads.
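The code relies on a clones helper that this article never defines; a minimal version in the style of the Annotated Transformer could look like this (it is also used by the encoder and decoder layers below):

```python
def clones(module, N):
    # produce N independent deep copies of a module, registered in a ModuleList
    return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])
```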
The position-wise feed-forward network (FFN) consists of two linear transformations with a ReLU activation in between:
```python
class PositionwiseFeedForward(nn.Module):
    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        self.w_1 = nn.Linear(d_model, d_ff)
        self.w_2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        return self.w_2(self.dropout(F.relu(self.w_1(x))))
```
Parameter notes:

- d_model = 512 is the model width, and d_ff = 2048 is the inner (expanded) dimension, a 4x expansion as in the paper.
- The same two linear layers are applied independently at every position, which is why the module is called position-wise; the small check below illustrates this.
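A small check (arbitrary shapes): applying the FFN to the whole sequence gives the same result at position 3 as applying it to that position alone, since no information flows between positions.

```python
ffn = PositionwiseFeedForward(d_model=512, d_ff=2048, dropout=0.0).eval()
x = torch.randn(2, 7, 512)
full = ffn(x)
single = ffn(x[:, 3:4, :])                                  # only position 3
print(torch.allclose(full[:, 3:4, :], single, atol=1e-6))   # True
```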
The Transformer uses residual connections to mitigate vanishing gradients:
```python
class SublayerConnection(nn.Module):
    def __init__(self, size, dropout):
        super().__init__()
        self.norm = nn.LayerNorm(size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        "Pre-norm residual connection: normalize, apply the sublayer, then add the input back."
        return x + self.dropout(sublayer(self.norm(x)))
```
Notes:

- This implementation normalizes before the sublayer (pre-norm), whereas the original paper applies LayerNorm after the residual addition (post-norm); the pre-norm variant, also used by the Annotated Transformer, tends to train more stably.
- sublayer is passed in as a callable, so the same wrapper serves both the attention and the feed-forward sublayers.

With these building blocks, an encoder layer stacks a self-attention sublayer and a feed-forward sublayer:
```python
class EncoderLayer(nn.Module):
    def __init__(self, size, self_attn, feed_forward, dropout):
        super().__init__()
        self.self_attn = self_attn
        self.feed_forward = feed_forward
        self.sublayer = clones(SublayerConnection(size, dropout), 2)
        self.size = size

    def forward(self, x, mask):
        x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x, mask))
        return self.sublayer[1](x, self.feed_forward)
```
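The full encoder, a stack of N such layers followed by a final LayerNorm, is referenced by make_model below but is not shown in this article; a minimal sketch in the Annotated Transformer style:

```python
class Encoder(nn.Module):
    def __init__(self, layer, N):
        super().__init__()
        self.layers = clones(layer, N)
        self.norm = nn.LayerNorm(layer.size)

    def forward(self, x, mask):
        for layer in self.layers:
            x = layer(x, mask)
        return self.norm(x)
```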
The decoder layer adds a third sublayer: cross-attention over the encoder output (memory):

```python
class DecoderLayer(nn.Module):
    def __init__(self, size, self_attn, src_attn, feed_forward, dropout):
        super().__init__()
        self.size = size
        self.self_attn = self_attn      # masked self-attention over the target
        self.src_attn = src_attn        # cross-attention over the encoder output
        self.feed_forward = feed_forward
        self.sublayer = clones(SublayerConnection(size, dropout), 3)

    def forward(self, x, memory, src_mask, tgt_mask):
        m = memory
        x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x, tgt_mask))
        x = self.sublayer[1](x, lambda x: self.src_attn(x, m, m, src_mask))
        return self.sublayer[2](x, self.feed_forward)
```
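Likewise, the Decoder class used by make_model (a stack of N decoder layers plus a final LayerNorm) is not shown in the article; a minimal sketch:

```python
class Decoder(nn.Module):
    def __init__(self, layer, N):
        super().__init__()
        self.layers = clones(layer, N)
        self.norm = nn.LayerNorm(layer.size)

    def forward(self, x, memory, src_mask, tgt_mask):
        for layer in self.layers:
            x = layer(x, memory, src_mask, tgt_mask)
        return self.norm(x)
```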
The top-level model wires together the encoder, the decoder, the two embedding pipelines, and the output generator:

```python
class Transformer(nn.Module):
    def __init__(self, encoder, decoder, src_embed, tgt_embed, generator):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.src_embed = src_embed
        self.tgt_embed = tgt_embed
        self.generator = generator

    def encode(self, src, src_mask):
        return self.encoder(self.src_embed(src), src_mask)

    def decode(self, memory, src_mask, tgt, tgt_mask):
        return self.decoder(self.tgt_embed(tgt), memory, src_mask, tgt_mask)

    def forward(self, src, tgt, src_mask, tgt_mask):
        # returns decoder hidden states; apply self.generator to get vocabulary scores
        return self.decode(self.encode(src, src_mask), src_mask,
                           tgt, tgt_mask)
```
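make_model below also needs a Generator, the final linear projection to the vocabulary followed by log-softmax, which the article does not define; a minimal sketch:

```python
class Generator(nn.Module):
    def __init__(self, d_model, vocab):
        super().__init__()
        self.proj = nn.Linear(d_model, vocab)

    def forward(self, x):
        # [batch, seq_len, d_model] -> [batch, seq_len, vocab] log-probabilities
        return F.log_softmax(self.proj(x), dim=-1)
```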
make_model assembles the full model with the paper's default hyperparameters (N=6 layers, d_model=512, d_ff=2048, h=8 heads):

```python
def make_model(src_vocab, tgt_vocab, N=6, d_model=512, d_ff=2048, h=8, dropout=0.1):
    c = copy.deepcopy
    attn = MultiHeadAttention(h, d_model)
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    position = PositionalEncoding(d_model, dropout)
    model = Transformer(
        Encoder(EncoderLayer(d_model, c(attn), c(ff), dropout), N),
        Decoder(DecoderLayer(d_model, c(attn), c(attn), c(ff), dropout), N),
        nn.Sequential(Embeddings(src_vocab, d_model), c(position)),
        nn.Sequential(Embeddings(tgt_vocab, d_model), c(position)),
        Generator(d_model, tgt_vocab))
    # initialize all weight matrices with Xavier/Glorot uniform
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
    return model
```
A quick smoke test with random token ids (0 is treated as the padding index):

```python
def subsequent_mask(size):
    # upper-triangular mask that blocks attention to future positions
    return torch.triu(torch.ones(1, size, size), diagonal=1) == 0

def test():
    model = make_model(10000, 10000)
    src = torch.randint(1, 10000, (32, 10))
    tgt = torch.randint(1, 10000, (32, 20))
    src_mask = (src != 0).unsqueeze(-2)                  # [32, 1, 10]
    tgt_in = tgt[:, :-1]                                 # decoder input (shifted target)
    tgt_mask = (tgt_in != 0).unsqueeze(-2) & subsequent_mask(tgt_in.size(1))
    out = model(src, tgt_in, src_mask, tgt_mask)
    print(model.generator(out).shape)   # should print torch.Size([32, 19, 10000])
```
Training tips and common pitfalls:

- Dimension mismatch errors: most of them originate in the head split/merge inside MultiHeadAttention; checking that tensors follow [batch, h, seq_len, d_k] at each step catches them early.
- Attention score overflow: the 1/sqrt(d_k) scaling and the -1e9 mask fill are exactly what keep the softmax numerically well behaved, so take care not to drop them when modifying the code.
- Training instability: the original paper counters this with learning-rate warmup and label smoothing; a sketch of the warmup schedule follows this list.
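A minimal sketch of the warmup schedule from the paper, lrate = d_model^-0.5 · min(step^-0.5, step · warmup^-1.5); the hyperparameters are the paper's defaults, and the optimizer wiring is illustrative rather than tuned:

```python
def noam_lr(step, d_model=512, warmup=4000):
    # learning-rate factor from "Attention Is All You Need", section 5.3
    step = max(step, 1)
    return d_model ** -0.5 * min(step ** -0.5, step * warmup ** -1.5)

model = make_model(10000, 10000)
optimizer = torch.optim.Adam(model.parameters(), lr=1.0, betas=(0.9, 0.98), eps=1e-9)
# call scheduler.step() once per optimizer update so `step` counts parameter updates
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=noam_lr)
```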
Beyond that, there are three broad directions for further engineering optimization: memory optimization (e.g. mixed-precision training and gradient checkpointing), compute acceleration (e.g. fused attention kernels and length-based batching), and model compression (e.g. distillation, quantization, and pruning).
In real applications, a Transformer implementation also has to address distributed training, batching optimization, inference acceleration, and many other engineering concerns. Once you understand this baseline implementation, you can move on to the many variants and improvements built on top of it.