Preface
The most common trainable layers in a model are convolutional layers and linear layers. This post gives the formulas for their parameter counts and verifies them in PyTorch.
A helper function for counting a model's parameters:
import torch.nn as nn


def cal_params(model: nn.Module):
    num_learnable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    num_non_learnable_params = sum(p.numel() for p in model.parameters() if not p.requires_grad)
    return num_learnable_params, num_non_learnable_params
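For illustration, here is a hypothetical usage sketch (not part of the original tests): parameters with requires_grad set to False are counted as non-learnable.
# Hypothetical example: freeze a layer and watch its parameters move to the non-learnable count
frozen = nn.Linear(in_features=64, out_features=10)
for p in frozen.parameters():
    p.requires_grad = False
print(cal_params(frozen))
# (0, 650)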
Convolutional layer
The parameter count of a convolutional layer is: $$n_\text{conv}=(k_h \times k_w \times c_\text{in} + 1) \times c_\text{out}$$ where \(k_h,\ k_w\) are the kernel height and width, \(c_\text{in},\ c_\text{out}\) are the numbers of input and output channels, and the \(+1\) accounts for the bias term.
Example: for a convolutional layer with 3 input channels, 64 output channels, and a \(3\times3\) kernel, the parameter count is: $$n_\text{conv}=(3 \times 3 \times 3 + 1) \times 64 = 1792$$
Test
conv = nn.Conv2d(in_channels=3, out_channels=64, kernel_size=3)
print(cal_params(conv))
# (1792, 0)
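A quick cross-check (not in the original post) that the formula reproduces this number:
# (3 x 3 x 3 + 1) x 64 = 1792, matching cal_params(conv)
assert (3 * 3 * 3 + 1) * 64 == sum(p.numel() for p in conv.parameters())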
Linear layer
A linear layer computes \(y = Wx+b\); its parameters live in the weight \(W\) and the bias \(b\), so for input dimension \(d_\text{in}\) and output dimension \(d_\text{out}\) the parameter count is: $$n_\text{linear}=(d_\text{in} + 1) \times d_\text{out}$$
Example: for a linear layer with input dimension 64 and output dimension 10, the parameter count is: $$n_\text{linear}=(64 + 1) \times 10 = 650$$
Test
linear = nn.Linear(in_features=64, out_features=10)
print(cal_params(linear))
# (650, 0)
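Likewise for the linear layer, a hypothetical cross-check:
# (64 + 1) x 10 = 650, matching cal_params(linear)
assert (64 + 1) * 10 == sum(p.numel() for p in linear.parameters())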
Parameter count of a Transformer
Encoder layer
From the architecture diagram, the parameters of each encoder layer come from a multi-head attention block, a feed-forward block, and two LayerNorm layers. Let the feature dimension be \(d_\text{model}=d\).
Multi-head attention
There are four linear layers \(W_Q,\ W_K,\ W_V,\ W_O\), so the parameter count is: $$n_\text{atten}=(d + 1) \times d \times 4=4d^2 + 4d$$
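To make the \(4d^2 + 4d\) count concrete, here is a small sketch (not from the original post) that counts four \(d \times d\) linear projections with \(d = 512\):
# Four d x d projections with bias: 4 * (d * d + d) = 4d^2 + 4d
d = 512
projections = nn.ModuleList([nn.Linear(d, d) for _ in range(4)])
print(cal_params(projections))
# (1050624, 0), i.e. 4 * 512**2 + 4 * 512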
Feed forward
It consists of two linear layers with a hidden dimension of \(d_\text{ff}\), giving \((d + 1) \times d_\text{ff} + (d_\text{ff} + 1) \times d\) parameters. The paper uses \(d_\text{ff}=4d\), so the parameter count is: $$n_\text{ff}=(d + 1) \times 4d + (4d + 1) \times d = 8d^2 + 5d$$
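With \(d = 512\) (so \(d_\text{ff} = 2048\)), a quick sketch (not from the original post) confirms \(8d^2 + 5d\):
# d -> 4d -> d: (d + 1) * 4d + (4d + 1) * d = 8d^2 + 5d
d = 512
ff = nn.Sequential(nn.Linear(d, 4 * d), nn.Linear(4 * d, d))
print(cal_params(ff))
# (2099712, 0), i.e. 8 * 512**2 + 5 * 512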
LayerNorm
There are two LayerNorm layers; each has two parameter vectors \(\gamma,\ \beta\) of dimension \(d\), so each contributes \(n_\text{ln}=2d\) parameters.
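This matches PyTorch's built-in LayerNorm (a quick check, not part of the original tests); the custom AddAndNorm used below likewise holds exactly \(2d\) parameters per layer:
# gamma and beta over d = 512 features: 2 * 512 = 1024 parameters
print(cal_params(nn.LayerNorm(512)))
# (1024, 0)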
The total parameter count of one encoder layer is: $$n_\text{enc}=n_\text{atten} + n_\text{ff} + 2n_\text{ln}=12d^2 + 13d$$
Example: with \(d_\text{model}=512\), one encoder layer has $$n_\text{enc}=12 \times 512^2 + 13 \times 512 = 3,152,384$$ parameters.
Implementation of the encoder layer:
import math
import torch


class AddAndNorm(nn.Module):
    def __init__(self, d_model, dropout_prob, eps=1e-6):
        super().__init__()
        self.w = nn.Parameter(torch.ones(d_model))
        self.b = nn.Parameter(torch.zeros(d_model))
        self.eps = eps
        self.dropout = nn.Dropout(p=dropout_prob)

    def norm(self, x):
        mean = x.mean(-1, keepdim=True)
        std = x.std(-1, keepdim=True)
        return self.w * (x - mean) / (std + self.eps) + self.b

    def forward(self, x, sublayer):
        return x + self.dropout(sublayer(self.norm(x)))


def attention(q, k, v, mask=None, dropout=None):
    d_k = q.size(-1)
    scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d_k)
    if mask is not None:
        masking_value = -1e9 if scores.dtype == torch.float32 else -1e4
        scores = scores.masked_fill(mask == 0, masking_value)
    p_attn = scores.softmax(dim=-1)
    if dropout is not None:
        p_attn = dropout(p_attn)
    return torch.matmul(p_attn, v), p_attn


class MultiHeadedAttention(nn.Module):
    def __init__(self, h, d_model, dropout_prob=0.1):
        super().__init__()
        assert d_model % h == 0
        self.d_k = d_model // h
        self.h = h
        self.w_q = nn.Linear(d_model, d_model)
        self.w_k = nn.Linear(d_model, d_model)
        self.w_v = nn.Linear(d_model, d_model)
        self.w_o = nn.Linear(d_model, d_model)
        self.attn = None
        self.dropout = nn.Dropout(p=dropout_prob)

    def forward(self, q, k, v, mask=None):
        if mask is not None:
            mask = mask.unsqueeze(1)  # the same mask is applied to all h attention heads
        batch_size = q.size(0)
        # 1) Apply the linear projections and split the d_model dimension into h heads of size d_k
        q = self.w_q(q).view(batch_size, -1, self.h, self.d_k).transpose(1, 2)
        k = self.w_k(k).view(batch_size, -1, self.h, self.d_k).transpose(1, 2)
        v = self.w_v(v).view(batch_size, -1, self.h, self.d_k).transpose(1, 2)
        # 2) Compute attention
        x, self.attn = attention(q, k, v, mask=mask, dropout=self.dropout)
        # 3) Concatenate the heads and apply the output projection
        x = x.transpose(1, 2).contiguous().view(batch_size, -1, self.h * self.d_k)
        return self.w_o(x)


class FeedForward(nn.Module):
    def __init__(self, d_model, d_ff, dropout_prob=0.1):
        super().__init__()
        self.w_1 = nn.Linear(d_model, d_ff)
        self.w_2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(p=dropout_prob)

    def forward(self, x):
        return self.w_2(self.dropout(self.w_1(x).relu()))


class EncoderLayer(nn.Module):
    def __init__(self, d_model, h, d_ff, dropout_prob):
        super().__init__()
        self.attn = MultiHeadedAttention(h, d_model, dropout_prob)
        self.ff = FeedForward(d_model, d_ff, dropout_prob)
        self.sublayer = nn.ModuleList([AddAndNorm(d_model, dropout_prob) for _ in range(2)])

    def forward(self, x, mask):
        x = self.sublayer[0](x, lambda x: self.attn(x, x, x, mask))
        return self.sublayer[1](x, self.ff)


d_model = 512
d_ff = 4 * d_model
h = 8
dropout_prob = 0.1
encoder_layer = EncoderLayer(d_model, h, d_ff, dropout_prob)
print(cal_params(encoder_layer))
# (3152384, 0)
Note: in "Attention Is All You Need", the Add & Norm sublayer is defined as $${\rm LayerNorm}(x + {\rm Sublayer}(x))$$ whereas, as the code shows, our implementation computes $$x+{\rm Sublayer(LayerNorm(x))}$$ This difference has also been discussed online [Discussion]; in short, the latter (pre-norm) formulation trains more stably and is the one used in much of practice.
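For reference, a minimal sketch (not part of the original code) of the paper's post-norm variant; the hypothetical PostNormAddAndNorm below reuses AddAndNorm's \(\gamma,\ \beta\), so the parameter count is unchanged:
class PostNormAddAndNorm(AddAndNorm):
    # Post-norm, as in the paper: LayerNorm(x + Sublayer(x)).
    # Same parameters as AddAndNorm (2 * d_model); only the order of operations differs.
    def forward(self, x, sublayer):
        return self.norm(x + self.dropout(sublayer(x)))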
Decoder layer
Comparing the two structures, each decoder layer has one more multi-head attention block and one more LayerNorm layer than an encoder layer, so its parameter count is: $$n_\text{dec}=2n_\text{atten} + n_\text{ff} + 3n_\text{ln}=16d^2 + 19d$$
Example: with \(d_\text{model}=512\), one decoder layer has $$n_\text{dec}=16 \times 512^2 + 19 \times 512 = 4,204,032$$ parameters.
Test:
class DecoderLayer(nn.Module):
    def __init__(self, d_model, h, d_ff, dropout_prob):
        super().__init__()
        self.self_attn = MultiHeadedAttention(h, d_model, dropout_prob)
        self.src_attn = MultiHeadedAttention(h, d_model, dropout_prob)
        self.ff = FeedForward(d_model, d_ff, dropout_prob)
        self.sublayer = nn.ModuleList([AddAndNorm(d_model, dropout_prob) for _ in range(3)])

    def forward(self, x, memory, src_mask, tgt_mask):
        x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x, tgt_mask))
        x = self.sublayer[1](x, lambda x: self.src_attn(x, memory, memory, src_mask))
        return self.sublayer[2](x, self.ff)


d_model = 512
d_ff = 4 * d_model
h = 8
dropout_prob = 0.1
decoder_layer = DecoderLayer(d_model, h, d_ff, dropout_prob)
print(cal_params(decoder_layer))
# (4204032, 0)
Embedding layers and the generator
The encoder and the decoder each use one embedding layer. Let the source vocabulary size handled by the encoder be \(n_\text{src vocab}\), the target vocabulary size handled by the decoder be \(n_\text{tgt vocab}\), and the feature dimension be \(d_\text{model}=d\). The encoder's embedding layer has $$n_\text{src emb}=n_\text{src vocab} \times d$$ parameters.
The decoder's embedding layer has $$n_\text{tgt emb}=n_\text{tgt vocab} \times d$$ parameters.
Since the generator is simply a linear layer, its parameter count is \(n_\text{gen}=(d + 1) \times n_\text{tgt vocab}\). The positional encoding and the residual connections add no trainable parameters.
Example: with \(d_\text{model}=512\) and \(n_\text{src vocab}=n_\text{tgt vocab}=10000\), the embedding layers and the generator have $$n_\text{src emb}=n_\text{tgt emb}=512 \times 10000=5,120,000$$ $$n_\text{gen}=(512 + 1) \times 10000=5,130,000$$ parameters.
Test
class Embeddings(nn.Module):
    def __init__(self, d_model, vocab):
        super().__init__()
        self.lut = nn.Embedding(num_embeddings=vocab, embedding_dim=d_model)
        self.d_model = d_model

    def forward(self, x):
        return self.lut(x) * math.sqrt(self.d_model)


d_model = 512
d_ff = 4 * d_model
h = 8
dropout_prob = 0.1
vocab_size = 10000
embeddings = Embeddings(d_model, vocab_size)
print(cal_params(embeddings))
# (5120000, 0)
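The generator can be checked the same way (a quick check, not part of the original tests): it is a plain nn.Linear(512, 10000), so it should hold \((512 + 1) \times 10000 = 5,130,000\) parameters.
generator = nn.Linear(in_features=512, out_features=10000)
print(cal_params(generator))
# (5130000, 0)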
Let the encoder and the decoder each have \(l\) layers; then the total parameter count of the transformer is: $$n_\text{total}=l \times (n_\text{enc} + n_\text{dec}) + n_\text{src emb} + n_\text{tgt emb} + n_\text{gen}$$
Example: with \(d_\text{model}=512,\ n_\text{src vocab}=n_\text{tgt vocab}=10000,\ l=6\), the whole transformer has $$n_\text{total}=6 \times (3,152,384 + 4,204,032) + 5,120,000 + 5,120,000 + 5,130,000 = 59,508,496$$ parameters.
Test
# PositionalEncoding is defined in the complete code at the end of this post
class Transformer(nn.Module):
    def __init__(self, src_vocab, tgt_vocab, d_model, d_ff, h, dropout_prob, layers):
        super().__init__()
        self.src_embed = nn.Sequential(Embeddings(d_model, src_vocab), PositionalEncoding(d_model, dropout_prob))
        self.tgt_embed = nn.Sequential(Embeddings(d_model, tgt_vocab), PositionalEncoding(d_model, dropout_prob))
        self.encoder = nn.ModuleList([EncoderLayer(d_model, h, d_ff, dropout_prob) for _ in range(layers)])
        self.decoder = nn.ModuleList([DecoderLayer(d_model, h, d_ff, dropout_prob) for _ in range(layers)])
        self.generator = nn.Linear(d_model, tgt_vocab)

    def forward(self, src, tgt, src_mask, tgt_mask):
        memory = self.encode(src, src_mask)
        return self.decode(memory, src_mask, tgt, tgt_mask)

    def encode(self, src, mask):
        x = self.src_embed(src)
        for layer in self.encoder:
            x = layer(x, mask)
        return x

    def decode(self, memory, src_mask, tgt, tgt_mask):
        x = self.tgt_embed(tgt)
        for layer in self.decoder:
            x = layer(x, memory, src_mask, tgt_mask)
        return self.generator(x)


d_model = 512
d_ff = 4 * d_model
h = 8
layer = 6
dropout_prob = 0.1
vocab_size = 10000
model = Transformer(vocab_size, vocab_size, d_model, d_ff, h, dropout_prob, layer)
print(cal_params(model))
# (59508496, 0)
Environment
Package Version
------------------------- -----------
torch 2.5.0
Complete code:
import torch
import math

from torch import nn


def cal_params(model: nn.Module):
    num_learnable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    num_non_learnable_params = sum(p.numel() for p in model.parameters() if not p.requires_grad)
    return num_learnable_params, num_non_learnable_params


class AddAndNorm(nn.Module):
    def __init__(self, d_model, dropout_prob, eps=1e-6):
        super().__init__()
        self.w = nn.Parameter(torch.ones(d_model))
        self.b = nn.Parameter(torch.zeros(d_model))
        self.eps = eps
        self.dropout = nn.Dropout(p=dropout_prob)

    def norm(self, x):
        mean = x.mean(-1, keepdim=True)
        std = x.std(-1, keepdim=True)
        return self.w * (x - mean) / (std + self.eps) + self.b

    def forward(self, x, sublayer):
        return x + self.dropout(sublayer(self.norm(x)))


def attention(q, k, v, mask=None, dropout=None):
    d_k = q.size(-1)
    scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d_k)
    if mask is not None:
        masking_value = -1e9 if scores.dtype == torch.float32 else -1e4
        scores = scores.masked_fill(mask == 0, masking_value)
    p_attn = scores.softmax(dim=-1)
    if dropout is not None:
        p_attn = dropout(p_attn)
    return torch.matmul(p_attn, v), p_attn


class MultiHeadedAttention(nn.Module):
    def __init__(self, h, d_model, dropout_prob=0.1):
        super().__init__()
        assert d_model % h == 0
        self.d_k = d_model // h
        self.h = h
        self.w_q = nn.Linear(d_model, d_model)
        self.w_k = nn.Linear(d_model, d_model)
        self.w_v = nn.Linear(d_model, d_model)
        self.w_o = nn.Linear(d_model, d_model)
        self.attn = None
        self.dropout = nn.Dropout(p=dropout_prob)

    def forward(self, q, k, v, mask=None):
        if mask is not None:
            mask = mask.unsqueeze(1)  # the same mask is applied to all h attention heads
        batch_size = q.size(0)
        # 1) Apply the linear projections and split the d_model dimension into h heads of size d_k
        q = self.w_q(q).view(batch_size, -1, self.h, self.d_k).transpose(1, 2)
        k = self.w_k(k).view(batch_size, -1, self.h, self.d_k).transpose(1, 2)
        v = self.w_v(v).view(batch_size, -1, self.h, self.d_k).transpose(1, 2)
        # 2) Compute attention
        x, self.attn = attention(q, k, v, mask=mask, dropout=self.dropout)
        # 3) Concatenate the heads and apply the output projection
        x = x.transpose(1, 2).contiguous().view(batch_size, -1, self.h * self.d_k)
        return self.w_o(x)


class FeedForward(nn.Module):
    def __init__(self, d_model, d_ff, dropout_prob=0.1):
        super().__init__()
        self.w_1 = nn.Linear(d_model, d_ff)
        self.w_2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(p=dropout_prob)

    def forward(self, x):
        return self.w_2(self.dropout(self.w_1(x).relu()))


class EncoderLayer(nn.Module):
    def __init__(self, d_model, h, d_ff, dropout_prob):
        super().__init__()
        self.attn = MultiHeadedAttention(h, d_model, dropout_prob)
        self.ff = FeedForward(d_model, d_ff, dropout_prob)
        self.sublayer = nn.ModuleList([AddAndNorm(d_model, dropout_prob) for _ in range(2)])

    def forward(self, x, mask):
        x = self.sublayer[0](x, lambda x: self.attn(x, x, x, mask))
        return self.sublayer[1](x, self.ff)


class DecoderLayer(nn.Module):
    def __init__(self, d_model, h, d_ff, dropout_prob):
        super().__init__()
        self.self_attn = MultiHeadedAttention(h, d_model, dropout_prob)
        self.src_attn = MultiHeadedAttention(h, d_model, dropout_prob)
        self.ff = FeedForward(d_model, d_ff, dropout_prob)
        self.sublayer = nn.ModuleList([AddAndNorm(d_model, dropout_prob) for _ in range(3)])

    def forward(self, x, memory, src_mask, tgt_mask):
        x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x, tgt_mask))
        x = self.sublayer[1](x, lambda x: self.src_attn(x, memory, memory, src_mask))
        return self.sublayer[2](x, self.ff)


class Embeddings(nn.Module):
    def __init__(self, d_model, vocab):
        super().__init__()
        self.lut = nn.Embedding(num_embeddings=vocab, embedding_dim=d_model)
        self.d_model = d_model

    def forward(self, x):
        return self.lut(x) * math.sqrt(self.d_model)


class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout_prob, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout_prob)
        # Precompute the positional encodings
        pe = torch.zeros(max_len, d_model)  # Shape: max_len x d_model
        position = torch.arange(0, max_len).unsqueeze(1)  # Shape: max_len x 1
        div_term = torch.exp(torch.arange(0, d_model, 2) * -(math.log(10000) / d_model))
        res = position * div_term  # Shape: max_len x d_model/2
        pe[:, 0::2] = torch.sin(res)
        pe[:, 1::2] = torch.cos(res)
        pe = pe.unsqueeze(0)  # Shape: 1 x max_len x d_model
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:, :x.size(1)].requires_grad_(False)
        return self.dropout(x)


class Transformer(nn.Module):
    def __init__(self, src_vocab, tgt_vocab, d_model, d_ff, h, dropout_prob, layers):
        super().__init__()
        self.src_embed = nn.Sequential(Embeddings(d_model, src_vocab), PositionalEncoding(d_model, dropout_prob))
        self.tgt_embed = nn.Sequential(Embeddings(d_model, tgt_vocab), PositionalEncoding(d_model, dropout_prob))
        self.encoder = nn.ModuleList([EncoderLayer(d_model, h, d_ff, dropout_prob) for _ in range(layers)])
        self.decoder = nn.ModuleList([DecoderLayer(d_model, h, d_ff, dropout_prob) for _ in range(layers)])
        self.generator = nn.Linear(d_model, tgt_vocab)

    def forward(self, src, tgt, src_mask, tgt_mask):
        memory = self.encode(src, src_mask)
        return self.decode(memory, src_mask, tgt, tgt_mask)

    def encode(self, src, mask):
        x = self.src_embed(src)
        for layer in self.encoder:
            x = layer(x, mask)
        return x

    def decode(self, memory, src_mask, tgt, tgt_mask):
        x = self.tgt_embed(tgt)
        for layer in self.decoder:
            x = layer(x, memory, src_mask, tgt_mask)
        return self.generator(x)


if __name__ == '__main__':
    d_model = 512
    d_ff = 4 * d_model
    h = 8
    layer = 6
    dropout_prob = 0.1
    vocab_size = 10000
    model = Transformer(vocab_size, vocab_size, d_model, d_ff, h, dropout_prob, layer)
    print(cal_params(model))
References
- PyTorch: Viewing the total number of parameters in a PyTorch model