安装依赖
pip install transformers==4.32.0
pip install accelerate
pip install tiktoken
pip install einops
pip install transformers_stream_generator==0.0.4
pip install scipy
pip install auto-gptq optimum
使用
参见官方介绍
模型
模型结构
QwenBlock
打印模型
## 用了Qlora
PeftModelForCausalLM((base_model): LoraModel((model): QWenLMHeadModel((transformer): QWenModel((wte): Embedding(151936, 4096)(drop): Dropout(p=0.0, inplace=False)(h): ModuleList((0-31): 32 x QWenBlock((ln_1): RMSNorm()(attn): QWenAttention((c_attn): Linear4bit(in_features=4096, out_features=12288, bias=True(lora_dropout): ModuleDict((default): Dropout(p=0.1, inplace=False))(lora_A): ModuleDict((default): Linear(in_features=4096, out_features=64, bias=False))(lora_B): ModuleDict((default): Linear(in_features=64, out_features=12288, bias=False))(lora_embedding_A): ParameterDict()(lora_embedding_B): ParameterDict())(c_proj): Linear4bit(in_features=4096, out_features=4096, bias=False(lora_dropout): ModuleDict((default): Dropout(p=0.1, inplace=False))(lora_A): ModuleDict((default): Linear(in_features=4096, out_features=64, bias=False))(lora_B): ModuleDict((default): Linear(in_features=64, out_features=4096, bias=False))(lora_embedding_A): ParameterDict()(lora_embedding_B): ParameterDict())(rotary_emb): RotaryEmbedding()(attn_dropout): Dropout(p=0.0, inplace=False))(ln_2): RMSNorm()(mlp): QWenMLP((w1): Linear4bit(in_features=4096, out_features=11008, bias=False(lora_dropout): ModuleDict((default): Dropout(p=0.1, inplace=False))(lora_A): ModuleDict((default): Linear(in_features=4096, out_features=64, bias=False))(lora_B): ModuleDict((default): Linear(in_features=64, out_features=11008, bias=False))(lora_embedding_A): ParameterDict()(lora_embedding_B): ParameterDict())(w2): Linear4bit(in_features=4096, out_features=11008, bias=False(lora_dropout): ModuleDict((default): Dropout(p=0.1, inplace=False))(lora_A): ModuleDict((default): Linear(in_features=4096, out_features=64, bias=False))(lora_B): ModuleDict((default): Linear(in_features=64, out_features=11008, bias=False))(lora_embedding_A): ParameterDict()(lora_embedding_B): ParameterDict())(c_proj): Linear4bit(in_features=11008, out_features=4096, bias=False(lora_dropout): ModuleDict((default): Dropout(p=0.1, inplace=False))(lora_A): ModuleDict((default): Linear(in_features=11008, out_features=64, bias=False))(lora_B): ModuleDict((default): Linear(in_features=64, out_features=4096, bias=False))(lora_embedding_A): ParameterDict()(lora_embedding_B): ParameterDict()))))(ln_f): RMSNorm())(lm_head): Linear(in_features=4096, out_features=151936, bias=False)))
)
head对应的数据类型
base_model.model.transformer.h.7.ln_1.weight torch.float32
base_model.model.transformer.h.7.attn.c_attn.weight torch.uint8
base_model.model.transformer.h.7.attn.c_attn.bias torch.bfloat16
base_model.model.transformer.h.7.attn.c_attn.lora_A.default.weight torch.bfloat16
base_model.model.transformer.h.7.attn.c_attn.lora_B.default.weighttorch.bfloat16
base_model.model.transformer.h.7.attn.c_proj.weight torch.uint8
base_model.model.transformer.h.7.attn.c_proj.lora_A.default.weight torch.bfloat16
base_model.model.transformer.h.7.attn.c_proj.lora_B.default.weight torch.bfloat16
base_model.model.transformer.h.7.ln_2.weight torch.float32
base_model.model.transformer.h.7.mlp.w1.weight torch.uint8
base_model.model.transformer.h.7.mlp.w1.lora_A.default.weight torch.bfloat16
base_model.model.transformer.h.7.mlp.w1.lora_B.default.weight torch.bfloat16
base_model.model.transformer.h.7.mlp.w2.weight torch.uint8
base_model.model.transformer.h.7.mlp.w2.lora_A.default.weight torch.bfloat16
base_model.model.transformer.h.7.mlp.w2.lora_B.default.weight torch.bfloat16
base_model.model.transformer.h.7.mlp.c_proj.weight torch.uint8
base_model.model.transformer.h.7.mlp.c_proj.lora_A.default.weight torch.bfloat16
base_model.model.transformer.h.7.mlp.c_proj.lora_B.default.weight torch.bfloat16
源码分析
QWenBlock
class QWenBlock(nn.Module):def __init__(self, config):super().__init__()hidden_size = config.hidden_sizeself.bf16 = config.bf16self.ln_1 = RMSNorm(hidden_size,eps=config.layer_norm_epsilon,)self.attn = QWenAttention(config)self.ln_2 = RMSNorm(hidden_size,eps=config.layer_norm_epsilon,)self.mlp = QWenMLP(config)def forward(self,hidden_states: Optional[Tuple[torch.FloatTensor]],rotary_pos_emb: Optional[List[torch.Tensor]] = None,registered_causal_mask: Optional[torch.Tensor] = None,layer_past: Optional[Tuple[torch.Tensor]] = None,attention_mask: Optional[torch.FloatTensor] = None,head_mask: Optional[torch.FloatTensor] = None,encoder_hidden_states: Optional[torch.Tensor] = None,encoder_attention_mask: Optional[torch.FloatTensor] = None,use_cache: Optional[bool] = False,output_attentions: Optional[bool] = False,):layernorm_output = self.ln_1(hidden_states)attn_outputs = self.attn(layernorm_output,rotary_pos_emb,registered_causal_mask=registered_causal_mask,layer_past=layer_past,attention_mask=attention_mask,head_mask=head_mask,use_cache=use_cache,output_attentions=output_attentions,)attn_output = attn_outputs[0]outputs = attn_outputs[1:]residual = hidden_stateslayernorm_input = attn_output + residuallayernorm_output = self.ln_2(layernorm_input)residual = layernorm_inputmlp_output = self.mlp(layernorm_output)hidden_states = residual + mlp_outputif use_cache:outputs = (hidden_states,) + outputselse:outputs = (hidden_states,) + outputs[1:]return outputs
forword函数的参数:
- hidden_states:一个可选的元组,包含了上一层的输出张量,形状为(batch_size, sequence_length, hidden_size)
- rotary_pos_emb:一个可选的列表,包含了旋转位置编码张量,形状为(batch_size, sequence_length, hidden_size)
- registered_causal_mask:一个可选的张量,用于注册因果掩码,防止模型看到未来的信息。形状为(batch_size, sequence_length, sequence_length)。
- layer_past:一个可选的元组,包含了上一层的注意力键值对张量,用于实现缓存机制,加速生成过程。形状为(2, batch_size, num_heads, sequence_length, head_dim)。
- attention_mask:一个可选的浮点张量,用于对输入序列进行掩码,忽略无效的位置或填充部分。形状为(batch_size, sequence_length)或(batch_size, 1, 1, sequence_length)。
- head_mask:一个可选的浮点张量,用于对注意力头进行掩码,随机删除一些头以增加模型的鲁棒性。形状为(num_heads,)或(1, 1, num_heads, 1)。
- encoder_hidden_states:一个可选的张量,用于实现编码器-解码器结构时,传递编码器的输出给解码器。形状为(batch_size, encoder_sequence_length, hidden_size)。
- encoder_attention_mask:一个可选的浮点张量,用于实现编码器-解码器结构时,对编码器输出进行掩码。形状为(batch_size, encoder_sequence_length)或(batch_size, 1, 1, encoder_sequence_length)。
- use_cache:一个可选的布尔值,用于指示是否使用缓存机制。
- output_attentions:一个可选的布尔值,用于指示是否输出注意力权重张量。
RMSNorm
参见RMSNorm介绍
QWenMLP
class QWenMLP(nn.Module):def __init__(self, config):super().__init__()self.w1 = nn.Linear(config.hidden_size, config.intermediate_size // 2, bias=not config.no_bias)self.w2 = nn.Linear(config.hidden_size, config.intermediate_size // 2, bias=not config.no_bias)ff_dim_in = config.intermediate_size // 2self.c_proj = nn.Linear(ff_dim_in, config.hidden_size, bias=not config.no_bias)def forward(self, hidden_states):a1 = self.w1(hidden_states)a2 = self.w2(hidden_states)intermediate_parallel = a1 * F.silu(a2)output = self.c_proj(intermediate_parallel)return output
QWenAttention
class QWenAttention(nn.Module):def __init__(self, config, layer_number=None):super().__init__()self.register_buffer("masked_bias", torch.tensor(-1e4), persistent=False)self.seq_length = config.seq_lengthself.hidden_size = config.hidden_sizeself.split_size = config.hidden_size self.num_heads = config.num_attention_headsself.head_dim = self.hidden_size // self.num_headsself.use_flash_attn = config.use_flash_attnself.scale_attn_weights = True # scale_attn_weights:布尔标志,表示是否对注意力权重进行缩放self.projection_size = config.kv_channels * config.num_attention_heads # projection_siz:投影的大小assert self.projection_size % config.num_attention_heads == 0self.hidden_size_per_attention_head = (self.projection_size // config.num_attention_heads)self.c_attn = nn.Linear(config.hidden_size, 3 * self.projection_size)self.c_proj = nn.Linear(config.hidden_size, self.projection_size, bias=not config.no_bias)self.is_fp32 = not (config.bf16 or config.fp16)if (self.use_flash_attnand flash_attn_unpadded_func is not Noneand not self.is_fp32):self.core_attention_flash = FlashSelfAttention(causal=True, attention_dropout=config.attn_pdrop)self.bf16 = config.bf16self.use_dynamic_ntk = config.use_dynamic_ntkself.use_logn_attn = config.use_logn_attnlogn_list = [math.log(i, self.seq_length) if i > self.seq_length else 1for i in range(1, 32768)]logn_tensor = torch.tensor(logn_list)[None, :, None, None]self.register_buffer("logn_tensor", logn_tensor, persistent=False)self.attn_dropout = nn.Dropout(config.attn_dropout_prob)self.softmax_in_fp32 = config.softmax_in_fp32 if hasattr(config, 'softmax_in_fp32') else Falseself.use_cache_quantization = config.use_cache_quantization if hasattr(config, 'use_cache_quantization') else Falseself.use_cache_kernel = config.use_cache_kernel if hasattr(config,'use_cache_kernel') else Falsecache_dtype = torch.floatif self.bf16:cache_dtype=torch.bfloat16elif config.fp16:cache_dtype = torch.float16self.cache_qmax = torch.tensor(torch.iinfo(torch.uint8).max, dtype=cache_dtype)self.cache_qmin = torch.tensor(torch.iinfo(torch.uint8).min, dtype=cache_dtype)if config.use_cache_quantization and config.use_cache_kernel:from .cpp_kernels import cache_autogptq_cuda_256try:self.cache_kernels = cache_autogptq_cuda_256except ImportError:self.cache_kernels = None
注意力计算
def _attn(self, query, key, value, attention_mask=None, head_mask=None):attn_weights = torch.matmul(query, key.transpose(-1, -2))if self.scale_attn_weights: # 对注意力权重进行缩放attn_weights = attn_weights / torch.full([],value.size(-1) ** 0.5,dtype=attn_weights.dtype,device=attn_weights.device,)query_length, key_length = query.size(-2), key.size(-2)causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length]mask_value = torch.finfo(attn_weights.dtype).minmask_value = torch.full([], mask_value, dtype=attn_weights.dtype).to(attn_weights.device)attn_weights = torch.where(causal_mask, attn_weights.to(attn_weights.dtype), mask_value)attn_weights = nn.functional.softmax(attn_weights, dim=-1)attn_weights = attn_weights.type(value.dtype)attn_weights = self.attn_dropout(attn_weights)if head_mask is not None:attn_weights = attn_weights * head_maskattn_output = torch.matmul(attn_weights, value)attn_output = attn_output.transpose(1, 2)return attn_output, attn_weights
RotaryEmbedding
待补充
参考文献:
Qwen github
Qwen model