Qwen2 (Qianwen) large language model inference
Model: Qwen2-1.5B-Instruct, downloaded from Hugging Face. The script below loads the model with transformers, wraps it in a thin SpeechLLM shell (the speech projector class is kept but unused for text-only decoding), and runs greedy decoding on a short Chinese prompt.
from dataclasses import dataclass, field
from typing import Optional

import torch
import torch.nn as nn
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM, PreTrainedModel
import numpy as np
import random

@dataclass
class DecodeArguments:
    llm_type: str = 'qwen2'
    decode_type: str = 'llm'
    max_new_tokens: int = 50
    num_beams: int = 1
    batch_size: int = 1
    result_path: str = "result.txt"


@dataclass
class ModelArguments:
    llm_model_name_or_path: Optional[str] = "D:/2-LearningCode/913_LM/wesr-main/Qwen2-1.5B-Instruct"
    projector_hidden_size: int = 2048
    projector_model_path: Optional[str] = field(default=None)

class ProjectorConv1d(nn.Module):
    """Downsamples speech-encoder features by a factor of k with a strided
    Conv1d, then maps them into the LLM embedding space. Not used in this
    text-only demo, but kept for completeness."""

    def __init__(self, config, encoder_dim, llm_dim):
        super().__init__()
        self.k = config.encoder_projector_ds_rate
        self.conv1d = nn.Conv1d(in_channels=encoder_dim,
                                out_channels=encoder_dim,
                                kernel_size=self.k,
                                stride=self.k,
                                padding=0)
        self.linear1 = nn.Linear(encoder_dim, config.projector_hidden_size)
        self.relu1 = nn.ReLU()
        self.linear2 = nn.Linear(config.projector_hidden_size, llm_dim)
        self.relu2 = nn.ReLU()

    def forward(self, x):
        # x: (batch, time, encoder_dim); Conv1d expects (batch, channels, time)
        x = x.transpose(1, 2)
        x = self.conv1d(x)   # time axis shrinks by a factor of k
        x = x.transpose(1, 2)
        x = self.relu1(x)
        x = self.linear1(x)
        x = self.relu2(x)
        x = self.linear2(x)  # (batch, time // k, llm_dim)
        return x

def freeze_model(model):
    # Freeze all parameters (used during projector training; unused here).
    for _, param in model.named_parameters():
        param.requires_grad = False

class SpeechLLM(PreTrainedModel):
    supports_gradient_checkpointing = True

    def __init__(
        self,
        llm: nn.Module,
        config,
        model_args: ModelArguments,
    ):
        super().__init__(config)
        self.llm = llm

    # Autocast to bfloat16 on CUDA; on CPU-only machines recent PyTorch
    # versions disable this context with a warning instead of failing.
    @torch.autocast(device_type="cuda", dtype=torch.bfloat16)
    def generate(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        eos_token_id=None,
        decode_config=None,
    ):
        # Look up the token embeddings explicitly so that speech embeddings
        # could later be spliced in; generation is then driven by
        # inputs_embeds. With inputs_embeds (and no input_ids), recent
        # transformers versions return only the newly generated token ids.
        text_emb = self.llm.get_input_embeddings()(input_ids)
        model_outputs = self.llm.generate(
            inputs_embeds=text_emb,
            attention_mask=attention_mask,
            do_sample=False,  # greedy / beam search for reproducibility
            top_p=1.0,
            num_beams=decode_config.num_beams,
            max_new_tokens=decode_config.max_new_tokens,
            eos_token_id=eos_token_id,
        )
        return model_outputs

def init_model(model_args):
    # Load LLM model and tokenizer
    config = transformers.AutoConfig.from_pretrained(
        model_args.llm_model_name_or_path)
    # use_cache=True enables the KV cache during autoregressive decoding.
    # The original code set it to False, a training-time leftover that makes
    # generation much slower without changing the result.
    config.use_cache = True
    llm_model = AutoModelForCausalLM.from_pretrained(
        model_args.llm_model_name_or_path,
        config=config,
        torch_dtype='auto',
    )
    model = SpeechLLM(llm_model, config, model_args)

    # load_projector is not defined in this trimmed-down demo, so
    # projector_model_path must stay None for text-only inference.
    if model_args.projector_model_path is not None:
        model.load_projector(model_args.projector_model_path)

    return model

def set_seed(seed=42):
    # Seed PyTorch
    torch.manual_seed(seed)
    # When running on GPU, also seed CUDA
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)  # for multi-GPU setups
    # Make cuDNN pick deterministic convolution algorithms
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    # Seed numpy
    np.random.seed(seed)
    # Seed Python's random module
    random.seed(seed)

def main():
    set_seed()
    model_args = ModelArguments()
    decode_args = DecodeArguments()

    model = init_model(model_args)
    tokenizer = AutoTokenizer.from_pretrained(model_args.llm_model_name_or_path)

    # Quick tokenizer round-trip sanity check
    text = "halcom.cn, welcome to halcom.cn"
    ids = tokenizer.encode(text)
    print("Encoded:", ids)
    print("Decoded:", tokenizer.decode(ids))

    if decode_args.llm_type == 'qwen2':
        # Qwen2 stops on either of these special tokens
        eos_token_id = tokenizer.convert_tokens_to_ids(['<|endoftext|>', '<|im_end|>'])
    else:
        # Llama-3 style special tokens
        tokenizer.pad_token = '<|finetune_right_pad_id|>'
        eos_token_id = tokenizer.convert_tokens_to_ids(['<|end_of_text|>', '<|eot_id|>'])

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    model.eval()

    prompt = '编写一段代码'  # "Write a piece of code"
    vocab_size = max(eos_token_id)
    max_seq_len = 50
    input_ids = tokenizer.encode(prompt)
    # Defensive: drop ids at or above the largest special-token id and cap the
    # prompt length. For an ordinary prompt this filter is a no-op.
    input_ids = [i for i in input_ids if i < vocab_size][:max_seq_len]
    input_tensor = torch.tensor([input_ids], dtype=torch.long).to(device)
    seq_len = input_tensor.size(1)
    # attention_mask is a padding mask (1 = real token); the causal mask is
    # applied inside the model. The original torch.tril(torch.ones(1, seq_len))
    # kept only the first position and masked out the rest of the prompt.
    mask = torch.ones(1, seq_len, dtype=torch.long).to(device)

    generated_ids = model.generate(input_ids=input_tensor,
                                   attention_mask=mask,
                                   eos_token_id=eos_token_id,
                                   decode_config=decode_args)
    # generated_ids holds only the newly generated tokens, because generation
    # was driven by inputs_embeds.
    generated_text = tokenizer.decode(generated_ids.squeeze(0), skip_special_tokens=True)

    print(f'Input: {prompt}\nOutput: {generated_text}')


if __name__ == "__main__":
    main()
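
Although the projector is unused in this text-only demo, a quick shape check makes the Conv1d downsampling concrete. This is a minimal sketch, assuming ProjectorConv1d from the script above is in scope; the config object is a hypothetical stand-in carrying only the two fields the class reads, and encoder_dim/llm_dim are illustrative values (1536 matches Qwen2-1.5B's hidden size).

import torch
from types import SimpleNamespace

# Hypothetical config stand-in with the two fields ProjectorConv1d reads.
cfg = SimpleNamespace(encoder_projector_ds_rate=4, projector_hidden_size=2048)
proj = ProjectorConv1d(cfg, encoder_dim=512, llm_dim=1536)

x = torch.randn(2, 100, 512)  # (batch, time, encoder_dim)
y = proj(x)
print(y.shape)                # torch.Size([2, 25, 1536]): time 100 -> 100 // 4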
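One caveat: the script feeds a raw prompt into an Instruct model. Qwen2-Instruct is trained with a ChatML-style template, so wrapping the prompt with tokenizer.apply_chat_template generally gives noticeably better answers. A minimal sketch, assuming the same local model path as above:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    "D:/2-LearningCode/913_LM/wesr-main/Qwen2-1.5B-Instruct")
messages = [{"role": "user", "content": "编写一段代码"}]
# add_generation_prompt appends the assistant header so the model starts answering.
input_ids = tokenizer.apply_chat_template(messages,
                                          add_generation_prompt=True,
                                          return_tensors="pt")
# input_ids can then replace input_tensor in main() above.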