# 导入环境

In [None]:
from datasets import Dataset
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForSeq2Seq, TrainingArguments, Trainer, GenerationConfig

In [None]:
# Read the samples from train.json into a DataFrame, then wrap it as a `datasets.Dataset`.
df = pd.read_json('train.json')
ds = Dataset.from_pandas(df)

In [None]:
# Display the first three rows as a quick check of the loaded data.
ds[:3]

# 下载模型

In [None]:
# Fetch 'qwen/Qwen2-0.5B-Instruct' through ModelScope using 'qwen2-0.5b/' as the cache directory;
# the value returned by snapshot_download is kept in model_dir.
from modelscope import snapshot_download
model_dir = snapshot_download('qwen/Qwen2-0.5B-Instruct', cache_dir='qwen2-0.5b/')

# 处理数据集

In [None]:
# Load the tokenizer from the local model directory (use_fast=False requests the slow implementation).
tokenizer = AutoTokenizer.from_pretrained('./qwen2-0.5b/qwen/Qwen2-0___5B-Instruct/', use_fast=False, trust_remote_code=True)
tokenizer  # shown as the cell output

In [None]:
def process_func(example, tok=None, max_length=384):
    """Convert one chat sample into causal-LM training features.

    The prompt (system persona, user turn and assistant header) is tokenized
    separately from the response so that every prompt position can be masked
    out of the loss with -100; only the response and the end marker are
    supervised.

    Args:
        example: Mapping with the fields "instruction", "input" and "output".
            A None "instruction" or "input" is treated as an empty string.
        tok: Tokenizer to use. Defaults to the module-level ``tokenizer``.
            It must be callable as ``tok(text, add_special_tokens=False)``,
            return "input_ids" and "attention_mask" lists, and expose
            ``pad_token_id``.
        max_length: Maximum number of tokens kept per sample; longer samples
            are cut off at the end. One Chinese character may be split into
            several tokens, so the default is kept generous.

    Returns:
        Dict with three equal-length lists: "input_ids", "attention_mask"
        and "labels".
    """
    active_tok = tokenizer if tok is None else tok

    # Missing fields arrive as None; concatenating them would raise TypeError.
    user_text = (example["instruction"] or "") + (example["input"] or "")

    instruction = active_tok(
        f"<|im_start|>system\n现在你需要扮演我,和我的微信好友快乐聊天!<|im_end|>\n<|im_start|>user\n{user_text}<|im_end|>\n<|im_start|>assistant\n",
        add_special_tokens=False,
    )
    response = active_tok(f"{example['output']}", add_special_tokens=False)

    # NOTE(review): the end marker is pad_token_id, exactly as before. Confirm
    # that this is the token generation is expected to stop on, since pad and
    # eos ids can differ between tokenizers.
    end_id = active_tok.pad_token_id

    input_ids = instruction["input_ids"] + response["input_ids"] + [end_id]
    # The end marker must be attended to as well, hence the trailing 1.
    attention_mask = instruction["attention_mask"] + response["attention_mask"] + [1]
    # -100 excludes the prompt positions from the loss.
    labels = [-100] * len(instruction["input_ids"]) + response["input_ids"] + [end_id]

    # Slicing is a no-op for samples that already fit.
    return {
        "input_ids": input_ids[:max_length],
        "attention_mask": attention_mask[:max_length],
        "labels": labels[:max_length],
    }

In [None]:
# Apply process_func to every sample and drop all original columns, keeping only the features it returns.
tokenized_id = ds.map(process_func, remove_columns=ds.column_names)
tokenized_id

In [None]:
# Sanity check: decode the input ids of sample 0 back into text.
tokenizer.decode(tokenized_id[0]['input_ids'])

In [None]:
# Sanity check: decode the labels of sample 1, skipping every position set to -100.
tokenizer.decode([label for label in tokenized_id[1]["labels"] if label != -100])

# 创建模型

In [None]:
import torch

# Load the base causal LM from the local model directory with bfloat16 weights and device_map="auto".
model = AutoModelForCausalLM.from_pretrained('./qwen2-0.5b/qwen/Qwen2-0___5B-Instruct', device_map="auto",torch_dtype=torch.bfloat16)
model

In [None]:
# NOTE(review): presumably required because gradient_checkpointing=True is set in TrainingArguments
# further down — confirm against the Transformers documentation.
model.enable_input_require_grads()

In [None]:
# Display the model dtype (the model was loaded with torch_dtype=torch.bfloat16).
model.dtype

# lora 

In [None]:
from peft import LoraConfig, TaskType, get_peft_model

# LoRA settings for causal-LM fine-tuning on the listed projection modules.
config = LoraConfig(
    r=8,  # LoRA rank
    lora_alpha=32,  # LoRA alpha; see the LoRA method description for its role
    lora_dropout=0.1,  # dropout ratio
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,  # training mode
)
config

In [None]:
# Wrap the base model according to the LoRA config; `model` now refers to the wrapped model.
model = get_peft_model(model, config)
config

In [None]:
# Report the trainable-parameter summary of the wrapped model.
model.print_trainable_parameters()

# 配置训练参数

In [None]:
# Training hyper-parameters: batches of 4 per device with 4 gradient-accumulation steps,
# 3 epochs at learning rate 1e-4, logging every 10 steps, outputs under ./output/.
args = TrainingArguments(
    output_dir="./output/",
    num_train_epochs=3,
    learning_rate=1e-4,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    logging_steps=10,
)

In [None]:
# The collator is created with padding=True and handed to the Trainer together with the tokenized samples.
batch_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True)
trainer = Trainer(
    args=args,
    model=model,
    data_collator=batch_collator,
    train_dataset=tokenized_id,
)

In [None]:
# Start fine-tuning; output_dir in TrainingArguments is "./output/".
trainer.train()

# 合并加载模型,这里的路径可能有点不太一样,lora_path填写为Output的最后的checkpoint

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from peft import PeftModel

mode_path = './qwen2-0.5b/qwen/Qwen2-0___5B-Instruct'
lora_path = './output/checkpoint-10' # change this to the last checkpoint written under ./output/
# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(mode_path, trust_remote_code=True)

# Load the base model (bfloat16, put into eval mode)
model = AutoModelForCausalLM.from_pretrained(mode_path, device_map="auto",torch_dtype=torch.bfloat16, trust_remote_code=True).eval()

# Load the LoRA weights on top of the base model
model = PeftModel.from_pretrained(model, model_id=lora_path)

prompt = "在干啥呢?"

# The persona instruction is sent with the "system" role so that the rendered
# prompt matches the template the training samples were built with
# (system turn, then user turn). Sending it as a second "user" message would
# produce a prompt layout the adapter was never trained on.
messages = [
    {"role": "system", "content": "现在你需要扮演我,和我的微信好友快乐聊天!"},
    {"role": "user", "content": prompt},
]

# Move the inputs to wherever the model was placed instead of assuming CUDA;
# the model is loaded with device_map="auto".
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_tensors="pt",
    return_dict=True,
).to(model.device)


gen_kwargs = {"max_length": 2500, "do_sample": True, "top_k": 1}
with torch.no_grad():
    outputs = model.generate(**inputs, **gen_kwargs)
    # Keep only the newly generated tokens (drop the echoed prompt).
    outputs = outputs[:, inputs['input_ids'].shape[1]:]
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))

# Save the merged model and the tokenizer
save_directory = './model_merge'

# Fold the LoRA weights into the base model before saving. Calling
# save_pretrained on the PeftModel wrapper itself would write only the adapter
# files, not a standalone merged model.
merged_model = model.merge_and_unload()
merged_model.save_pretrained(save_directory)

# Save the tokenizer next to the merged weights
tokenizer.save_pretrained(save_directory)

# 然后把模型上传到modelscope开始下一步