This is the file I used on an H100 NVL in full precision (no 4-bit quantization):
```python
from unsloth import FastLanguageModel
import torch
max_seq_length = 32000 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = False # Use 4bit quantization to reduce memory usage. Can be False.
# 4-bit pre-quantized models Unsloth supports for 4x faster downloading + no OOMs
# (not used in this full-precision run, kept from the template).
fourbit_models = [
    "unsloth/mistral-7b-bnb-4bit",
    "unsloth/mistral-7b-instruct-v0.2-bnb-4bit",
    "unsloth/llama-2-7b-bnb-4bit",
    "unsloth/llama-2-13b-bnb-4bit",
    "unsloth/codellama-34b-bnb-4bit",
    "unsloth/tinyllama-bnb-4bit",
] # More models at https://huggingface.co/unsloth
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "Qwen/Qwen2.5-32B-Instruct", # Choose ANY! eg teknium/OpenHermes-2.5-Mistral-7B
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)
model = FastLanguageModel.get_peft_model(
    model,
    r = 8, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj"],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none", # Supports any, but = "none" is optimized
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
    use_rslora = False, # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

def formatting_prompts_func(examples):
    # Turn each (instruction, input, output) row into one chat-formatted string;
    # SFTTrainer reads it from the "text" column.
    convos = []
    for i in range(len(examples["input"])):
        msg = [{"role": "system", "content": examples["instruction"][i]},
               {"role": "user", "content": examples["input"][i]},
               {"role": "assistant", "content": examples["output"][i]}]
        convos.append(msg)
    texts = [tokenizer.apply_chat_template(convo, tokenize = False, add_generation_prompt = False).strip() for convo in convos]
    return { "text" : texts, }

from datasets import load_dataset

train_dataset = load_dataset("csv", data_files="train_dataset_v1.csv", split="train")
train_dataset = train_dataset.map(formatting_prompts_func, batched = True)
eval_dataset = load_dataset("csv", data_files="eval_dataset_v1.csv", split="train")
eval_dataset = eval_dataset.map(formatting_prompts_func, batched = True)

from trl import DataCollatorForCompletionOnlyLM

# Restrict the loss to the assistant turn only. Token ids are passed instead of the raw
# string, and the leading newline token is dropped with [1:] so the template matches how
# it is tokenized inside a full conversation rather than in isolation.
response_template_string = "\n<|im_start|>assistant\n"
response_template_ids = tokenizer.encode(response_template_string, add_special_tokens=False)[1:]
data_collator = DataCollatorForCompletionOnlyLM(
    response_template=response_template_ids,
    tokenizer=tokenizer,
)
from trl import SFTTrainer
from transformers import TrainingArguments

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_dataset,
    eval_dataset = eval_dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 8,
    data_collator = data_collator,
    packing = False, # Packing must stay off when using DataCollatorForCompletionOnlyLM.
    args = TrainingArguments(
        per_device_train_batch_size = 1,
        gradient_accumulation_steps = 16,
        warmup_steps = 5,
        num_train_epochs = 1,
        learning_rate = 2e-4,
        fp16 = not torch.cuda.is_bf16_supported(),
        bf16 = torch.cuda.is_bf16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        report_to = "wandb",
        fp16_full_eval = True,
        per_device_eval_batch_size = 1,
        eval_accumulation_steps = 1,
        eval_strategy = "steps",
        eval_steps = 100,
        output_dir = "outputs",
        save_strategy = "steps",
        save_steps = 100,
        run_name = "trial",
    ),
)
```
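
One thing worth sanity-checking with `DataCollatorForCompletionOnlyLM` is that the response template is actually found after tokenization; if it isn't, TRL masks the entire example (it does warn, but that's easy to miss in the logs) and the model effectively trains on nothing. A minimal check along these lines, reusing the `tokenizer`, `data_collator`, and `train_dataset` objects defined above:

```python
# Quick sanity check (not part of the training file above): verify that some tokens
# remain unmasked, i.e. the "<|im_start|>assistant" template was matched.
sample = tokenizer(train_dataset[0]["text"])
batch = data_collator([sample])
labels = batch["labels"][0]
kept = int((labels != -100).sum())
print(f"{kept} of {labels.shape[0]} tokens contribute to the loss")
assert kept > 0, "response template not found - revisit the [1:] slicing"
```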
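
The snippet stops at the trainer definition; launching the run and saving the LoRA adapter afterwards looks roughly like this (the output directory name is just a placeholder, not from the original file):

```python
trainer.train()

# save_pretrained on the PEFT model stores only the LoRA adapter weights + config,
# not the full 32B base model.
model.save_pretrained("qwen2.5-32b-lora")
tokenizer.save_pretrained("qwen2.5-32b-lora")
```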