This is the file I used to fine-tune Qwen2.5-32B-Instruct in full precision (no 4-bit quantization) on an H100 NVL.
```
from unsloth import FastLanguageModel
import torch
max_seq_length = 32000 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = False # Use 4bit quantization to reduce memory usage. Can be False.
 
# 4bit pre-quantized models Unsloth hosts for 4x faster downloading + no OOMs
# (kept for reference; unused here since load_in_4bit = False).
fourbit_models = [
    "unsloth/mistral-7b-bnb-4bit",
    "unsloth/mistral-7b-instruct-v0.2-bnb-4bit",
    "unsloth/llama-2-7b-bnb-4bit",
    "unsloth/llama-2-13b-bnb-4bit",
    "unsloth/codellama-34b-bnb-4bit",
    "unsloth/tinyllama-bnb-4bit",
] # More models at https://huggingface.co/unsloth
 
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "Qwen/Qwen2.5-32B-Instruct", # Choose ANY! eg teknium/OpenHermes-2.5-Mistral-7B
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)
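# Note: with dtype = None above, Unsloth auto-detects bfloat16 on an H100, so the 32B weights load in bf16.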
 
 
model = FastLanguageModel.get_peft_model(
    model,
    r = 8, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)
 
 
def formatting_prompts_func(examples):
    """Map the instruction/input/output CSV columns to chat-template text."""
    convos = []
    for i in range(len(examples["input"])):
        msg = [{"role": "system", "content": examples["instruction"][i]},
               {"role": "user", "content": examples["input"][i]},
               {"role": "assistant", "content": examples["output"][i]}]
        convos.append(msg)
    texts = [tokenizer.apply_chat_template(convo, tokenize = False, add_generation_prompt = False).strip() for convo in convos]
    return { "text" : texts, }
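
# Each row becomes one ChatML-formatted string, roughly:
#   <|im_start|>system\n...<|im_end|>\n<|im_start|>user\n...<|im_end|>\n<|im_start|>assistant\n...<|im_end|>
# which is why the completion-only collator below anchors on "<|im_start|>assistant".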
 
 
from datasets import load_dataset

train_dataset = load_dataset("csv", data_files = "train_dataset_v1.csv", split = "train")
train_dataset = train_dataset.map(formatting_prompts_func, batched = True)

eval_dataset = load_dataset("csv", data_files = "eval_dataset_v1.csv", split = "train")
eval_dataset = eval_dataset.map(formatting_prompts_func, batched = True)
 
 
 
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM
 
response_template_string = "\n<|im_start|>assistant\n"
# Drop the first token id: a leading "\n" tokenizes differently on its own than it does
# mid-sequence, so slicing makes the ids match how the template actually appears in context.
response_template_ids = tokenizer.encode(response_template_string, add_special_tokens=False)[1:]
        
data_collator = DataCollatorForCompletionOnlyLM(
    response_template=response_template_ids,
    tokenizer=tokenizer,
)
 
 
from transformers import TrainingArguments
 
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_dataset,
    eval_dataset = eval_dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 8,
    data_collator=data_collator,
    packing = False, # Must stay False: packing is not compatible with DataCollatorForCompletionOnlyLM.
    args = TrainingArguments(
        per_device_train_batch_size = 1,
        gradient_accumulation_steps = 16,
        warmup_steps = 5,
        num_train_epochs=1,
        learning_rate = 2e-4,
        fp16 = not torch.cuda.is_bf16_supported(),
        bf16 = torch.cuda.is_bf16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        report_to="wandb",
        fp16_full_eval = True,
        per_device_eval_batch_size = 1,
        eval_accumulation_steps = 1,
        eval_strategy = "steps",
        eval_steps = 100,
        output_dir = "outputs",
        save_strategy = "steps",
        save_steps = 100,
        run_name="trial",
 
    ),
)
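
# Kick off training; the collator ensures loss is computed only on the assistant completions.
trainer_stats = trainer.train()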
 
```
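
For reference, a minimal sketch of how I'd save the LoRA adapter and smoke-test generation afterwards. The `lora_model` directory name is arbitrary, and this assumes the training step above has finished:
```
# Save only the small LoRA adapter weights plus the tokenizer.
model.save_pretrained("lora_model")      # "lora_model" is an arbitrary directory name
tokenizer.save_pretrained("lora_model")

# Quick generation check using Unsloth's inference mode.
FastLanguageModel.for_inference(model)
messages = [{"role": "user", "content": "Hello!"}]
input_ids = tokenizer.apply_chat_template(
    messages, tokenize = True, add_generation_prompt = True, return_tensors = "pt",
).to("cuda")
outputs = model.generate(input_ids = input_ids, max_new_tokens = 64)
print(tokenizer.decode(outputs[0], skip_special_tokens = True))
```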