I'm pretty new to LLM fine-tuning and have been working on a small personal project. I'm fine-tuning Meta LLaMA 3.1 8B Instruct using Hugging Face's Trainer API with LoRA on a multi-GPU setup (6x L4 GPUs). My goal is a text-to-text model whose output contains a class (`class=0|1`) and a `description=…` text, and I want to evaluate the model after each epoch using custom callbacks with my own metrics (classification + description scoring). My dataset is huge (~7M examples), so it's important that training actually uses all my GPUs.
I've tried following many different online examples and posts but could not find a fully suitable solution for all my needs. For example:

- I used the unsloth example here https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3.1_(8B)-Alpaca.ipynb and prepared my dataset properly. The code has been running fine for weeks now, but it only uses a single GPU for the fine-tuning. I looked into running the code with torchrun and accelerate but ran into issues like `ValueError: You can't train a model that has been loaded with device_map='auto' in any distributed mode.` (the sketch right after this list shows what I've been attempting for the launch). I looked into opensloth too but decided not to use it (honestly cannot remember why).
- I used llama-factory, which was really fast and used my multi-GPU setup, but since I was going through the llamafactory-cli tool I could not pass a custom TrainerCallback to run the evaluation and compute the custom metrics I need after each epoch, which matters especially since it takes weeks to get results back.
- I tried using the run_exp function from the llama-factory repo, bypassing the llamafactory-cli tool, since that way I can pass the TrainerCallback, but I ran into problems tokenizing and converting my eval dataset to the required layout (the Llama 3 template).
- I tried again with the raw Trainer class from Hugging Face, with and without LoRA and with torchrun, but kept either running OOM or getting errors like "tensors do not require grad".
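For reference, my current understanding of that `device_map` error is that under DDP each process needs its own full copy of the model, so the model has to be loaded without `device_map='auto'` and the script has to be started with torchrun or accelerate instead of plain `python`. This is the minimal, untested sketch of what I've been attempting (the script name `train.py` and the GPU count are just placeholders for my setup):

```python
# Launch with one process per GPU on my 6x L4 box:
#   torchrun --nproc_per_node=6 train.py
#   # or: accelerate launch --num_processes 6 train.py
import torch
from transformers import AutoModelForCausalLM

# No device_map="auto" here -- that is what triggers the ValueError in distributed mode.
# As far as I understand, the Trainer then moves the model to each process's local GPU itself.
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Meta-Llama-3.1-8B-Instruct",
    torch_dtype=torch.bfloat16,
)
```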
My dataset looks like the following (I filled in random text just to show how it might look): `{"input": "input text to classify and give description", "output": "Class=0\nDescription=…"}`
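Related to the template problem above, this is how I think a single example like that could be converted to the Llama 3 layout by letting the tokenizer's built-in chat template do the formatting, instead of hand-writing the special tokens like I do in `format_prompt` below (untested sketch):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct")

example = {
    "input": "input text to classify and give description",
    "output": "Class=0\nDescription=...",
}

# One training example: the user turn plus the target assistant turn.
messages = [
    {"role": "user", "content": example["input"]},
    {"role": "assistant", "content": example["output"]},
]

# add_generation_prompt=False because the assistant answer (the label) is already included.
text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)
print(text)  # <|begin_of_text|><|start_header_id|>user<|end_header_id|> ... <|eot_id|> ...
```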
Below is my latest code with the raw Trainer class from Hugging Face:

```python
import os
import torch
import re
import json
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForSeq2Seq,
    TrainerCallback,
)
from peft import LoraConfig, get_peft_model, TaskType, prepare_model_for_kbit_training
from sklearn.metrics import (
    classification_report,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
)
from tqdm import tqdm

import nltk
import datetime
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer


def format_prompt(input_text):
    instruction = "Here is an example XYZ, classify the text into one of the classes A=…, B=…, C=… and give a short description why."
    return (
        "<|start_header_id|>user<|end_header_id|>\n"
        f"{instruction}\n{input_text.strip()}<|eot_id|>\n"
        "<|start_header_id|>assistant<|end_header_id|>\n"
    )
class CustomEvalCallback(TrainerCallback):
    def on_epoch_end(self, args, state, control, **kwargs):
        trainer = kwargs["trainer"]
        model = trainer.model
        tokenizer = trainer.tokenizer
        eval_dataset = trainer.eval_dataset
        epoch = int(state.epoch)
        now = datetime.datetime.now().strftime("%Y%m%d%H%M%S")

        output_dir = os.path.join(args.output_dir, f"epoch_{epoch}")
        os.makedirs(output_dir, exist_ok=True)
        model.save_pretrained(output_dir, safe_serialization=True)
        tokenizer.save_pretrained(output_dir)

        preds, refs, descs, pred_descs = [], [], [], []
        raw_outputs = []
        rouge = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)

        for i, example in enumerate(tqdm(eval_dataset, desc=f"Inference Epoch {epoch}")):
            try:
                prompt = format_prompt(example["input"])
                inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=2048).to(model.device)
                with torch.no_grad():
                    output_ids = model.generate(
                        **inputs,
                        max_new_tokens=100,
                        do_sample=False,
                        num_beams=1,
                    )
                decoded = tokenizer.decode(output_ids[0], skip_special_tokens=True)
                output_ref = example["output"]

                true_label = re.search(r"Class=\s*([ABC])", output_ref).group(1)
                pred_label_match = re.search(r"Class=\s*([ABC])", decoded)
                pred_label = pred_label_match.group(1) if pred_label_match else None

                desc_match = re.search(r"Description=\s*(.*)", output_ref)
                pred_desc_match = re.search(r"Description=\s*(.*)", decoded)
                desc = desc_match.group(1).strip() if desc_match else ""
                pred_desc = pred_desc_match.group(1).strip() if pred_desc_match else ""

                refs.append(true_label)
                preds.append(pred_label)
                descs.append(desc)
                pred_descs.append(pred_desc)

                raw_outputs.append({
                    "index": i,
                    "input": example["input"],
                    "expected_output": output_ref,
                    "predicted_output": decoded,
                    "match": pred_label == true_label if pred_label is not None else False,
                    "label": true_label,
                    "pred_label": pred_label,
                    "desc": desc,
                    "pred_desc": pred_desc,
                })
            except Exception as e:
                print(f"[Warning] Skipping example {i}: {e}")
                continue

        report = classification_report(refs, preds, output_dict=True, digits=4)
        acc = accuracy_score(refs, preds)
        prec = precision_score(refs, preds)
        rec = recall_score(refs, preds)
        f1 = f1_score(refs, preds)

        bleu_scores = [
            sentence_bleu([nltk.word_tokenize(r)], nltk.word_tokenize(p)) if p else 0.0
            for r, p in zip(descs, pred_descs)
        ]
        rouge_scores = [
            rouge.score(r, p)["rougeL"].fmeasure if p else 0.0
            for r, p in zip(descs, pred_descs)
        ]

        with open(os.path.join(output_dir, f"eval_outputs_{now}.jsonl"), "w") as f:
            for line in raw_outputs:
                f.write(json.dumps(line) + "\n")

        full_metrics = {
            "classification": {
                "accuracy": acc,
                "precision": prec,
                "recall": rec,
                "f1": f1,
                "confusion_matrix": confusion_matrix(refs, preds).tolist(),
                "report": report,
            },
            "explanation_scores": {
                "BLEU_avg": sum(bleu_scores) / len(bleu_scores),
                "ROUGE-L_avg": sum(rouge_scores) / len(rouge_scores),
            },
        }
        with open(os.path.join(output_dir, f"eval_metrics_{now}.json"), "w") as f:
            json.dump(full_metrics, f, indent=2)

        print(f"\nClassification Accuracy: {acc:.4f}")
        print("Explanation Scores:")
        print(f"  BLEU: {full_metrics['explanation_scores']['BLEU_avg']:.4f}")
        print(f"  ROUGE-L: {full_metrics['explanation_scores']['ROUGE-L_avg']:.4f}")
        print(f"\nSaved to: {output_dir}")

        log_path = os.path.join(args.output_dir, "metrics_log.jsonl")
        epoch_log = {
            "epoch": epoch,
            "accuracy": acc,
            "precision": prec,
            "recall": rec,
            "f1": f1,
            "bleu": full_metrics["explanation_scores"]["BLEU_avg"],
            "rougeL": full_metrics["explanation_scores"]["ROUGE-L_avg"],
        }
        with open(log_path, "a") as f:
            f.write(json.dumps(epoch_log) + "\n")

        return control

def main():
    MODEL_NAME = "meta-llama/Meta-Llama-3.1-8B-Instruct"
    OUTPUT_DIR = "out"
    TRAIN_FILE = "data/train_instruct.json"
    EVAL_FILE = "data/eval_instruct.json"

    USE_BF16 = True
    LORA_RANK = 8
    MAX_LEN = 2048
    MAX_NEW_TOKENS = 100
    BATCH_SIZE = 1
    GRAD_ACC = 8
    NUM_EPOCHS = 3
    LEARNING_RATE = 2e-4
    SEED = 47

    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        torch_dtype=torch.bfloat16,
        trust_remote_code=True,
    )
    model = prepare_model_for_kbit_training(model)

    # NOTE: the peft_config definition got lost when I pasted this; this is
    # roughly what I use (rank from LORA_RANK, the rest are assumed defaults).
    peft_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        r=LORA_RANK,
        lora_alpha=16,
        lora_dropout=0.05,
    )
    model = get_peft_model(model, peft_config)

    dataset = load_dataset("json", data_files={"train": TRAIN_FILE, "eval": EVAL_FILE})

    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
    tokenizer.pad_token = tokenizer.eos_token

    def tokenize(example):
        prompt = format_prompt(example["input"])
        full = prompt + example["output"]
        tokenized = tokenizer(full, truncation=True, padding="max_length", max_length=MAX_LEN)
        tokenized["labels"] = tokenized["input_ids"].copy()
        return tokenized

    tokenized_ds = dataset.map(tokenize, remove_columns=["input", "output"])

    args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        gradient_accumulation_steps=GRAD_ACC,
        gradient_checkpointing=True,
        num_train_epochs=NUM_EPOCHS,
        learning_rate=LEARNING_RATE,
        logging_steps=10,
        save_strategy="epoch",
        eval_strategy="epoch",
        do_train=True,
        do_eval=True,
        bf16=USE_BF16,
        seed=SEED,
        report_to="none",
        save_safetensors=True,
        ddp_timeout=180000000,
        lr_scheduler_type="cosine",
        warmup_ratio=0.1,
        save_total_limit=2,
        load_best_model_at_end=True,
    )

    data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, padding=True)

    trainer = Trainer(
        model=model,
        processing_class=tokenizer,
        args=args,
        train_dataset=tokenized_ds["train"],
        eval_dataset=dataset["eval"],
        data_collator=data_collator,
        callbacks=[CustomEvalCallback()],
    )
    trainer.train()

    model.save_pretrained(f"{OUTPUT_DIR}/final")
    tokenizer.save_pretrained(f"{OUTPUT_DIR}/final")

if __name__ == "__main__":
    main()
```
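One thing I'm not sure about in the callback above is whether `kwargs["trainer"]` actually exists inside `on_epoch_end`. A variant I've also been experimenting with gives the callback an explicit reference to the trainer after it is built, and only runs the slow generation loop on the main process under DDP. This is an untested sketch that reuses `model`, `args`, `tokenized_ds`, `dataset` and `data_collator` from `main()` above, and assumes a transformers version where the Trainer stores `processing_class`:

```python
from transformers import Trainer, TrainerCallback


class CustomEvalCallback(TrainerCallback):
    def __init__(self):
        self.trainer = None  # set after the Trainer is constructed

    def on_epoch_end(self, args, state, control, **kwargs):
        # Only rank 0 runs the custom generation/metrics loop under DDP.
        if not state.is_world_process_zero:
            return control
        model = self.trainer.model
        tokenizer = self.trainer.processing_class
        eval_dataset = self.trainer.eval_dataset
        # ... same generation + metric code as in my callback above ...
        return control


# Inside main(), after building the model, datasets and collator:
callback = CustomEvalCallback()
trainer = Trainer(
    model=model,
    processing_class=tokenizer,
    args=args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=dataset["eval"],
    data_collator=data_collator,
    callbacks=[callback],
)
callback.trainer = trainer  # give the callback a handle back to the trainer
trainer.train()
```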
I'm really just interested in a code example that lets me run the fine-tuning on multiple GPUs and run custom callbacks after each epoch.
I'm very much a beginner and learning as I go, so please be kind :).
submitted by /u/-_-JustALurker-_- to r/learnmachinelearning