#!/usr/bin/env python3
"""
Grace LoRA fine-tuning script using Unsloth.

Model:  Qwen3-8B (loaded from local GGUF or HuggingFace)
GPU:    GPU 1 (GTX 1080, 8GB VRAM) — GPU 0 reserved for live inference
Data:   ~/training-data/cleaned/  (SFT)
        ~/training-data/dpo/      (DPO preference pairs)

Usage:
    source ~/unsloth-env/bin/activate
    python3 ~/scripts/finetune-lora.py [--dpo] [--dry-run]

Output: ~/models/grace-lora-YYYYMMDD/ (LoRA adapter)
        Copied to grace@192.168.20.87:~/models/

IMPORTANT: Do not run this until we have 200+ clean examples.
Current count is tracked in ~/self-improving/convert-state.json
"""

import argparse
import glob
import json
import os
import shutil      # NOTE(review): unused for now — reserved for the adapter-copy step in the header
import subprocess  # NOTE(review): unused for now — reserved for the adapter-copy step in the header
from datetime import datetime, timezone

# ── Config ──────────────────────────────────────────────────────────────────
NFS_BASE = "/mnt/ai-storage/grace"
CLEAN_DIR = os.path.join(NFS_BASE, "training-data/cleaned")
DPO_DIR = os.path.join(NFS_BASE, "training-data/dpo")
OUTPUT_BASE = os.path.join(NFS_BASE, "models")

# Use GPU 1 only — GPU 0 is running inference
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

# LoRA hyperparameters (safe defaults for Qwen3-8B on 8GB VRAM)
LORA_CONFIG = {
    "r": 16,
    "lora_alpha": 16,
    "lora_dropout": 0.1,
    "target_modules": ["q_proj", "k_proj", "v_proj", "o_proj",
                       "gate_proj", "up_proj", "down_proj"],
}

# SFT training config
SFT_CONFIG = {
    "learning_rate": 1e-4,
    "num_train_epochs": 1,
    "per_device_train_batch_size": 2,
    "gradient_accumulation_steps": 4,
    "max_seq_length": 8192,
    # NOTE: recorded in run-meta.json only. TRL's SFTTrainer has no KL-anchor
    # option, so this value is NOT applied during training.
    "kl_coef": 0.05,
    "warmup_ratio": 0.05,
    "lr_scheduler_type": "cosine",
    # GTX 1080 (Pascal) has no bf16 support. Requesting bf16 makes
    # transformers raise a ValueError — it does not silently fall back to
    # fp16 — so train in fp16 explicitly.
    "fp16": True,
    "bf16": False,
}

# DPO config
DPO_CONFIG = {
    "beta": 0.1,  # start conservative; sweep upward if needed
    "learning_rate": 2e-5,
    "num_train_epochs": 1,
    "per_device_train_batch_size": 1,
    "gradient_accumulation_steps": 8,
    "max_length": 4096,
}

MIN_EXAMPLES_SFT = 200
MIN_EXAMPLES_DPO = 50


def count_examples(directory: str) -> int:
    """Count non-blank lines across all *.jsonl files in *directory*.

    Returns 0 when the directory is missing or contains no JSONL files
    (glob simply yields nothing), so pre-flight checks never crash.
    """
    total = 0
    for f in glob.glob(os.path.join(directory, "*.jsonl")):
        with open(f) as fh:
            total += sum(1 for line in fh if line.strip())
    return total


def load_dataset_from_dir(directory: str):
    """Load all JSONL files in a directory into a HuggingFace dataset.

    Raises:
        FileNotFoundError: if the directory contains no *.jsonl files.
    """
    from datasets import load_dataset

    files = sorted(glob.glob(os.path.join(directory, "*.jsonl")))
    if not files:
        raise FileNotFoundError(f"No JSONL files found in {directory}")
    return load_dataset("json", data_files=files, split="train")


def run_sft(model, tokenizer, output_dir: str, dry_run: bool = False):
    """Run supervised fine-tuning with unsloth + TRL.

    (The ``kl_coef`` entry in SFT_CONFIG is metadata only — SFTTrainer has
    no KL-anchor parameter; see the config comment.)

    Args:
        model: PEFT-wrapped model from FastLanguageModel.get_peft_model.
        tokenizer: tokenizer; the qwen-3 chat template is applied here.
        output_dir: where the trained adapter + tokenizer are saved.
        dry_run: if True, only report the dataset size and return.
    """
    from trl import SFTTrainer, SFTConfig
    from unsloth.chat_templates import get_chat_template

    tokenizer = get_chat_template(tokenizer, chat_template="qwen-3")
    dataset = load_dataset_from_dir(CLEAN_DIR)
    print(f"SFT dataset: {len(dataset)} examples")

    if dry_run:
        print("[DRY RUN] Would train SFT on", len(dataset), "examples")
        return

    def format_chat(example):
        # Flatten the conversation list into a single chat-templated string.
        convs = example["conversations"]
        text = tokenizer.apply_chat_template(
            convs, tokenize=False, add_generation_prompt=False
        )
        return {"text": text}

    dataset = dataset.map(format_chat, remove_columns=dataset.column_names)

    trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        train_dataset=dataset,
        args=SFTConfig(
            output_dir=output_dir,
            learning_rate=SFT_CONFIG["learning_rate"],
            num_train_epochs=SFT_CONFIG["num_train_epochs"],
            per_device_train_batch_size=SFT_CONFIG["per_device_train_batch_size"],
            gradient_accumulation_steps=SFT_CONFIG["gradient_accumulation_steps"],
            max_seq_length=SFT_CONFIG["max_seq_length"],
            warmup_ratio=SFT_CONFIG["warmup_ratio"],
            lr_scheduler_type=SFT_CONFIG["lr_scheduler_type"],
            fp16=SFT_CONFIG["fp16"],
            bf16=SFT_CONFIG["bf16"],
            dataset_text_field="text",
            save_strategy="epoch",
            logging_steps=10,
        ),
    )
    trainer.train()
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    print(f"SFT adapter saved → {output_dir}")


def run_dpo(model, tokenizer, output_dir: str, dry_run: bool = False):
    """Run DPO preference tuning; adapter is saved to ``output_dir + "-dpo"``.

    Args:
        model: PEFT-wrapped model (ideally post-SFT).
        tokenizer: tokenizer matching the model.
        output_dir: base output path; "-dpo" is appended for this stage.
        dry_run: if True, only report the dataset size and return.
    """
    from trl import DPOTrainer, DPOConfig

    dataset = load_dataset_from_dir(DPO_DIR)
    print(f"DPO dataset: {len(dataset)} pairs")

    if dry_run:
        print("[DRY RUN] Would train DPO on", len(dataset), "pairs")
        return

    trainer = DPOTrainer(
        model=model,
        ref_model=None,  # unsloth handles reference model internally
        tokenizer=tokenizer,
        train_dataset=dataset,
        args=DPOConfig(
            output_dir=output_dir + "-dpo",
            beta=DPO_CONFIG["beta"],
            learning_rate=DPO_CONFIG["learning_rate"],
            num_train_epochs=DPO_CONFIG["num_train_epochs"],
            per_device_train_batch_size=DPO_CONFIG["per_device_train_batch_size"],
            gradient_accumulation_steps=DPO_CONFIG["gradient_accumulation_steps"],
            max_length=DPO_CONFIG["max_length"],
            fp16=True,  # Pascal GPU — see SFT_CONFIG comment
            save_strategy="epoch",
            logging_steps=5,
        ),
    )
    trainer.train()
    model.save_pretrained(output_dir + "-dpo")
    print(f"DPO adapter saved → {output_dir}-dpo")


def main():
    parser = argparse.ArgumentParser(description="Grace LoRA fine-tuning")
    parser.add_argument("--dpo", action="store_true", help="Also run DPO after SFT")
    parser.add_argument("--dry-run", action="store_true",
                        help="Check data counts, don't train")
    args = parser.parse_args()

    # ── Pre-flight checks ──
    sft_count = count_examples(CLEAN_DIR)
    dpo_count = count_examples(DPO_DIR)
    print(f"Training data: {sft_count} SFT examples, {dpo_count} DPO pairs")

    if sft_count < MIN_EXAMPLES_SFT:
        print(f"⚠️ Not enough SFT data yet ({sft_count}/{MIN_EXAMPLES_SFT} minimum).")
        print("   Keep having conversations with Grace — the exporter runs nightly.")
        # In dry-run mode we still fall through to print the summary below.
        if not args.dry_run:
            return

    if args.dpo and dpo_count < MIN_EXAMPLES_DPO:
        print(f"⚠️ Not enough DPO pairs yet ({dpo_count}/{MIN_EXAMPLES_DPO} minimum). Skipping DPO.")
        args.dpo = False

    if args.dry_run:
        print("\n[DRY RUN] Pre-flight check complete. Run without --dry-run to train.")
        return

    # ── Load model ──
    from unsloth import FastLanguageModel

    timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
    output_dir = os.path.join(OUTPUT_BASE, f"grace-lora-{timestamp}")
    os.makedirs(output_dir, exist_ok=True)

    print("\nLoading Qwen3-8B with unsloth (4-bit, GPU 1)...")
    model, tokenizer = FastLanguageModel.from_pretrained(
        # Qwen3 ships no separate "-Instruct" repo; "Qwen/Qwen3-8B" is the
        # chat-capable model (matches the qwen-3 chat template used in SFT).
        model_name="Qwen/Qwen3-8B",
        max_seq_length=SFT_CONFIG["max_seq_length"],
        dtype=None,        # auto-detect
        load_in_4bit=True, # QLoRA — fits in 8GB VRAM
    )
    model = FastLanguageModel.get_peft_model(
        model,
        **LORA_CONFIG,
        bias="none",
        use_gradient_checkpointing="unsloth",
    )

    # ── SFT ──
    print("\n=== Stage 1: SFT with KL anchor ===")
    run_sft(model, tokenizer, output_dir, dry_run=args.dry_run)

    # ── DPO (optional) ──
    if args.dpo:
        print("\n=== Stage 2: DPO preference tuning ===")
        run_dpo(model, tokenizer, output_dir, dry_run=args.dry_run)

    # Output is already on NFS — no copy needed
    print(f"\nAdapter saved to NFS: {output_dir}")

    # ── Save run metadata ──
    meta = {
        "timestamp": timestamp,
        "output_dir": output_dir,
        "sft_examples": sft_count,
        "dpo_pairs": dpo_count if args.dpo else 0,
        "lora_config": LORA_CONFIG,
        "sft_config": SFT_CONFIG,
        "dpo_config": DPO_CONFIG if args.dpo else None,
    }
    with open(os.path.join(output_dir, "run-meta.json"), "w") as f:
        json.dump(meta, f, indent=2)

    print(f"\n✅ Done. Adapter at: {output_dir}")
    print("To use: load the adapter with llama.cpp --lora or swap into docker-compose")


if __name__ == "__main__":
    main()