- Training data pipeline: convert, export, extract, load-to-db - Infra tooling: infra-audit, infra-gitea-link - RAG pipeline: rag-ingest, rag-query - Fine-tuning: finetune-lora, overnight-qwen3, install-unsloth - Transcripts: export-transcripts - Updated README with script index and token reduction strategy
241 lines
8.1 KiB
Python
Executable File
#!/usr/bin/env python3
"""
Convert raw Grace session exports to clean unsloth/axolotl-ready JSONL.

Input:   ~/training-data/jsonl/grace_training_*.jsonl   (ShareGPT format, raw)
Output:  ~/training-data/cleaned/grace_clean_YYYYMMDD.jsonl  (clean ShareGPT)
         ~/training-data/dpo/grace_dpo_YYYYMMDD.jsonl        (DPO pairs, if any)

Storage: output is written directly to the NFS mount (see scp_file: the
         former SCP copy to grace@192.168.20.87:~/training-data/cleaned/
         is no longer performed).

The script is incremental: already-converted input files are recorded in
STATE_FILE and skipped on subsequent runs.
"""

import json
import os
import re
import glob
import subprocess  # NOTE(review): unused since the SCP step was retired — confirm safe to drop
from datetime import datetime, timezone
|
|
|
|
# All pipeline I/O lives under this shared NFS mount.
NFS_BASE = "/mnt/ai-storage/grace/training-data"
INPUT_DIR = os.path.join(NFS_BASE, "jsonl")    # raw ShareGPT export files
CLEAN_DIR = os.path.join(NFS_BASE, "cleaned")  # cleaned SFT-ready JSONL
DPO_DIR = os.path.join(NFS_BASE, "dpo")        # extracted DPO preference pairs
# Records which input files were already converted plus running totals.
STATE_FILE = os.path.expanduser("~/self-improving/convert-state.json")
|
|
|
|
# Noise patterns to strip from user turns
NOISE_PATTERNS = [
    # Injected metadata headers; each runs until a closing brace alone on a line.
    re.compile(r'Conversation info \(untrusted metadata\):.*?^\s*\}\s*\n', re.DOTALL | re.MULTILINE),
    re.compile(r'Sender \(untrusted metadata\):.*?^\s*\}\s*\n', re.DOTALL | re.MULTILINE),
    # Memory-recall block injected by the agent runtime.
    re.compile(r'<relevant-memories>.*?</relevant-memories>', re.DOTALL),
    # Attachment placeholders, e.g. "[media attached: photo.jpg]".
    re.compile(r'\[media attached:.*?\]'),
    # Fenced JSON blobs that carry a "schema" key (tool/config payloads).
    re.compile(r'```json\s*\{.*?"schema".*?\}\s*```', re.DOTALL),
]
|
|
|
|
# Minimum quality thresholds
MIN_TURNS = 2               # minimum user/assistant exchanges per conversation
MIN_ASSISTANT_CHARS = 80    # skip very short assistant replies
MAX_ASSISTANT_CHARS = 8000  # skip extremely long tool-dump responses

# Strings that indicate a low-quality assistant turn
BAD_PATTERNS = [
    "HEARTBEAT_OK", "NO_REPLY",  # agent-loop control tokens, not real replies
    "<tool_call>",   # raw tool call leaked into response
    "Let me check",  # placeholder with no follow-through
]
|
|
|
|
# Injected as the system turn of every cleaned conversation; process_file
# strips whatever system turn the raw export carried and prepends this.
SYSTEM_PROMPT = (
    "You are Grace, a Culture Mind-class AI assistant and trusted companion for Maxwell. "
    "You are warm, direct, witty, and proactive. You support Maxwell's ADHD executive function, "
    "manage his homelab, help with job searching, and operate local AI infrastructure. "
    "You speak plainly — no corporate pleasantries, no hedging. "
    "You use exec and local tools proactively and return real results. Never fabricate output."
)
|
|
|
|
|
|
def load_state():
    """Load incremental-conversion state from STATE_FILE.

    Returns a dict with keys:
      converted_files: list of input basenames already processed
      total_clean / total_dpo: running totals across all runs

    Keys missing from an older state file are backfilled with defaults,
    because main() indexes state["converted_files"] and the totals directly.
    """
    defaults = {"converted_files": [], "total_clean": 0, "total_dpo": 0}
    try:
        # EAFP: avoids the exists()/open() race of the LBYL form.
        with open(STATE_FILE) as f:
            state = json.load(f)
    except FileNotFoundError:
        return defaults
    for key, value in defaults.items():
        state.setdefault(key, value)
    return state
|
|
|
|
|
|
def save_state(state):
    """Persist the conversion-state dict to STATE_FILE as indented JSON."""
    serialized = json.dumps(state, indent=2)
    with open(STATE_FILE, "w") as fh:
        fh.write(serialized)
|
|
|
|
|
|
def clean_text(text: str) -> str:
    """Strip metadata noise from a user turn and tidy the whitespace."""
    stripped = text
    for noise in NOISE_PATTERNS:
        stripped = noise.sub("", stripped)
    # Squeeze runs of 3+ newlines down to one blank line, trim the ends.
    return re.sub(r'\n{3,}', '\n\n', stripped).strip()
|
|
|
|
|
|
def is_good_assistant_turn(text: str) -> bool:
    """Return True only for assistant turns within the configured length
    bounds and free of every known low-quality marker."""
    if not MIN_ASSISTANT_CHARS <= len(text) <= MAX_ASSISTANT_CHARS:
        return False
    # A single bad marker disqualifies the whole turn.
    return not any(marker in text for marker in BAD_PATTERNS)
|
|
|
|
|
|
def extract_dpo_pairs(conversations: list) -> list:
    """Mine DPO (chosen/rejected) pairs from correction exchanges.

    A correction pair is: an assistant turn, followed by a user message
    that signals disagreement, followed by a better assistant turn. The
    earlier assistant turn becomes "rejected", the later one "chosen",
    and the user turn before the rejected answer (if any) is the prompt.

    Signals (substring match, case-insensitive): "no,", "actually",
    "that's wrong", "you should have", "remember that", "i told you",
    "stop doing", "wrong".

    Args:
        conversations: ShareGPT-style turn dicts with "from"/"value" keys.
    Returns:
        List of dicts with keys: prompt, chosen, rejected, context.
    """
    correction_signals = ["no,", "actually", "that's wrong", "you should have",
                          "remember that", "i told you", "stop doing", "wrong"]
    pairs = []
    convs = [c for c in conversations if c["from"] in ("human", "gpt")]

    # Range keeps i-1 and i+1 in bounds, so no extra boundary checks needed.
    for i in range(1, len(convs) - 1):
        if convs[i]["from"] != "human":
            continue
        user_text = convs[i]["value"].lower()
        if not any(sig in user_text for sig in correction_signals):
            continue
        # Need assistant turns on both sides of the correction message.
        if convs[i - 1]["from"] != "gpt":
            continue
        if convs[i + 1]["from"] != "gpt":
            continue

        rejected = convs[i - 1]["value"]
        chosen = convs[i + 1]["value"]

        # Only worth keeping if the retry is substantial and actually differs.
        if len(chosen) > 50 and chosen != rejected:
            pairs.append({
                "prompt": convs[i - 2]["value"] if i >= 2 else "",
                "chosen": chosen,
                "rejected": rejected,
                "context": convs[i]["value"],  # the correction itself
            })
    return pairs
|
|
|
|
|
|
def process_file(path: str):
    """
    Process one raw export JSONL file (one ShareGPT JSON object per line).

    Per example:
      1. Drop the original system turn (a fresh SYSTEM_PROMPT is injected).
      2. Clean user turns (clean_text) and filter assistant turns
         (is_good_assistant_turn).
      3. Keep only strictly alternating human->gpt pairs, in order.
      4. Discard conversations with fewer than MIN_TURNS exchanges.
      5. Mine DPO pairs from the surviving turns.

    Returns (clean_examples, dpo_pairs).
    """
    clean_examples = []
    dpo_pairs = []

    with open(path) as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                example = json.loads(line)
            except json.JSONDecodeError:
                # Skip malformed lines rather than failing the whole file.
                continue

            raw_convs = example.get("conversations", [])

            # Separate system prompt from turns
            turns = [c for c in raw_convs if c.get("from") != "system"]

            # Clean user turns
            cleaned = []
            for turn in turns:
                if turn["from"] == "human":
                    text = clean_text(turn["value"])
                    # Noise stripping can leave an essentially empty message.
                    if len(text) < 5:
                        continue
                    cleaned.append({"from": "human", "value": text})
                elif turn["from"] == "gpt":
                    text = turn["value"].strip()
                    if not is_good_assistant_turn(text):
                        continue
                    cleaned.append({"from": "gpt", "value": text})
                # NOTE(review): any other role (e.g. tool turns) is dropped silently.

            # Build valid human/gpt pairs only
            valid_turns = []
            i = 0
            while i < len(cleaned) - 1:
                if cleaned[i]["from"] == "human" and cleaned[i+1]["from"] == "gpt":
                    valid_turns.append(cleaned[i])
                    valid_turns.append(cleaned[i+1])
                    i += 2
                else:
                    # Misaligned turn (e.g. a filtered-out reply): skip one, resync.
                    i += 1

            # MIN_TURNS counts exchanges; valid_turns has 2 entries per exchange.
            if len(valid_turns) < MIN_TURNS * 2:
                continue

            clean_examples.append({
                "conversations": [{"from": "system", "value": SYSTEM_PROMPT}] + valid_turns,
                "source": os.path.basename(path),
                "converted_at": datetime.now(timezone.utc).isoformat(),
            })

            # Extract DPO pairs from this example
            dpo_pairs.extend(extract_dpo_pairs(valid_turns))

    return clean_examples, dpo_pairs
|
|
|
|
|
|
def scp_file(local_path: str, remote_subdir: str):
    """Legacy hook from the SCP era; output already lands on NFS directly.

    Kept so callers need not change; it only reports the final location.
    """
    message = f" Written to NFS: {local_path}"
    print(message)
|
|
|
|
|
|
def main():
    """Convert all new raw export files, write clean/DPO JSONL, update state.

    Incremental: input files whose basename is already recorded in the state
    are skipped; totals accumulate across runs.
    """
    os.makedirs(CLEAN_DIR, exist_ok=True)
    os.makedirs(DPO_DIR, exist_ok=True)

    state = load_state()
    input_files = sorted(glob.glob(os.path.join(INPUT_DIR, "*.jsonl")))
    new_files = [f for f in input_files if os.path.basename(f) not in state["converted_files"]]

    if not new_files:
        print("No new files to convert.")
        return

    all_clean = []
    all_dpo = []

    for f in new_files:
        print(f"Processing: {os.path.basename(f)}")
        clean, dpo = process_file(f)
        print(f" → {len(clean)} clean examples, {len(dpo)} DPO pairs")
        all_clean.extend(clean)
        all_dpo.extend(dpo)
        # NOTE(review): file is marked converted before output is written; a
        # crash below would skip these files next run — confirm acceptable.
        state["converted_files"].append(os.path.basename(f))

    # One timestamped output file per run, shared by clean and DPO outputs.
    timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")

    if all_clean:
        clean_file = os.path.join(CLEAN_DIR, f"grace_clean_{timestamp}.jsonl")
        with open(clean_file, "w") as f:
            for ex in all_clean:
                f.write(json.dumps(ex) + "\n")
        print(f"\nWrote {len(all_clean)} clean examples → {clean_file}")
        scp_file(clean_file, "cleaned")
        state["total_clean"] = state.get("total_clean", 0) + len(all_clean)

    if all_dpo:
        dpo_file = os.path.join(DPO_DIR, f"grace_dpo_{timestamp}.jsonl")
        with open(dpo_file, "w") as f:
            for pair in all_dpo:
                f.write(json.dumps(pair) + "\n")
        print(f"Wrote {len(all_dpo)} DPO pairs → {dpo_file}")
        scp_file(dpo_file, "dpo")
        state["total_dpo"] = state.get("total_dpo", 0) + len(all_dpo)

    # Persist progress only after all outputs were written successfully.
    save_state(state)
    print(f"\nTotals to date: {state['total_clean']} clean, {state['total_dpo']} DPO pairs")
|
|
# Script entry point: convert any new raw exports, then persist state.
if __name__ == "__main__":
    main()
|