#!/usr/bin/env python3
"""
Convert raw Grace session exports to clean unsloth/axolotl-ready JSONL.

Input:   ~/training-data/jsonl/grace_training_*.jsonl  (ShareGPT format, raw)
Output:  ~/training-data/cleaned/grace_clean_<UTC timestamp>.jsonl  (clean ShareGPT)
         ~/training-data/dpo/grace_dpo_<UTC timestamp>.jsonl        (DPO pairs, if any)

Storage: all paths live on NFS (/mnt/ai-storage), so output is written in
place — the former SCP copy step is a no-op shim (see scp_file).
"""

import json
import os
import re
import glob
import subprocess  # NOTE(review): unused since SCP became a no-op; kept to avoid breaking anything outside this view
from datetime import datetime, timezone

NFS_BASE = "/mnt/ai-storage/grace/training-data"
INPUT_DIR = os.path.join(NFS_BASE, "jsonl")
CLEAN_DIR = os.path.join(NFS_BASE, "cleaned")
DPO_DIR = os.path.join(NFS_BASE, "dpo")
STATE_FILE = os.path.expanduser("~/self-improving/convert-state.json")

# Noise patterns stripped from user turns before quality filtering.
NOISE_PATTERNS = [
    re.compile(r'Conversation info \(untrusted metadata\):.*?^\s*\}\s*\n', re.DOTALL | re.MULTILINE),
    re.compile(r'Sender \(untrusted metadata\):.*?^\s*\}\s*\n', re.DOTALL | re.MULTILINE),
    # BUGFIX: this entry was the bare pattern r'.*?', which only matches the
    # empty string and therefore stripped nothing.  The tag markup was
    # evidently lost; restored as a tool-call block.
    # NOTE(review): confirm the original tag name was <tool_call>.
    re.compile(r'<tool_call>.*?</tool_call>', re.DOTALL),
    re.compile(r'\[media attached:.*?\]'),
    re.compile(r'```json\s*\{.*?"schema".*?\}\s*```', re.DOTALL),
]

# Minimum quality thresholds
MIN_TURNS = 2               # minimum user/assistant exchanges per example
MIN_ASSISTANT_CHARS = 80    # skip very short assistant replies
MAX_ASSISTANT_CHARS = 8000  # skip extremely long tool-dump responses

# Substrings that indicate a low-quality assistant turn.
# BUGFIX: this list previously contained the empty string "" (the angle
# brackets of a leaked tool-call tag were stripped from the source).  Since
# "" is a substring of EVERY string, is_good_assistant_turn() rejected every
# turn and the pipeline produced zero clean examples.
BAD_PATTERNS = [
    "HEARTBEAT_OK",
    "NO_REPLY",
    "<tool_call>",   # raw tool call leaked into response
    "Let me check",  # placeholder with no follow-through
]

SYSTEM_PROMPT = (
    "You are Grace, a Culture Mind-class AI assistant and trusted companion for Maxwell. "
    "You are warm, direct, witty, and proactive. You support Maxwell's ADHD executive function, "
    "manage his homelab, help with job searching, and operate local AI infrastructure. "
    "You speak plainly — no corporate pleasantries, no hedging. "
    "You use exec and local tools proactively and return real results. Never fabricate output."
)


def load_state():
    """Return conversion state from STATE_FILE, or a fresh default state."""
    if os.path.exists(STATE_FILE):
        with open(STATE_FILE) as f:
            return json.load(f)
    return {"converted_files": [], "total_clean": 0, "total_dpo": 0}


def save_state(state):
    """Persist conversion state to STATE_FILE (creating its directory if needed)."""
    # ROBUSTNESS: original crashed with FileNotFoundError when
    # ~/self-improving/ did not exist yet.
    os.makedirs(os.path.dirname(STATE_FILE), exist_ok=True)
    with open(STATE_FILE, "w") as f:
        json.dump(state, f, indent=2)


def clean_text(text: str) -> str:
    """Strip metadata noise from a user turn."""
    for pattern in NOISE_PATTERNS:
        text = pattern.sub("", text)
    # Collapse excess whitespace left behind by the removals.
    text = re.sub(r'\n{3,}', '\n\n', text).strip()
    return text


def is_good_assistant_turn(text: str) -> bool:
    """Return True if an assistant turn passes length and content filters."""
    if len(text) < MIN_ASSISTANT_CHARS:
        return False
    if len(text) > MAX_ASSISTANT_CHARS:
        return False
    # NOTE: substring match — e.g. "Let me check" rejects the whole turn even
    # if the reply does follow through.  Intentionally conservative.
    return not any(bad in text for bad in BAD_PATTERNS)


def extract_dpo_pairs(conversations: list) -> list:
    """
    Look for correction signals in the conversation.

    A correction pair = assistant turn followed by a user message that signals
    disagreement, then a better assistant turn.  The earlier assistant turn
    becomes "rejected", the later one "chosen".

    Signals: "no,", "actually", "that's wrong", "you should", "remember"
    """
    correction_signals = ["no,", "actually", "that's wrong", "you should have",
                          "remember that", "i told you", "stop doing", "wrong"]
    pairs = []
    convs = [c for c in conversations if c["from"] in ("human", "gpt")]

    for i in range(1, len(convs) - 1):
        if convs[i]["from"] != "human":
            continue
        user_text = convs[i]["value"].lower()
        if not any(sig in user_text for sig in correction_signals):
            continue
        # Require the pattern gpt → correcting-human → gpt around index i.
        if i < 1 or convs[i - 1]["from"] != "gpt":
            continue
        if i + 1 >= len(convs) or convs[i + 1]["from"] != "gpt":
            continue

        rejected = convs[i - 1]["value"]
        chosen = convs[i + 1]["value"]
        # Only worth it if the chosen is meaningfully different.
        if len(chosen) > 50 and chosen != rejected:
            pairs.append({
                # The human turn that prompted the rejected answer, if visible.
                "prompt": convs[i - 2]["value"] if i >= 2 else "",
                "chosen": chosen,
                "rejected": rejected,
                "context": convs[i]["value"],  # the correction itself
            })
    return pairs


def _clean_turns(raw_convs: list) -> list:
    """Drop system turns, scrub noise from human turns, filter bad gpt turns."""
    cleaned = []
    for turn in raw_convs:
        role = turn.get("from")
        if role == "human":
            text = clean_text(turn["value"])
            if len(text) < 5:  # noise-only turn — nothing left after cleaning
                continue
            cleaned.append({"from": "human", "value": text})
        elif role == "gpt":
            text = turn["value"].strip()
            if not is_good_assistant_turn(text):
                continue
            cleaned.append({"from": "gpt", "value": text})
        # system (and any other) roles are dropped here.
    return cleaned


def _pair_turns(cleaned: list) -> list:
    """Keep only strictly alternating human→gpt pairs, in order."""
    valid_turns = []
    i = 0
    while i < len(cleaned) - 1:
        if cleaned[i]["from"] == "human" and cleaned[i + 1]["from"] == "gpt":
            valid_turns.append(cleaned[i])
            valid_turns.append(cleaned[i + 1])
            i += 2
        else:
            i += 1  # orphaned turn — skip it and resync
    return valid_turns


def process_file(path: str):
    """
    Process one raw export JSONL file.

    Each line is a ShareGPT example.  Malformed lines are skipped; surviving
    examples get the canonical SYSTEM_PROMPT prepended.

    Returns (clean_examples, dpo_pairs).
    """
    clean_examples = []
    dpo_pairs = []

    with open(path) as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                example = json.loads(line)
            except json.JSONDecodeError:
                continue  # best-effort: skip corrupt lines rather than abort

            raw_convs = example.get("conversations", [])
            valid_turns = _pair_turns(_clean_turns(raw_convs))

            # MIN_TURNS counts exchanges; valid_turns holds 2 entries each.
            if len(valid_turns) < MIN_TURNS * 2:
                continue

            clean_examples.append({
                "conversations": [{"from": "system", "value": SYSTEM_PROMPT}] + valid_turns,
                "source": os.path.basename(path),
                "converted_at": datetime.now(timezone.utc).isoformat(),
            })

            # Extract DPO pairs from this example.
            dpo_pairs.extend(extract_dpo_pairs(valid_turns))

    return clean_examples, dpo_pairs


def scp_file(local_path: str, remote_subdir: str):
    """No-op shim: data is on NFS and already written in place, so no SCP
    is needed.  Kept so callers (and any external users) are unchanged."""
    print(f" Written to NFS: {local_path}")


def main():
    """Convert every not-yet-processed raw export and update the state file."""
    os.makedirs(CLEAN_DIR, exist_ok=True)
    os.makedirs(DPO_DIR, exist_ok=True)
    state = load_state()

    input_files = sorted(glob.glob(os.path.join(INPUT_DIR, "*.jsonl")))
    new_files = [p for p in input_files
                 if os.path.basename(p) not in state["converted_files"]]
    if not new_files:
        print("No new files to convert.")
        return

    all_clean = []
    all_dpo = []
    for path in new_files:
        print(f"Processing: {os.path.basename(path)}")
        clean, dpo = process_file(path)
        print(f" → {len(clean)} clean examples, {len(dpo)} DPO pairs")
        all_clean.extend(clean)
        all_dpo.extend(dpo)
        state["converted_files"].append(os.path.basename(path))

    timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")

    if all_clean:
        clean_file = os.path.join(CLEAN_DIR, f"grace_clean_{timestamp}.jsonl")
        # BUGFIX: original reused the loop variable `f` as the file handle.
        with open(clean_file, "w") as fh:
            for ex in all_clean:
                fh.write(json.dumps(ex) + "\n")
        print(f"\nWrote {len(all_clean)} clean examples → {clean_file}")
        scp_file(clean_file, "cleaned")
        state["total_clean"] = state.get("total_clean", 0) + len(all_clean)

    if all_dpo:
        dpo_file = os.path.join(DPO_DIR, f"grace_dpo_{timestamp}.jsonl")
        with open(dpo_file, "w") as fh:
            for pair in all_dpo:
                fh.write(json.dumps(pair) + "\n")
        print(f"Wrote {len(all_dpo)} DPO pairs → {dpo_file}")
        scp_file(dpo_file, "dpo")
        state["total_dpo"] = state.get("total_dpo", 0) + len(all_dpo)

    save_state(state)
    # ROBUSTNESS: .get() guards against a legacy state file missing the totals.
    print(f"\nTotals to date: {state.get('total_clean', 0)} clean, "
          f"{state.get('total_dpo', 0)} DPO pairs")


if __name__ == "__main__":
    main()