feat: initial import of all helper scripts from ~/scripts/
- Training data pipeline: convert, export, extract, load-to-db
- Infra tooling: infra-audit, infra-gitea-link
- RAG pipeline: rag-ingest, rag-query
- Fine-tuning: finetune-lora, overnight-qwen3, install-unsloth
- Transcripts: export-transcripts
- Updated README with script index and token-reduction strategy
This commit is contained in:
240
convert-training-data.py
Executable file
240
convert-training-data.py
Executable file
@@ -0,0 +1,240 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Convert raw Grace session exports to clean unsloth/axolotl-ready JSONL.
|
||||
|
||||
Input: ~/training-data/jsonl/grace_training_*.jsonl (ShareGPT format, raw)
|
||||
Output: ~/training-data/cleaned/grace_clean_YYYYMMDD.jsonl (clean ShareGPT)
|
||||
~/training-data/dpo/grace_dpo_YYYYMMDD.jsonl (DPO pairs, if any)
|
||||
|
||||
Storage: copies to grace@192.168.20.87:~/training-data/cleaned/
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import glob
|
||||
import subprocess
|
||||
from datetime import datetime, timezone
|
||||
|
||||
# All training data lives on the NFS mount shared with the GPU host.
NFS_BASE = "/mnt/ai-storage/grace/training-data"
INPUT_DIR = os.path.join(NFS_BASE, "jsonl")    # raw ShareGPT exports (input)
CLEAN_DIR = os.path.join(NFS_BASE, "cleaned")  # cleaned SFT output
DPO_DIR = os.path.join(NFS_BASE, "dpo")        # DPO pair output
# Tracks which input files were already converted, plus running totals.
STATE_FILE = os.path.expanduser("~/self-improving/convert-state.json")

# Noise patterns to strip from user turns
NOISE_PATTERNS = [
    # Injected metadata blocks terminated by a closing brace on its own line.
    re.compile(r'Conversation info \(untrusted metadata\):.*?^\s*\}\s*\n', re.DOTALL | re.MULTILINE),
    re.compile(r'Sender \(untrusted metadata\):.*?^\s*\}\s*\n', re.DOTALL | re.MULTILINE),
    # Retrieved-memory context injected into the prompt.
    re.compile(r'<relevant-memories>.*?</relevant-memories>', re.DOTALL),
    # Attachment placeholders.
    re.compile(r'\[media attached:.*?\]'),
    # Fenced JSON blobs containing a "schema" key.
    re.compile(r'```json\s*\{.*?"schema".*?\}\s*```', re.DOTALL),
]

# Minimum quality thresholds
MIN_TURNS = 2  # minimum user/assistant exchanges
MIN_ASSISTANT_CHARS = 80  # skip very short assistant replies
MAX_ASSISTANT_CHARS = 8000  # skip extremely long tool-dump responses

# Strings that indicate a low-quality assistant turn
BAD_PATTERNS = [
    "HEARTBEAT_OK", "NO_REPLY",
    "<tool_call>",  # raw tool call leaked into response
    "Let me check",  # placeholder with no follow-through
]

# System prompt injected as the first turn of every cleaned example
# (any system turn present in the raw export is discarded in process_file).
SYSTEM_PROMPT = (
    "You are Grace, a Culture Mind-class AI assistant and trusted companion for Maxwell. "
    "You are warm, direct, witty, and proactive. You support Maxwell's ADHD executive function, "
    "manage his homelab, help with job searching, and operate local AI infrastructure. "
    "You speak plainly — no corporate pleasantries, no hedging. "
    "You use exec and local tools proactively and return real results. Never fabricate output."
)
|
||||
|
||||
|
||||
def load_state():
    """Load the persisted conversion state, or return a fresh default."""
    try:
        with open(STATE_FILE) as fh:
            return json.load(fh)
    except FileNotFoundError:
        # First run: nothing converted yet.
        return {"converted_files": [], "total_clean": 0, "total_dpo": 0}
|
||||
|
||||
|
||||
def save_state(state):
    """Persist the conversion state to STATE_FILE as indented JSON."""
    payload = json.dumps(state, indent=2)
    with open(STATE_FILE, "w") as fh:
        fh.write(payload)
|
||||
|
||||
|
||||
def clean_text(text: str) -> str:
    """Strip metadata noise from a user turn and tidy residual whitespace."""
    stripped = text
    for noise in NOISE_PATTERNS:
        stripped = noise.sub("", stripped)
    # Squeeze runs of 3+ newlines down to one blank line, trim the edges.
    return re.sub(r'\n{3,}', '\n\n', stripped).strip()
|
||||
|
||||
|
||||
def is_good_assistant_turn(text: str) -> bool:
    """Return True when an assistant reply passes length and content filters."""
    length = len(text)
    if not (MIN_ASSISTANT_CHARS <= length <= MAX_ASSISTANT_CHARS):
        return False
    # Reject turns containing any known low-quality marker.
    return not any(marker in text for marker in BAD_PATTERNS)
|
||||
|
||||
|
||||
def extract_dpo_pairs(conversations: list) -> list:
    """Mine DPO (chosen/rejected) pairs from correction exchanges.

    A correction pair is the three-turn sequence:
        gpt turn (rejected) -> human turn containing a correction signal
        -> gpt turn (chosen)
    The human turn two positions before the correction (if any) is used
    as the prompt, otherwise the prompt is "".

    Signals: "no,", "actually", "that's wrong", "you should", "remember", ...

    Returns a list of dicts with keys "prompt", "chosen", "rejected" and
    "context" (the correction message itself).
    """
    correction_signals = ["no,", "actually", "that's wrong", "you should have",
                          "remember that", "i told you", "stop doing", "wrong"]
    pairs = []
    # Keep only real dialogue turns; .get tolerates malformed entries that
    # lack a "from" key (the old c["from"] raised KeyError on those).
    convs = [c for c in conversations if c.get("from") in ("human", "gpt")]

    # i indexes the candidate correction (human) turn. The loop bounds
    # guarantee convs[i-1] and convs[i+1] exist, so the previous
    # `if i < 1` / `if i + 1 >= len(convs)` guards were dead code.
    for i in range(1, len(convs) - 1):
        if convs[i]["from"] != "human":
            continue
        user_text = convs[i]["value"].lower()
        if not any(sig in user_text for sig in correction_signals):
            continue
        if convs[i-1]["from"] != "gpt" or convs[i+1]["from"] != "gpt":
            continue

        rejected = convs[i-1]["value"]
        chosen = convs[i+1]["value"]

        # Only worth it if the chosen is meaningfully different
        if len(chosen) > 50 and chosen != rejected:
            pairs.append({
                "prompt": convs[i-2]["value"] if i >= 2 else "",
                "chosen": chosen,
                "rejected": rejected,
                "context": convs[i]["value"],  # the correction itself
            })
    return pairs
|
||||
|
||||
|
||||
def process_file(path: str):
    """Convert one raw export JSONL file into training examples.

    Each input line is a JSON object with a ShareGPT-style
    "conversations" list. User turns are noise-stripped, assistant turns
    are quality-filtered, and only strict human->gpt pairs are kept.
    Examples with fewer than MIN_TURNS exchanges are dropped.

    Returns (clean_examples, dpo_pairs).
    """
    clean_examples = []
    dpo_pairs = []

    # Explicit encoding: exports are JSON text; don't depend on the
    # platform's locale default.
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                example = json.loads(line)
            except json.JSONDecodeError:
                continue  # tolerate truncated/corrupt lines

            raw_convs = example.get("conversations", [])

            # Drop any embedded system prompt; we inject our own below.
            turns = [c for c in raw_convs if c.get("from") != "system"]

            # Clean user turns; filter assistant turns for quality.
            cleaned = []
            for turn in turns:
                # .get avoids KeyError on malformed entries missing "from"
                # (the rest of this function already used .get defensively).
                role = turn.get("from")
                if role == "human":
                    text = clean_text(turn["value"])
                    if len(text) < 5:
                        continue  # nothing useful left after stripping
                    cleaned.append({"from": "human", "value": text})
                elif role == "gpt":
                    text = turn["value"].strip()
                    if not is_good_assistant_turn(text):
                        continue
                    cleaned.append({"from": "gpt", "value": text})

            # Build valid human/gpt pairs only (strict alternation).
            valid_turns = []
            i = 0
            while i < len(cleaned) - 1:
                if cleaned[i]["from"] == "human" and cleaned[i+1]["from"] == "gpt":
                    valid_turns.append(cleaned[i])
                    valid_turns.append(cleaned[i+1])
                    i += 2
                else:
                    i += 1

            # Each exchange is 2 turns, hence the * 2.
            if len(valid_turns) < MIN_TURNS * 2:
                continue

            clean_examples.append({
                "conversations": [{"from": "system", "value": SYSTEM_PROMPT}] + valid_turns,
                "source": os.path.basename(path),
                "converted_at": datetime.now(timezone.utc).isoformat(),
            })

            # Extract DPO pairs from this example
            dpo_pairs.extend(extract_dpo_pairs(valid_turns))

    return clean_examples, dpo_pairs
|
||||
|
||||
|
||||
def scp_file(local_path: str, remote_subdir: str):
    """Legacy copy hook: output now lands directly on NFS, so no transfer
    happens here. `remote_subdir` is kept for call-site compatibility and
    is ignored.
    """
    print(" Written to NFS: {}".format(local_path))
|
||||
|
||||
|
||||
def main():
    """Convert any not-yet-processed raw exports and update state.

    Scans INPUT_DIR for *.jsonl files not yet recorded in the state file,
    converts them, writes timestamped clean/DPO output files, and
    persists the updated running totals.
    """
    os.makedirs(CLEAN_DIR, exist_ok=True)
    os.makedirs(DPO_DIR, exist_ok=True)

    state = load_state()
    input_files = sorted(glob.glob(os.path.join(INPUT_DIR, "*.jsonl")))
    new_files = [f for f in input_files if os.path.basename(f) not in state["converted_files"]]

    if not new_files:
        print("No new files to convert.")
        return

    all_clean = []
    all_dpo = []

    for f in new_files:
        print(f"Processing: {os.path.basename(f)}")
        clean, dpo = process_file(f)
        print(f" → {len(clean)} clean examples, {len(dpo)} DPO pairs")
        all_clean.extend(clean)
        all_dpo.extend(dpo)
        # Record the file even when it yields nothing, so it is never
        # reprocessed on later runs.
        state["converted_files"].append(os.path.basename(f))

    timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")

    if all_clean:
        clean_file = os.path.join(CLEAN_DIR, f"grace_clean_{timestamp}.jsonl")
        with open(clean_file, "w") as f:
            for ex in all_clean:
                f.write(json.dumps(ex) + "\n")
        print(f"\nWrote {len(all_clean)} clean examples → {clean_file}")
        scp_file(clean_file, "cleaned")
        state["total_clean"] = state.get("total_clean", 0) + len(all_clean)

    if all_dpo:
        dpo_file = os.path.join(DPO_DIR, f"grace_dpo_{timestamp}.jsonl")
        with open(dpo_file, "w") as f:
            for pair in all_dpo:
                f.write(json.dumps(pair) + "\n")
        print(f"Wrote {len(all_dpo)} DPO pairs → {dpo_file}")
        scp_file(dpo_file, "dpo")
        state["total_dpo"] = state.get("total_dpo", 0) + len(all_dpo)

    save_state(state)
    # .get here, not direct indexing: a state file written by an older
    # version may lack these keys, and the branches above only set them
    # when this run produced output — direct indexing could KeyError.
    print(f"\nTotals to date: {state.get('total_clean', 0)} clean, "
          f"{state.get('total_dpo', 0)} DPO pairs")
|
||||
|
||||
|
||||
# Script entry point: convert new raw exports when run directly.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user