feat: initial import of all helper scripts from ~/scripts/
- Training data pipeline: convert, export, extract, load-to-db
- Infra tooling: infra-audit, infra-gitea-link
- RAG pipeline: rag-ingest, rag-query
- Fine-tuning: finetune-lora, overnight-qwen3, install-unsloth
- Transcripts: export-transcripts
- Updated README with script index and token reduction strategy
This commit is contained in:
237
export-transcripts.py
Executable file
237
export-transcripts.py
Executable file
@@ -0,0 +1,237 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Verbatim transcript exporter for Grace training data.
|
||||
|
||||
Reads all OpenClaw session JSONL files and saves clean, full-conversation
|
||||
transcripts to NFS. Strips metadata envelopes from user messages but
|
||||
preserves all content verbatim.
|
||||
|
||||
Output format: ShareGPT JSONL (system + alternating human/gpt turns)
|
||||
Storage: /mnt/ai-storage/grace/training-data/transcripts/
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import glob
|
||||
from datetime import datetime, timezone
|
||||
|
||||
# Source: one JSONL file per OpenClaw session.
SESSIONS_DIR = os.path.expanduser("~/.openclaw/agents/main/sessions/")
# Destination on NFS for the exported ShareGPT transcripts.
OUTPUT_DIR = "/mnt/ai-storage/grace/training-data/transcripts"
# Tracks already-exported session ids so repeat runs are incremental.
STATE_FILE = os.path.expanduser("~/self-improving/transcript-state.json")
|
||||
|
||||
# System prompt prepended to every exported conversation as the ShareGPT
# "system" turn (see main()). Runtime data — do not reword casually.
SYSTEM_PROMPT = (
    "You are Grace, a Culture Mind-class AI assistant and trusted companion for Maxwell Burton. "
    "You are warm, direct, witty, and proactive. You support Maxwell's ADHD executive function, "
    "manage his homelab, help with job searching, and operate local AI infrastructure. "
    "You speak plainly — no corporate pleasantries, no hedging. "
    "You use exec and local tools proactively and return real results. Never fabricate output."
)
|
||||
|
||||
# Noise to strip from user turns (metadata envelopes, not content)
NOISE_PATTERNS = [
    # Injected metadata envelopes: everything up to the closing "}" line.
    re.compile(r'Conversation info \(untrusted metadata\):.*?^\s*\}\s*\n', re.DOTALL | re.MULTILINE),
    re.compile(r'Sender \(untrusted metadata\):.*?^\s*\}\s*\n', re.DOTALL | re.MULTILINE),
    # Memory-retrieval block injected into the prompt.
    re.compile(r'<relevant-memories>.*?</relevant-memories>\s*\n?', re.DOTALL),
    # Media-attachment placeholder.
    re.compile(r'\[media attached:.*?\]\s*\n?'),
    # Runtime hint appended after image attachments.
    re.compile(r'To send an image back, prefer the message tool.*?\n', re.DOTALL),
    # Fenced JSON blocks whose object starts with a "schema" key.
    re.compile(r'```json\s*\{\s*"schema"\s*:.*?\}\s*```\s*\n?', re.DOTALL),
    # Quoted-reply metadata; matches through two closing braces (nested object).
    re.compile(r'Replied message \(untrusted.*?\}\s*\n\s*\}\s*\n', re.DOTALL),
    # Banners and separators around queued user messages.
    re.compile(r'\[Queued messages while agent was busy\]\s*\n', re.DOTALL),
    re.compile(r'---\s*\nQueued #\d+\s*\n'),
    # Empty fenced code blocks left behind after the strips above.
    re.compile(r'^```\s*\n```\s*\n?', re.MULTILINE),
]
|
||||
|
||||
# Full turns to skip entirely
SKIP_EXACT = {"HEARTBEAT_OK", "NO_REPLY"}
# User turns injected by the session runtime rather than typed by a person.
SKIP_STARTSWITH = (
    "A new session was started via /new or /reset.",
    "Read HEARTBEAT.md if it exists",
    "Run your Session Startup",
)

# Skip assistant turns that are just internal narration with no value
SKIP_ASST_EXACT = {"HEARTBEAT_OK", "NO_REPLY"}
SKIP_ASST_STARTSWITH = (
    "✅ New session started",
)
|
||||
|
||||
|
||||
def clean_user(text: str) -> str:
    """Strip metadata-envelope noise from a user turn, keeping real content."""
    cleaned = text
    for pattern in NOISE_PATTERNS:
        cleaned = pattern.sub("", cleaned)
    # Collapse runs of blank lines, then drop leading whitespace/backticks
    # left over from removed fenced blocks.
    cleaned = re.sub(r'\n{3,}', '\n\n', cleaned)
    cleaned = re.sub(r'^[\s`]+', '', cleaned)
    return cleaned.strip()
|
||||
|
||||
|
||||
def get_text(content: list) -> str:
    """Join the non-empty textual pieces of a message content list.

    Accepts a mix of plain strings and {"type": "text", "text": ...} blocks;
    anything else (tool results, images, ...) is ignored.
    """
    pieces = []
    for item in content:
        if isinstance(item, dict):
            if item.get("type") != "text":
                continue
            piece = item.get("text", "").strip()
        elif isinstance(item, str):
            piece = item.strip()
        else:
            continue
        if piece:
            pieces.append(piece)
    return "\n".join(pieces).strip()
|
||||
|
||||
|
||||
def is_tool_result_msg(content: list) -> bool:
    """True if this user message is a tool result, not a human turn."""
    for block in content:
        if not isinstance(block, dict):
            continue
        # Both spellings appear in session logs.
        if block.get("type") in ("toolResult", "tool_result"):
            return True
    return False
|
||||
|
||||
|
||||
def _turn_from_message(msg: dict):
    """Convert one parsed session record into a {from, value} turn, or None.

    Returns None for anything that is not a conversational turn: missing
    role/content, tool-result messages, noise-only text, and known
    runtime boilerplate (see SKIP_* constants).
    """
    role = msg.get("message", {}).get("role")
    content = msg.get("message", {}).get("content", [])
    if not role or not content:
        return None

    if role == "user":
        # Tool results arrive with role "user"; they are not human turns.
        if is_tool_result_msg(content):
            return None
        text = clean_user(get_text(content))
        if not text or text in SKIP_EXACT:
            return None
        if any(text.startswith(s) for s in SKIP_STARTSWITH):
            return None
        return {"from": "human", "value": text}

    if role == "assistant":
        text = get_text(content)
        if not text or text in SKIP_ASST_EXACT:
            return None
        if any(text.startswith(s) for s in SKIP_ASST_STARTSWITH):
            return None
        return {"from": "gpt", "value": text}

    # Other roles are not part of the exported transcript.
    return None


def _enforce_alternation(turns: list) -> list:
    """Collapse consecutive assistant turns, keeping the last one.

    The later assistant turn is assumed to be the more complete one.
    Consecutive human turns are all kept (queued messages).
    """
    clean = []
    for turn in turns:
        if clean and clean[-1]["from"] == turn["from"] == "gpt":
            # Replace with the later (more complete) assistant turn
            clean[-1] = turn
        else:
            clean.append(turn)
    return clean


def extract_transcript(path: str) -> list:
    """
    Extract full verbatim conversation as list of {from, value} dicts.
    Preserves all turns — doesn't filter by quality or length.
    Only removes metadata noise and skips tool result messages.
    """
    turns = []

    try:
        # Session files are JSONL; read as UTF-8 explicitly so decoding does
        # not depend on the locale's default encoding.
        with open(path, encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    msg = json.loads(line)
                except Exception:
                    # Tolerate an occasional corrupt line rather than
                    # discarding the whole session.
                    continue

                if msg.get("type") != "message":
                    continue

                turn = _turn_from_message(msg)
                if turn is not None:
                    turns.append(turn)

    except Exception as e:
        print(f" Error reading {path}: {e}")
        return []

    return _enforce_alternation(turns)
|
||||
|
||||
|
||||
def load_state():
    """Load export state from STATE_FILE, or return a fresh default.

    Returns a dict with keys: "exported_sessions" (list of session ids),
    "last_run" (ISO timestamp or None), "total_turns" (int).
    """
    # EAFP instead of exists()+open(): avoids the check/use race and does
    # a single filesystem operation. Other errors (permissions, bad JSON)
    # still propagate, same as before.
    try:
        with open(STATE_FILE, encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError:
        return {"exported_sessions": [], "last_run": None, "total_turns": 0}
|
||||
|
||||
|
||||
def save_state(state):
    """Persist the export-state dict to STATE_FILE as pretty-printed JSON."""
    serialized = json.dumps(state, indent=2)
    with open(STATE_FILE, "w") as f:
        f.write(serialized)
|
||||
|
||||
|
||||
def main():
    """Export all not-yet-exported sessions to one timestamped ShareGPT JSONL.

    Incremental: session ids recorded in STATE_FILE are skipped on later
    runs. Sessions with fewer than 4 usable turns are marked exported but
    never written.
    """
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    state = load_state()
    # Membership tests via a set: the exported list grows without bound, and
    # a list scan per session would make this loop quadratic over time.
    already_exported = set(state["exported_sessions"])

    session_files = sorted(
        f for f in glob.glob(os.path.join(SESSIONS_DIR, "*.jsonl"))
        if ".reset." not in f  # skip archived pre-reset copies
    )

    new_examples = []
    new_sessions = []
    total_new_turns = 0

    for sf in session_files:
        sid = os.path.basename(sf).replace(".jsonl", "")
        if sid in already_exported:
            continue

        turns = extract_transcript(sf)

        # Need at least 4 turns (2 exchanges) to be useful
        if len(turns) < 4:
            print(f" {sid[:8]}: skipped ({len(turns)} turns)")
            # Mark it exported anyway so short sessions aren't re-scanned.
            state["exported_sessions"].append(sid)
            continue

        new_examples.append({
            "conversations": [{"from": "system", "value": SYSTEM_PROMPT}] + turns,
            "source": os.path.basename(sf),
            "turn_count": len(turns),
            "exported_at": datetime.now(timezone.utc).isoformat(),
        })
        new_sessions.append(sid)
        total_new_turns += len(turns)
        print(f" {sid[:8]}: {len(turns)} turns ✓")

    if not new_examples:
        print("No new sessions to export.")
        # Still persist the short-session skips recorded above.
        state["exported_sessions"].extend(new_sessions)
        save_state(state)
        return

    timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
    output_file = os.path.join(OUTPUT_DIR, f"grace_transcript_{timestamp}.jsonl")

    # One JSON object per line (JSONL); explicit UTF-8 for portability.
    with open(output_file, "w", encoding="utf-8") as f:
        for ex in new_examples:
            f.write(json.dumps(ex) + "\n")

    state["exported_sessions"].extend(new_sessions)
    state["last_run"] = datetime.now(timezone.utc).isoformat()
    state["total_turns"] = state.get("total_turns", 0) + total_new_turns
    save_state(state)

    print(f"\nWrote {len(new_examples)} transcripts ({total_new_turns} turns) → {output_file}")
    print(f"Total turns to date: {state['total_turns']}")
|
||||
|
||||
|
||||
# Script entry point.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user