feat: initial import of all helper scripts from ~/scripts/
- Training data pipeline: convert, export, extract, load-to-db - Infra tooling: infra-audit, infra-gitea-link - RAG pipeline: rag-ingest, rag-query - Fine-tuning: finetune-lora, overnight-qwen3, install-unsloth - Transcripts: export-transcripts - Updated README with script index and token reduction strategy
This commit is contained in:
270
extract-sessions.py
Normal file
270
extract-sessions.py
Normal file
@@ -0,0 +1,270 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Improved session extractor that handles tool-interleaved conversations.
|
||||
|
||||
Session structure in OpenClaw JSONL:
|
||||
user: question (text)
|
||||
assistant: narration + toolCall blocks <- intermediate, not training data
|
||||
user: toolResult blocks <- intermediate
|
||||
assistant: final answer (text) <- THIS is what we want
|
||||
|
||||
Strategy: for each user text turn, find the LAST assistant text turn
|
||||
before the next user text turn. That's the real response.
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import glob
|
||||
from datetime import datetime, timezone
|
||||
|
||||
# Noise patterns to strip from raw user messages
NOISE_PATTERNS = [
    re.compile(r'Conversation info \(untrusted metadata\):.*?^\s*\}\s*\n', re.DOTALL | re.MULTILINE),
    re.compile(r'Sender \(untrusted metadata\):.*?^\s*\}\s*\n', re.DOTALL | re.MULTILINE),
    re.compile(r'<relevant-memories>.*?</relevant-memories>', re.DOTALL),
    re.compile(r'\[media attached:.*?\]'),
    re.compile(r'```json\s*\{.*?"schema".*?\}\s*```', re.DOTALL),
    re.compile(r'Replied message \(untrusted.*?\}\s*\n\s*\}\s*\n', re.DOTALL),
    re.compile(r'\[Queued messages while agent was busy\].*?(?=\n\n|\Z)', re.DOTALL),
    re.compile(r'---\s*\nQueued #\d+\s*\n'),
    re.compile(r'^```\s*\n```\s*\n', re.MULTILINE),  # empty code blocks
]


def clean_user_text(text: str) -> str:
    """Scrub injected metadata/tooling noise from a raw user message.

    Applies every pattern in NOISE_PATTERNS, then normalizes whitespace
    so the cleaned text reads like a plain human turn.
    """
    cleaned = text
    for noise in NOISE_PATTERNS:
        cleaned = noise.sub("", cleaned)
    # Collapse runs of 3+ newlines down to a single blank line.
    cleaned = re.sub(r'\n{3,}', '\n\n', cleaned).strip()
    # Drop leading whitespace/backtick remnants left behind by the stripping.
    return re.sub(r'^[\s`]+', '', cleaned).strip()
# Where OpenClaw writes per-session JSONL transcripts for the "main" agent.
SESSIONS_DIR = os.path.expanduser("~/.openclaw/agents/main/sessions/")
# Destination directory for exported training JSONL files.
OUTPUT_DIR = "/mnt/ai-storage/grace/training-data/jsonl"
# Tracks which session ids have already been exported (enables incremental runs).
STATE_FILE = os.path.expanduser("~/self-improving/export-state.json")

# System prompt prepended as the first turn of every exported conversation.
SYSTEM_PROMPT = (
    "You are Grace, a Culture Mind-class AI assistant and trusted companion for Maxwell. "
    "You are warm, direct, witty, and proactive. You support Maxwell's ADHD executive function, "
    "manage his homelab, help with job searching, and operate local AI infrastructure. "
    "You speak plainly — no corporate pleasantries, no hedging. "
    "You use exec and local tools proactively and return real results. Never fabricate output."
)

# User turns starting with any of these prefixes are automation/heartbeat
# noise, not real questions — they are skipped entirely.
SKIP_USER = (
    "HEARTBEAT_OK", "NO_REPLY", "Read HEARTBEAT.md",
    "A new session was started", "[Queued messages",
    "Run your Session Startup", "Conversation info (untrusted",
)

# Assistant turns starting with any of these prefixes are boilerplate
# acknowledgements, not answers worth training on.
SKIP_ASSISTANT = (
    "HEARTBEAT_OK", "NO_REPLY",
    "✅ New session started",
)

# Minimum character counts for a turn to count as real content.
MIN_USER_CHARS = 15
MIN_ASST_CHARS = 40
||||
def get_text(content: list) -> str:
    """Concatenate the plain-text pieces of a message's content blocks.

    Dict blocks of type "text" contribute their "text" field and bare
    strings contribute themselves; every other block kind (tool calls,
    tool results, media) is ignored.
    """
    def _as_text(item):
        # Returns the textual payload of a block, or None if it has none.
        if isinstance(item, str):
            return item
        if isinstance(item, dict) and item.get("type") == "text":
            return item.get("text", "")
        return None

    pieces = [t for t in map(_as_text, content) if t is not None]
    return " ".join(pieces).strip()
def has_tool_result(content: list) -> bool:
    """Return True when this message carries tool output rather than human text."""
    for block in content:
        if not isinstance(block, dict):
            continue
        kind = block.get("type", "")
        # Explicit tool-result blocks (both naming conventions appear in logs).
        if kind in ("toolResult", "tool_result"):
            return True
        # Heuristic: text blocks that open like serialized tool/exec JSON —
        # catches the pattern where the whole content is a tool result object.
        if kind == "text" and block.get("text", "").startswith(('{\n "url":', '{\n "error":')):
            return True
    return False
def is_tool_result_message(content: list) -> bool:
    """Heuristically detect user messages that are purely tool output.

    Such messages carry raw tool return values (JSON blobs, fetched
    documents, exec output) rather than real human input, so they must
    not become training data.
    """
    raw = get_text(content)

    # JSON blob opening with a quoted key and a "url" field near the top.
    if raw.startswith('{\n "') and '"url"' in raw[:50]:
        return True

    # Front-matter documents, or long markdown that starts with a heading.
    # (Parenthesization mirrors Python's and/or precedence in the original:
    # the length check applies only to the "# " case.)
    if raw.startswith("---\nname:") or (raw.startswith("# ") and len(raw) > 500):
        return True

    # Several lines with no lowercase letters up front reads as exec output,
    # not conversation.
    if len(raw.strip().split('\n')) > 3 and not any(ch.isalpha() and ch.islower() for ch in raw[:30]):
        return True

    return False
def extract_pairs(path: str):
    """
    Parse a session JSONL file, return a list of (user_text, assistant_text) pairs.

    Each pair = real human question + final assistant answer (after tool use).
    Tool-interleaved turns (assistant toolCall narration, user toolResult
    echoes) are intermediate and are skipped; for each real user turn the
    LAST qualifying assistant text before the next real user turn wins.

    Returns [] on any file-level read error (best-effort by design).
    """
    # Phase 1: load every {"type": "message"} record that has a role and content.
    messages = []
    try:
        with open(path) as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    msg = json.loads(line)
                except Exception:
                    # Malformed JSONL line — skip it, keep the rest of the session.
                    continue
                if msg.get("type") != "message":
                    continue
                role = msg.get("message", {}).get("role")
                content = msg.get("message", {}).get("content", [])
                if role and content:
                    messages.append({"role": role, "content": content})
    except Exception as e:
        # File-level failure (unreadable, permissions, ...): report and bail.
        print(f" Error: {e}")
        return []

    # Phase 2: group into segments. Each segment starts with a user text turn.
    # A "user text turn" is a user message with actual text (not just toolResults).
    pairs = []
    i = 0
    while i < len(messages):
        msg = messages[i]

        # Find a user turn with real text.
        if msg["role"] != "user":
            i += 1
            continue

        user_text = get_text(msg["content"])
        is_tool_result = has_tool_result(msg["content"])

        if is_tool_result:
            # Tool output echoed back as a "user" message — not human input.
            i += 1
            continue

        # Clean metadata noise from the user text before judging its length.
        user_text = clean_user_text(user_text)

        if len(user_text) < MIN_USER_CHARS:
            i += 1
            continue

        if any(user_text.startswith(s) for s in SKIP_USER):
            i += 1
            continue

        # Look forward: scan until the next real user turn. The LAST
        # non-empty assistant text seen before it is the final answer.
        j = i + 1
        last_asst_text = None
        next_real_user = None

        while j < len(messages):
            m = messages[j]
            if m["role"] == "user":
                u_text = get_text(m["content"])
                u_is_tool = has_tool_result(m["content"])
                # NOTE(review): this length check uses the raw (uncleaned) text,
                # unlike the outer loop which cleans first — confirm intended.
                if not u_is_tool and len(u_text) >= MIN_USER_CHARS and not any(u_text.startswith(s) for s in SKIP_USER):
                    next_real_user = j
                    break
            elif m["role"] == "assistant":
                t = get_text(m["content"])
                if len(t) >= MIN_ASST_CHARS and not any(t.startswith(s) for s in SKIP_ASSISTANT):
                    # Keep overwriting: later assistant text supersedes earlier narration.
                    last_asst_text = t
            j += 1

        if last_asst_text:
            pairs.append((user_text, last_asst_text))

        # Jump to the next real user turn (or past the end if none found).
        i = next_real_user if next_real_user is not None else j

    return pairs
def load_state():
    """Load the incremental-export state file, or a fresh default state."""
    if not os.path.exists(STATE_FILE):
        # First run: nothing exported yet.
        return {"exported_sessions": [], "last_run": None, "total_examples": 0}
    with open(STATE_FILE) as fh:
        return json.load(fh)
def save_state(state):
    """Persist the export state as pretty-printed JSON."""
    serialized = json.dumps(state, indent=2)
    with open(STATE_FILE, "w") as fh:
        fh.write(serialized)
def main():
    """Incrementally export new sessions as ShareGPT-style training JSONL.

    Skips sessions already recorded in the state file, converts each new
    session into one example (system turn + alternating human/gpt turns),
    writes a timestamped JSONL file, and updates the state file.
    """
    state = load_state()
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # Deterministic processing order; ".reset." snapshot files are excluded.
    session_files = sorted([
        f for f in glob.glob(os.path.join(SESSIONS_DIR, "*.jsonl"))
        if ".reset." not in f
    ])

    new_examples = []
    new_sessions = []

    for sf in session_files:
        sid = os.path.basename(sf).replace(".jsonl", "")
        if sid in state["exported_sessions"]:
            # Already exported on a previous run.
            continue

        pairs = extract_pairs(sf)
        if len(pairs) < 2:
            # Too short to be useful — mark as seen so it is never re-scanned.
            print(f" {sid[:8]}: skipped ({len(pairs)} pairs)")
            state["exported_sessions"].append(sid)
            continue

        # ShareGPT-style conversation: system turn, then alternating human/gpt.
        conversations = [{"from": "system", "value": SYSTEM_PROMPT}]
        for user_text, asst_text in pairs:
            conversations.append({"from": "human", "value": user_text})
            conversations.append({"from": "gpt", "value": asst_text})

        new_examples.append({
            "conversations": conversations,
            "source": os.path.basename(sf),
            "exported_at": datetime.now(timezone.utc).isoformat(),
        })
        new_sessions.append(sid)
        print(f" {sid[:8]}: {len(pairs)} pairs ✓")

    if not new_examples:
        print("No new sessions to export.")
        # Still persist: skipped-session ids were appended to state above.
        state["exported_sessions"].extend(new_sessions)
        save_state(state)
        return

    # One output file per run, named by UTC timestamp.
    timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
    output_file = os.path.join(OUTPUT_DIR, f"grace_training_{timestamp}.jsonl")
    with open(output_file, "w") as f:
        for ex in new_examples:
            f.write(json.dumps(ex) + "\n")

    # Record what was exported so the next run starts where this one ended.
    state["exported_sessions"].extend(new_sessions)
    state["last_run"] = datetime.now(timezone.utc).isoformat()
    state["total_examples"] = state.get("total_examples", 0) + len(new_examples)
    save_state(state)

    print(f"\nWrote {len(new_examples)} examples → {output_file}")
    print(f"Total examples to date: {state['total_examples']}")
||||
# Script entry point.
if __name__ == "__main__":
    main()
Reference in New Issue
Block a user