feat: initial import of all helper scripts from ~/scripts/
- Training data pipeline: convert, export, extract, load-to-db
- Infra tooling: infra-audit, infra-gitea-link
- RAG pipeline: rag-ingest, rag-query
- Fine-tuning: finetune-lora, overnight-qwen3, install-unsloth
- Transcripts: export-transcripts
- Updated README with script index and token-reduction strategy
This commit is contained in:
180
export-training-data.py
Executable file
180
export-training-data.py
Executable file
@@ -0,0 +1,180 @@
|
||||
#!/usr/bin/env python3
"""
Export OpenClaw session JSONL files to training data format.
Uses session-logs skill structure to read raw sessions.
Outputs ShareGPT format JSONL for axolotl/unsloth fine-tuning.

Storage: grace@192.168.20.87:~/training-data/jsonl/
"""

import json
import os
import glob
import subprocess
from datetime import datetime

# Raw OpenClaw session transcripts, one JSONL file per session.
SESSIONS_DIR = os.path.expanduser("~/.openclaw/agents/main/sessions/")
# Destination for exported ShareGPT-format training files.
OUTPUT_DIR = "/mnt/ai-storage/grace/training-data/jsonl"
# Persisted run state: which sessions were already exported, last run time,
# running example count. Lets repeated runs export only new sessions.
STATE_FILE = os.path.expanduser("~/self-improving/export-state.json")

# System turn prepended to every exported conversation example.
SYSTEM_PROMPT = """You are Grace, a Culture Mind-class AI assistant and cognitive partner for Maxwell. You are warm, direct, witty, and proactive. You support Maxwell's ADHD executive function, manage his homelab, track job search tasks, and operate local AI infrastructure. You speak plainly — no corporate pleasantries, no hedging. You use tools proactively and report real results."""
|
||||
|
||||
def load_state():
    """Return the persisted export state, or a fresh default.

    Reads STATE_FILE when it exists; a missing file yields a clean
    state with no sessions recorded.
    """
    try:
        with open(STATE_FILE) as fh:
            return json.load(fh)
    except FileNotFoundError:
        return {"exported_sessions": [], "last_run": None, "total_examples": 0}
|
||||
|
||||
def save_state(state):
    """Persist *state* to STATE_FILE as pretty-printed JSON."""
    serialized = json.dumps(state, indent=2)
    with open(STATE_FILE, "w") as fh:
        fh.write(serialized)
|
||||
|
||||
def extract_text_from_content(content):
    """
    Pull the human-visible text out of a message content list.

    Plain strings and {"type": "text"} blocks contribute their text in
    order; thinking/toolCall/toolResult blocks are internal plumbing
    and are dropped, as is anything else that is neither dict nor str.
    Returns the pieces joined by single spaces, stripped.
    """
    pieces = []
    for item in content:
        if isinstance(item, dict):
            if item.get("type", "") == "text":
                pieces.append(item.get("text", ""))
        elif isinstance(item, str):
            pieces.append(item)
    return " ".join(pieces).strip()
|
||||
|
||||
|
||||
def extract_conversations(jsonl_path):
    """
    Extract clean user/assistant turn pairs from a session JSONL file.

    Strategy: collect all messages in order. For each user turn with text,
    look ahead to find the next assistant turn that has a text block
    (the final reply after any tool calls). Pair them.

    Returns a flat ShareGPT-style list alternating
    {"from": "human", ...} / {"from": "gpt", ...}. An unreadable file
    logs the error and yields [] rather than raising, so one broken
    session cannot abort a whole export run.
    """
    # User turns that are heartbeat/housekeeping noise, not real prompts.
    skip_phrases = ("HEARTBEAT_OK", "NO_REPLY", "Read HEARTBEAT.md",
                    "A new session was started", "[Queued messages")
    # Assistant replies that are protocol acknowledgements, not answers.
    bad = ("HEARTBEAT_OK", "NO_REPLY")

    messages = []
    try:
        with open(jsonl_path) as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    msg = json.loads(line)
                except json.JSONDecodeError:
                    # Malformed line (e.g. a truncated write) — skip it.
                    # Was a bare `except:`, which also swallowed things
                    # like KeyboardInterrupt.
                    continue

                if msg.get("type") != "message":
                    continue

                role = msg.get("message", {}).get("role")
                content = msg.get("message", {}).get("content", [])
                if not role or not content:
                    continue

                text = extract_text_from_content(content)
                messages.append({"role": role, "text": text})

    except Exception as e:
        # Best-effort reader: report and move on.
        print(f" Error reading {jsonl_path}: {e}")
        return []

    # Build user→assistant pairs
    conversations = []
    i = 0
    while i < len(messages):
        msg = messages[i]

        # Find a user turn with real text (>= 15 chars, not housekeeping).
        if msg["role"] == "user":
            user_text = msg["text"]
            # str.startswith accepts a tuple — no any() loop needed.
            if len(user_text) < 15 or user_text.startswith(skip_phrases):
                i += 1
                continue

            # Look ahead for the next assistant turn with substantive text
            # (the final reply after any tool-call turns).
            j = i + 1
            while j < len(messages):
                if messages[j]["role"] == "assistant" and len(messages[j]["text"]) > 30:
                    asst_text = messages[j]["text"]
                    if not asst_text.startswith(bad):
                        conversations.append({"from": "human", "value": user_text})
                        conversations.append({"from": "gpt", "value": asst_text})
                        i = j + 1
                        break
                j += 1
            else:
                # No suitable reply found; advance past this user turn.
                i += 1
        else:
            i += 1

    return conversations
|
||||
|
||||
def session_to_example(jsonl_path):
    """Convert one session file into a single ShareGPT training example.

    Returns None when the session yields fewer than one complete
    user/assistant exchange; otherwise a dict with the system prompt
    prepended, the source filename, and an export timestamp.
    """
    turns = extract_conversations(jsonl_path)
    if len(turns) < 2:
        return None
    system_turn = {"from": "system", "value": SYSTEM_PROMPT}
    return {
        "conversations": [system_turn, *turns],
        "source": os.path.basename(jsonl_path),
        "exported_at": datetime.utcnow().isoformat(),
    }
|
||||
|
||||
def main():
    """Export all not-yet-exported sessions to one timestamped JSONL file.

    Skips sessions already recorded in the state file and `.reset.`
    rollover copies, keeps only sessions with at least two full
    user/assistant exchanges, then appends the new session ids and
    counts to the persisted state.
    """
    state = load_state()
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # `.reset.` files are rollover copies of an existing session, not new data.
    session_files = sorted(
        f for f in glob.glob(os.path.join(SESSIONS_DIR, "*.jsonl"))
        if ".reset." not in f
    )

    new_examples = []
    new_sessions = []

    for sf in session_files:
        sid = os.path.basename(sf).replace(".jsonl", "")
        if sid in state["exported_sessions"]:
            continue

        print(f"Processing: {sid}")
        example = session_to_example(sf)

        # conversations = [system] + pairs, so >= 4 means at least two
        # complete user/assistant exchanges.
        if example and len(example["conversations"]) >= 4:
            turns = (len(example["conversations"]) - 1) // 2
            print(f" → {turns} turns")
            new_examples.append(example)
            new_sessions.append(sid)
        else:
            print(" → skipped (too short)")

    if not new_examples:
        print("No new sessions to export.")
        return

    # (A second, redundant makedirs(OUTPUT_DIR) used to live here — removed;
    # the directory is guaranteed by the call at the top of main().)
    timestamp = datetime.utcnow().strftime("%Y%m%d_%H%M%S")
    output_file = os.path.join(OUTPUT_DIR, f"grace_training_{timestamp}.jsonl")

    with open(output_file, "w") as f:
        for ex in new_examples:
            f.write(json.dumps(ex) + "\n")

    state["exported_sessions"].extend(new_sessions)
    state["last_run"] = datetime.utcnow().isoformat()
    state["total_examples"] = state.get("total_examples", 0) + len(new_examples)
    save_state(state)

    print(f"\nWrote {len(new_examples)} examples → {output_file}")
    print(f"Total examples to date: {state['total_examples']}")


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user