- Training data pipeline: convert, export, extract, load-to-db
- Infra tooling: infra-audit, infra-gitea-link
- RAG pipeline: rag-ingest, rag-query
- Fine-tuning: finetune-lora, overnight-qwen3, install-unsloth
- Transcripts: export-transcripts
- Updated README with script index and token reduction strategy
238 lines
7.7 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
|
|
Verbatim transcript exporter for Grace training data.
|
|
|
|
Reads all OpenClaw session JSONL files and saves clean, full-conversation
|
|
transcripts to NFS. Strips metadata envelopes from user messages but
|
|
preserves all content verbatim.
|
|
|
|
Output format: ShareGPT JSONL (system + alternating human/gpt turns)
|
|
Storage: /mnt/ai-storage/grace/training-data/transcripts/
|
|
"""
|
|
|
|
import json
|
|
import os
|
|
import re
|
|
import glob
|
|
from datetime import datetime, timezone
|
|
|
|
# Where OpenClaw stores per-session JSONL logs for the "main" agent.
SESSIONS_DIR = os.path.expanduser("~/.openclaw/agents/main/sessions/")
# NFS destination for the exported ShareGPT-format transcripts.
OUTPUT_DIR = "/mnt/ai-storage/grace/training-data/transcripts"
# Tracks which session IDs have already been exported (enables incremental runs).
STATE_FILE = os.path.expanduser("~/self-improving/transcript-state.json")

# System turn prepended to every exported conversation.
SYSTEM_PROMPT = (
    "You are Grace, a Culture Mind-class AI assistant and trusted companion for Maxwell Burton. "
    "You are warm, direct, witty, and proactive. You support Maxwell's ADHD executive function, "
    "manage his homelab, help with job searching, and operate local AI infrastructure. "
    "You speak plainly — no corporate pleasantries, no hedging. "
    "You use exec and local tools proactively and return real results. Never fabricate output."
)

# Noise to strip from user turns (metadata envelopes, not content).
# Each pattern is deleted wholesale by clean_user(); DOTALL lets ".*?" span
# lines, MULTILINE anchors "^" at line starts where a closing brace is matched.
NOISE_PATTERNS = [
    re.compile(r'Conversation info \(untrusted metadata\):.*?^\s*\}\s*\n', re.DOTALL | re.MULTILINE),
    re.compile(r'Sender \(untrusted metadata\):.*?^\s*\}\s*\n', re.DOTALL | re.MULTILINE),
    re.compile(r'<relevant-memories>.*?</relevant-memories>\s*\n?', re.DOTALL),
    re.compile(r'\[media attached:.*?\]\s*\n?'),
    re.compile(r'To send an image back, prefer the message tool.*?\n', re.DOTALL),
    re.compile(r'```json\s*\{\s*"schema"\s*:.*?\}\s*```\s*\n?', re.DOTALL),
    re.compile(r'Replied message \(untrusted.*?\}\s*\n\s*\}\s*\n', re.DOTALL),
    re.compile(r'\[Queued messages while agent was busy\]\s*\n', re.DOTALL),
    re.compile(r'---\s*\nQueued #\d+\s*\n'),
    re.compile(r'^```\s*\n```\s*\n?', re.MULTILINE),
]

# Full user turns to skip entirely (exact match / prefix match).
SKIP_EXACT = {"HEARTBEAT_OK", "NO_REPLY"}
SKIP_STARTSWITH = (
    "A new session was started via /new or /reset.",
    "Read HEARTBEAT.md if it exists",
    "Run your Session Startup",
)

# Skip assistant turns that are just internal narration with no value.
SKIP_ASST_EXACT = {"HEARTBEAT_OK", "NO_REPLY"}
SKIP_ASST_STARTSWITH = (
    "✅ New session started",
)
|
def clean_user(text: str) -> str:
    """Return *text* with metadata-envelope noise removed, content intact."""
    cleaned = text
    for pattern in NOISE_PATTERNS:
        cleaned = pattern.sub("", cleaned)
    # Collapse runs of 3+ newlines, then drop leading whitespace/backtick litter.
    cleaned = re.sub(r'\n{3,}', '\n\n', cleaned)
    cleaned = re.sub(r'^[\s`]+', '', cleaned)
    return cleaned.strip()
|
|
|
|
|
|
def get_text(content: list) -> str:
    """Join the text portions of a message content list into one string.

    Accepts both dict blocks ({"type": "text", "text": ...}) and bare
    strings; any other block shape (tool calls, media, ...) is ignored.
    """
    chunks = []
    for item in content:
        if isinstance(item, dict):
            if item.get("type") == "text":
                stripped = item.get("text", "").strip()
                if stripped:
                    chunks.append(stripped)
        elif isinstance(item, str):
            stripped = item.strip()
            if stripped:
                chunks.append(stripped)
    return "\n".join(chunks).strip()
|
|
|
|
|
|
def is_tool_result_msg(content: list) -> bool:
    """True if this user message is a tool result, not a human turn."""
    for block in content:
        if not isinstance(block, dict):
            continue
        if block.get("type") in ("toolResult", "tool_result"):
            return True
    return False
|
|
|
|
|
|
def extract_transcript(path: str) -> list:
    """
    Extract full verbatim conversation as list of {from, value} dicts.

    Preserves all turns — doesn't filter by quality or length.
    Only removes metadata noise and skips tool result messages.
    """
    raw_turns = []

    try:
        with open(path) as fh:
            for raw_line in fh:
                raw_line = raw_line.strip()
                if not raw_line:
                    continue

                try:
                    record = json.loads(raw_line)
                except Exception:
                    # Malformed JSONL line — ignore and keep scanning.
                    continue

                if record.get("type") != "message":
                    continue

                envelope = record.get("message", {})
                role = envelope.get("role")
                content = envelope.get("content", [])
                if not role or not content:
                    continue

                if role == "user":
                    # Tool results are transport, not a human turn.
                    if is_tool_result_msg(content):
                        continue
                    text = clean_user(get_text(content))
                    if (
                        not text
                        or text in SKIP_EXACT
                        or text.startswith(SKIP_STARTSWITH)
                    ):
                        continue
                    raw_turns.append({"from": "human", "value": text})

                elif role == "assistant":
                    text = get_text(content)
                    if (
                        not text
                        or text in SKIP_ASST_EXACT
                        or text.startswith(SKIP_ASST_STARTSWITH)
                    ):
                        continue
                    raw_turns.append({"from": "gpt", "value": text})

    except Exception as e:
        print(f" Error reading {path}: {e}")
        return []

    # Enforce alternation: consecutive assistant turns collapse to the later
    # (more complete) one; consecutive human turns are all kept, since they
    # represent queued messages.
    alternating = []
    for turn in raw_turns:
        same_role = bool(alternating) and alternating[-1]["from"] == turn["from"]
        if same_role and turn["from"] == "gpt":
            alternating[-1] = turn
        else:
            alternating.append(turn)

    return alternating
|
|
|
|
|
|
def load_state():
    """Load exporter state from STATE_FILE, or a fresh default if absent."""
    if not os.path.exists(STATE_FILE):
        return {"exported_sessions": [], "last_run": None, "total_turns": 0}
    with open(STATE_FILE) as fh:
        return json.load(fh)
|
|
|
|
|
|
def save_state(state):
    """Persist exporter *state* to STATE_FILE as pretty-printed JSON."""
    serialized = json.dumps(state, indent=2)
    with open(STATE_FILE, "w") as fh:
        fh.write(serialized)
|
|
|
|
|
|
def main():
    """Export every not-yet-exported session to one timestamped JSONL file.

    Reads session JSONL files from SESSIONS_DIR, skips sessions already
    recorded in STATE_FILE, writes new ShareGPT examples to OUTPUT_DIR,
    and updates the state file afterwards.
    """
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    state = load_state()

    # Fix: membership was tested against the ever-growing exported_sessions
    # *list* once per session (O(n) each). Snapshot it into a set for O(1)
    # lookups; state itself keeps the list so it stays JSON-serializable.
    already_exported = set(state["exported_sessions"])

    # ".reset." files are session snapshots, not real conversations.
    session_files = sorted(
        f for f in glob.glob(os.path.join(SESSIONS_DIR, "*.jsonl"))
        if ".reset." not in f
    )

    new_examples = []
    new_sessions = []
    total_new_turns = 0

    for sf in session_files:
        sid = os.path.basename(sf).replace(".jsonl", "")
        if sid in already_exported:
            continue

        turns = extract_transcript(sf)

        # Need at least 4 turns (2 exchanges) to be useful
        if len(turns) < 4:
            print(f" {sid[:8]}: skipped ({len(turns)} turns)")
            # Mark as exported so short sessions aren't re-scanned next run.
            state["exported_sessions"].append(sid)
            continue

        example = {
            "conversations": [{"from": "system", "value": SYSTEM_PROMPT}] + turns,
            "source": os.path.basename(sf),
            "turn_count": len(turns),
            "exported_at": datetime.now(timezone.utc).isoformat(),
        }
        new_examples.append(example)
        new_sessions.append(sid)
        total_new_turns += len(turns)
        print(f" {sid[:8]}: {len(turns)} turns ✓")

    if not new_examples:
        print("No new sessions to export.")
        # Still persist: short sessions above were marked exported in-place.
        state["exported_sessions"].extend(new_sessions)
        save_state(state)
        return

    timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
    output_file = os.path.join(OUTPUT_DIR, f"grace_transcript_{timestamp}.jsonl")

    with open(output_file, "w") as f:
        for ex in new_examples:
            f.write(json.dumps(ex) + "\n")

    state["exported_sessions"].extend(new_sessions)
    state["last_run"] = datetime.now(timezone.utc).isoformat()
    state["total_turns"] = state.get("total_turns", 0) + total_new_turns
    save_state(state)

    print(f"\nWrote {len(new_examples)} transcripts ({total_new_turns} turns) → {output_file}")
    print(f"Total turns to date: {state['total_turns']}")
|
|
|
|
|
|
# Script entry point: run the incremental export when invoked directly.
if __name__ == "__main__":
    main()
|