#!/usr/bin/env python3
"""
Verbatim transcript exporter for Grace training data.

Reads all OpenClaw session JSONL files and saves clean, full-conversation
transcripts to NFS. Strips metadata envelopes from user messages but
preserves all content verbatim.

Output format: ShareGPT JSONL (system + alternating human/gpt turns)
Storage: /mnt/ai-storage/grace/training-data/transcripts/
"""

import json
import os
import re
import glob
from datetime import datetime, timezone

SESSIONS_DIR = os.path.expanduser("~/.openclaw/agents/main/sessions/")
OUTPUT_DIR = "/mnt/ai-storage/grace/training-data/transcripts"
STATE_FILE = os.path.expanduser("~/self-improving/transcript-state.json")

SYSTEM_PROMPT = (
    "You are Grace, a Culture Mind-class AI assistant and trusted companion for Maxwell Burton. "
    "You are warm, direct, witty, and proactive. You support Maxwell's ADHD executive function, "
    "manage his homelab, help with job searching, and operate local AI infrastructure. "
    "You speak plainly — no corporate pleasantries, no hedging. "
    "You use exec and local tools proactively and return real results. Never fabricate output."
)

# Noise to strip from user turns (metadata envelopes, not content).
#
# NOTE(review): a degenerate pattern r'.*?\s*\n?' was removed from this list.
# Substituting it with "" deleted every run of whitespace in the user text
# (lazy .*? matches empty, greedy \s* then eats the whitespace), mangling
# content and violating the "verbatim" contract above. The markup it was
# originally meant to target appears to have been lost; if some envelope
# still leaks through, add a pattern anchored on literal delimiter text —
# never on bare whitespace.
NOISE_PATTERNS = [
    re.compile(r'Conversation info \(untrusted metadata\):.*?^\s*\}\s*\n', re.DOTALL | re.MULTILINE),
    re.compile(r'Sender \(untrusted metadata\):.*?^\s*\}\s*\n', re.DOTALL | re.MULTILINE),
    re.compile(r'\[media attached:.*?\]\s*\n?'),
    re.compile(r'To send an image back, prefer the message tool.*?\n', re.DOTALL),
    re.compile(r'```json\s*\{\s*"schema"\s*:.*?\}\s*```\s*\n?', re.DOTALL),
    re.compile(r'Replied message \(untrusted.*?\}\s*\n\s*\}\s*\n', re.DOTALL),
    re.compile(r'\[Queued messages while agent was busy\]\s*\n', re.DOTALL),
    re.compile(r'---\s*\nQueued #\d+\s*\n'),
    re.compile(r'^```\s*\n```\s*\n?', re.MULTILINE),
]

# Full user turns to skip entirely (heartbeat/control traffic, not conversation).
SKIP_EXACT = {"HEARTBEAT_OK", "NO_REPLY"}
SKIP_STARTSWITH = (
    "A new session was started via /new or /reset.",
    "Read HEARTBEAT.md if it exists",
    "Run your Session Startup",
)

# Skip assistant turns that are just internal narration with no value.
SKIP_ASST_EXACT = {"HEARTBEAT_OK", "NO_REPLY"}
SKIP_ASST_STARTSWITH = (
    "✅ New session started",
)


def clean_user(text: str) -> str:
    """Strip metadata-envelope noise from a user turn; preserve real content."""
    for p in NOISE_PATTERNS:
        text = p.sub("", text)
    text = re.sub(r'\n{3,}', '\n\n', text)  # collapse gaps left by removals
    text = re.sub(r'^[\s`]+', '', text)     # orphaned leading fence/whitespace
    return text.strip()


def get_text(content: list) -> str:
    """Join the text of all text blocks (dicts or raw strings) in a message."""
    parts = []
    for block in content:
        if isinstance(block, dict) and block.get("type") == "text":
            t = block.get("text", "").strip()
            if t:
                parts.append(t)
        elif isinstance(block, str) and block.strip():
            parts.append(block.strip())
    return "\n".join(parts).strip()


def is_tool_result_msg(content: list) -> bool:
    """True if this user message is a tool result, not a human turn."""
    return any(
        isinstance(b, dict) and b.get("type") in ("toolResult", "tool_result")
        for b in content
    )


def extract_transcript(path: str) -> list:
    """
    Extract the full verbatim conversation as a list of {from, value} dicts.

    Preserves all turns — doesn't filter by quality or length. Only removes
    metadata noise and skips tool result messages. Returns [] on read errors.
    """
    turns = []
    try:
        with open(path, encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    msg = json.loads(line)
                except json.JSONDecodeError:
                    # Truncated/corrupt line (e.g. crash mid-write); skip it.
                    continue
                if msg.get("type") != "message":
                    continue
                role = msg.get("message", {}).get("role")
                content = msg.get("message", {}).get("content", [])
                if not role or not content:
                    continue
                if role == "user":
                    # Skip tool result messages
                    if is_tool_result_msg(content):
                        continue
                    text = clean_user(get_text(content))
                    if not text:
                        continue
                    if text in SKIP_EXACT:
                        continue
                    if any(text.startswith(s) for s in SKIP_STARTSWITH):
                        continue
                    turns.append({"from": "human", "value": text})
                elif role == "assistant":
                    text = get_text(content)
                    if not text:
                        continue
                    if text in SKIP_ASST_EXACT:
                        continue
                    if any(text.startswith(s) for s in SKIP_ASST_STARTSWITH):
                        continue
                    turns.append({"from": "gpt", "value": text})
    except Exception as e:
        # Top-level boundary: report and return nothing rather than abort run.
        print(f"  Error reading {path}: {e}")
        return []

    # Ensure turns alternate properly (drop consecutive same-role turns,
    # keeping the last assistant turn before a role switch).
    clean = []
    for turn in turns:
        if clean and clean[-1]["from"] == turn["from"]:
            if turn["from"] == "gpt":
                # Replace with the later (more complete) assistant turn
                clean[-1] = turn
            else:
                # For consecutive user turns, keep both (queued messages)
                clean.append(turn)
        else:
            clean.append(turn)
    return clean


def load_state():
    """Load exporter state, or a fresh default if none exists yet."""
    if os.path.exists(STATE_FILE):
        with open(STATE_FILE, encoding="utf-8") as f:
            return json.load(f)
    return {"exported_sessions": [], "last_run": None, "total_turns": 0}


def save_state(state):
    """Persist exporter state atomically (temp file + rename) so a crash
    mid-write can never leave a corrupt state file behind."""
    tmp = STATE_FILE + ".tmp"
    with open(tmp, "w", encoding="utf-8") as f:
        json.dump(state, f, indent=2)
    os.replace(tmp, STATE_FILE)


def main():
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    state = load_state()
    # Set for O(1) membership tests; the list in `state` stays authoritative.
    already_exported = set(state["exported_sessions"])

    session_files = sorted(
        f for f in glob.glob(os.path.join(SESSIONS_DIR, "*.jsonl"))
        if ".reset." not in f
    )

    new_examples = []
    new_sessions = []
    total_new_turns = 0

    for sf in session_files:
        sid = os.path.basename(sf).replace(".jsonl", "")
        if sid in already_exported:
            continue

        turns = extract_transcript(sf)
        # Need at least 4 turns (2 exchanges) to be useful
        if len(turns) < 4:
            print(f"  {sid[:8]}: skipped ({len(turns)} turns)")
            state["exported_sessions"].append(sid)
            continue

        example = {
            "conversations": [{"from": "system", "value": SYSTEM_PROMPT}] + turns,
            "source": os.path.basename(sf),
            "turn_count": len(turns),
            "exported_at": datetime.now(timezone.utc).isoformat(),
        }
        new_examples.append(example)
        new_sessions.append(sid)
        total_new_turns += len(turns)
        print(f"  {sid[:8]}: {len(turns)} turns ✓")

    if not new_examples:
        print("No new sessions to export.")
        state["exported_sessions"].extend(new_sessions)
        save_state(state)
        return

    timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
    output_file = os.path.join(OUTPUT_DIR, f"grace_transcript_{timestamp}.jsonl")
    with open(output_file, "w", encoding="utf-8") as f:
        for ex in new_examples:
            # ensure_ascii=False keeps emoji/unicode readable in training data.
            f.write(json.dumps(ex, ensure_ascii=False) + "\n")

    state["exported_sessions"].extend(new_sessions)
    state["last_run"] = datetime.now(timezone.utc).isoformat()
    state["total_turns"] = state.get("total_turns", 0) + total_new_turns
    save_state(state)

    print(f"\nWrote {len(new_examples)} transcripts ({total_new_turns} turns) → {output_file}")
    print(f"Total turns to date: {state['total_turns']}")


if __name__ == "__main__":
    main()