#!/usr/bin/env python3
"""
Improved session extractor that handles tool-interleaved conversations.

Session structure in OpenClaw JSONL:
  user:      question (text)
  assistant: narration + toolCall blocks   <- intermediate, not training data
  user:      toolResult blocks             <- intermediate
  assistant: final answer (text)           <- THIS is what we want

Strategy: for each user text turn, find the LAST assistant text turn
before the next user text turn. That's the real response.
"""

import json
import os
import re
import glob
from datetime import datetime, timezone

# Noise patterns to strip from raw user messages.
#
# NOTE(review): this list used to contain the bare pattern r'.*?' compiled
# with re.DOTALL. On Python 3.7+ re.sub() with that pattern consumes the
# whole input one character at a time, so every user message was cleaned to
# "" and no pairs were ever produced. It looks like a delimited pattern whose
# delimiters were lost; re-add it with explicit delimiters if that noise
# still needs stripping.
NOISE_PATTERNS = [
    re.compile(r'Conversation info \(untrusted metadata\):.*?^\s*\}\s*\n', re.DOTALL | re.MULTILINE),
    re.compile(r'Sender \(untrusted metadata\):.*?^\s*\}\s*\n', re.DOTALL | re.MULTILINE),
    re.compile(r'\[media attached:.*?\]'),
    re.compile(r'```json\s*\{.*?"schema".*?\}\s*```', re.DOTALL),
    re.compile(r'Replied message \(untrusted.*?\}\s*\n\s*\}\s*\n', re.DOTALL),
    re.compile(r'\[Queued messages while agent was busy\].*?(?=\n\n|\Z)', re.DOTALL),
    re.compile(r'---\s*\nQueued #\d+\s*\n'),
    re.compile(r'^```\s*\n```\s*\n', re.MULTILINE),  # empty code blocks
]

# Pretty-printed JSON object prefixes used by the tool-output heuristics.
# The indent width is left open so both 1- and 2-space dumps are recognised.
_PRETTY_JSON_PREFIX = re.compile(r'\{\n\s*"')
_TOOL_JSON_PREFIX = re.compile(r'\{\n\s*"(?:url|error)":')


def clean_user_text(text: str) -> str:
    """Strip every NOISE_PATTERNS match from *text* and tidy the remainder.

    Runs of three or more newlines are collapsed to a blank line, and any
    leading whitespace/backticks left behind by a removed block are dropped.
    """
    for pattern in NOISE_PATTERNS:
        text = pattern.sub("", text)
    text = re.sub(r'\n{3,}', '\n\n', text).strip()
    # Strip leading backtick remnants (e.g. the closing fence of a removed
    # metadata block). Trailing backticks are kept on purpose: they may close
    # a code block the user actually wrote.
    text = re.sub(r'^[\s`]+', '', text).strip()
    return text


SESSIONS_DIR = os.path.expanduser("~/.openclaw/agents/main/sessions/")
OUTPUT_DIR = "/mnt/ai-storage/grace/training-data/jsonl"
STATE_FILE = os.path.expanduser("~/self-improving/export-state.json")

SYSTEM_PROMPT = (
    "You are Grace, a Culture Mind-class AI assistant and trusted companion for Maxwell. "
    "You are warm, direct, witty, and proactive. You support Maxwell's ADHD executive function, "
    "manage his homelab, help with job searching, and operate local AI infrastructure. "
    "You speak plainly — no corporate pleasantries, no hedging. "
    "You use exec and local tools proactively and return real results. Never fabricate output."
)

# Prefixes that mark a user / assistant turn as housekeeping, not dialogue.
SKIP_USER = (
    "HEARTBEAT_OK",
    "NO_REPLY",
    "Read HEARTBEAT.md",
    "A new session was started",
    "[Queued messages",
    "Run your Session Startup",
    "Conversation info (untrusted",
)
SKIP_ASSISTANT = (
    "HEARTBEAT_OK",
    "NO_REPLY",
    "✅ New session started",
)

MIN_USER_CHARS = 15
MIN_ASST_CHARS = 40


def get_text(content: list) -> str:
    """Return the text of a message's content, joined with single spaces.

    *content* is normally a list of blocks: dict blocks contribute their
    "text" field when their "type" is "text", bare strings contribute
    themselves, and everything else (tool calls, tool results) is ignored.
    A plain string is also accepted and returned stripped, so it is not
    iterated character by character.
    """
    if isinstance(content, str):
        return content.strip()
    parts = []
    for block in content:
        if isinstance(block, dict) and block.get("type") == "text":
            # `or ""` guards against an explicit null text field.
            parts.append(block.get("text") or "")
        elif isinstance(block, str):
            parts.append(block)
    return " ".join(parts).strip()


def has_tool_result(content: list) -> bool:
    """Check if this message is a tool result (not a real user message)."""
    for block in content:
        if not isinstance(block, dict):
            continue
        btype = block.get("type", "")
        if btype in ("toolResult", "tool_result"):
            return True
        # Also catches the pattern where the whole content is a tool result
        # object delivered as text: pretty-printed JSON whose first key is
        # "url" or "error".
        if btype == "text" and _TOOL_JSON_PREFIX.match(block.get("text") or ""):
            return True
    return False


def is_tool_result_message(content: list) -> bool:
    """
    Detect tool result messages: user messages that are purely tool output,
    not real human input.

    Heuristic on the extracted text: pretty-printed JSON mentioning "url"
    near the start, a "---\\nname:" front-matter header, a long (> 500 chars)
    text starting with "# ", or a multi-line text whose first 30 characters
    contain no lowercase letter.
    """
    raw = get_text(content)
    # Pure tool result patterns
    if _PRETTY_JSON_PREFIX.match(raw) and '"url"' in raw[:50]:
        return True
    # Parenthesised to make the existing precedence explicit: the length
    # requirement applies only to the "# " case.
    if raw.startswith("---\nname:") or (raw.startswith("# ") and len(raw) > 500):
        return True
    # Multi-line with no conversational content (exec output etc.)
    lines = raw.strip().split('\n')
    if len(lines) > 3 and not any(c.isalpha() and c.islower() for c in raw[:30]):
        return True
    return False


def _load_messages(path: str) -> list:
    """Read *path* (JSONL) and return its chat messages as role/content dicts.

    Malformed lines and records of an unexpected shape are skipped one by
    one. An unreadable file is reported and yields an empty list.
    """
    messages = []
    try:
        with open(path, encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    record = json.loads(line)
                except ValueError:  # includes json.JSONDecodeError
                    continue
                if not isinstance(record, dict) or record.get("type") != "message":
                    continue
                message = record.get("message")
                if not isinstance(message, dict):
                    continue
                role = message.get("role")
                content = message.get("content")
                # Only shapes the text helpers can iterate safely.
                if role and content and isinstance(content, (list, str)):
                    messages.append({"role": role, "content": content})
    except (OSError, UnicodeError) as e:
        print(f" Error: {e}")
        return []
    return messages


def _real_user_text(content):
    """Return the cleaned text of a genuine human turn, or None otherwise.

    A turn is rejected when it carries a tool result, when its cleaned text
    is shorter than MIN_USER_CHARS, or when it starts with a SKIP_USER prefix.
    """
    if has_tool_result(content):
        return None
    text = clean_user_text(get_text(content))
    if len(text) < MIN_USER_CHARS or text.startswith(SKIP_USER):
        return None
    return text


def extract_pairs(path: str):
    """
    Parse session, return list of (user_text, assistant_text) pairs.
    Each pair = real human question + final assistant answer (after tool use).

    The same "real user turn" test is used both to pick the question and to
    find where the next question starts, so a metadata-prefixed follow-up
    correctly ends the current segment. Returns [] if the file is unreadable.
    """
    messages = _load_messages(path)
    total = len(messages)

    pairs = []
    i = 0
    while i < total:
        msg = messages[i]
        user_text = _real_user_text(msg["content"]) if msg["role"] == "user" else None
        if user_text is None:
            i += 1
            continue

        # Look forward: the LAST qualifying assistant text before the next
        # real user turn is the answer; earlier ones are tool narration.
        last_asst_text = None
        j = i + 1
        while j < total:
            m = messages[j]
            if m["role"] == "user":
                if _real_user_text(m["content"]) is not None:
                    break
            elif m["role"] == "assistant":
                t = get_text(m["content"])
                if len(t) >= MIN_ASST_CHARS and not t.startswith(SKIP_ASSISTANT):
                    last_asst_text = t
            j += 1

        if last_asst_text:
            pairs.append((user_text, last_asst_text))

        # j is either the next real user turn or the end of the session.
        i = j

    return pairs


def load_state():
    """Load the export state from STATE_FILE, or return a fresh state.

    Keys missing from an existing state file are filled with their defaults.
    """
    state = {"exported_sessions": [], "last_run": None, "total_examples": 0}
    if os.path.exists(STATE_FILE):
        with open(STATE_FILE, encoding="utf-8") as f:
            state.update(json.load(f))
    return state


def save_state(state):
    """Write *state* to STATE_FILE as indented JSON, creating its directory."""
    state_dir = os.path.dirname(STATE_FILE)
    if state_dir:
        os.makedirs(state_dir, exist_ok=True)
    with open(STATE_FILE, "w", encoding="utf-8") as f:
        json.dump(state, f, indent=2)


def main():
    """Export every not-yet-exported session as one training example.

    Sessions yielding fewer than two pairs are recorded as handled but not
    exported. New examples go to a timestamped JSONL file in OUTPUT_DIR and
    the state file is updated.
    """
    state = load_state()
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    session_files = sorted(
        f for f in glob.glob(os.path.join(SESSIONS_DIR, "*.jsonl"))
        # Test the file name only, so a ".reset." in a parent directory
        # cannot hide every session.
        if ".reset." not in os.path.basename(f)
    )

    already_exported = set(state["exported_sessions"])
    new_examples = []
    new_sessions = []

    for sf in session_files:
        sid = os.path.basename(sf).replace(".jsonl", "")
        if sid in already_exported:
            continue

        pairs = extract_pairs(sf)
        if len(pairs) < 2:
            print(f" {sid[:8]}: skipped ({len(pairs)} pairs)")
            state["exported_sessions"].append(sid)
            already_exported.add(sid)
            continue

        conversations = [{"from": "system", "value": SYSTEM_PROMPT}]
        for user_text, asst_text in pairs:
            conversations.append({"from": "human", "value": user_text})
            conversations.append({"from": "gpt", "value": asst_text})

        new_examples.append({
            "conversations": conversations,
            "source": os.path.basename(sf),
            "exported_at": datetime.now(timezone.utc).isoformat(),
        })
        new_sessions.append(sid)
        already_exported.add(sid)
        print(f" {sid[:8]}: {len(pairs)} pairs ✓")

    if not new_examples:
        print("No new sessions to export.")
        state["exported_sessions"].extend(new_sessions)
        save_state(state)
        return

    timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
    output_file = os.path.join(OUTPUT_DIR, f"grace_training_{timestamp}.jsonl")

    with open(output_file, "w", encoding="utf-8") as f:
        for ex in new_examples:
            f.write(json.dumps(ex) + "\n")

    state["exported_sessions"].extend(new_sessions)
    state["last_run"] = datetime.now(timezone.utc).isoformat()
    state["total_examples"] = state.get("total_examples", 0) + len(new_examples)
    save_state(state)

    print(f"\nWrote {len(new_examples)} examples → {output_file}")
    print(f"Total examples to date: {state['total_examples']}")


if __name__ == "__main__":
    main()