#!/usr/bin/env python3
"""
Export OpenClaw session JSONL files to training data format.
Uses session-logs skill structure to read raw sessions.
Outputs ShareGPT format JSONL for axolotl/unsloth fine-tuning.
Storage: grace@192.168.20.87:~/training-data/jsonl/
"""

import glob
import json
import os
import subprocess
from datetime import datetime, timezone

SESSIONS_DIR = os.path.expanduser("~/.openclaw/agents/main/sessions/")
OUTPUT_DIR = "/mnt/ai-storage/grace/training-data/jsonl"
STATE_FILE = os.path.expanduser("~/self-improving/export-state.json")

SYSTEM_PROMPT = (
    "You are Grace, a Culture Mind-class AI assistant and cognitive partner "
    "for Maxwell. You are warm, direct, witty, and proactive. You support "
    "Maxwell's ADHD executive function, manage his homelab, track job search "
    "tasks, and operate local AI infrastructure. You speak plainly — no "
    "corporate pleasantries, no hedging. You use tools proactively and "
    "report real results."
)

# User turns starting with any of these prefixes are never used as prompts.
_USER_SKIP_PREFIXES = (
    "HEARTBEAT_OK",
    "NO_REPLY",
    "Read HEARTBEAT.md",
    "A new session was started",
    "[Queued messages",
)
# Assistant turns starting with any of these prefixes are never used as replies.
_ASSISTANT_SKIP_PREFIXES = ("HEARTBEAT_OK", "NO_REPLY")
# A user turn must have at least this many characters to be used.
_MIN_USER_CHARS = 15
# An assistant turn must have MORE than this many characters to be used.
_MIN_ASSISTANT_CHARS = 30


def _utc_now():
    """Return the current UTC time as a naive datetime.

    Stands in for the deprecated ``datetime.utcnow()``; the tzinfo is
    stripped so ``isoformat()`` / ``strftime()`` output keeps the same
    shape (no ``+00:00`` suffix).
    """
    return datetime.now(timezone.utc).replace(tzinfo=None)


def load_state():
    """Load the export state from STATE_FILE.

    Returns:
        A dict with the keys ``exported_sessions``, ``last_run`` and
        ``total_examples``. Keys absent from the file get their default
        value; a missing file yields all defaults.

    Raises:
        ValueError: if the file does not contain a JSON object (this includes
            malformed JSON, since ``json.JSONDecodeError`` is a ValueError).
    """
    state = {"exported_sessions": [], "last_run": None, "total_examples": 0}
    try:
        with open(STATE_FILE, encoding="utf-8") as f:
            loaded = json.load(f)
    except FileNotFoundError:
        return state
    if not isinstance(loaded, dict):
        raise ValueError(f"{STATE_FILE} does not contain a JSON object")
    state.update(loaded)
    return state


def save_state(state):
    """Write *state* to STATE_FILE as indented JSON.

    The parent directory is created when missing. The data is written to a
    sibling temporary file and then moved into place with ``os.replace`` so
    an interrupted run cannot leave a truncated state file behind.
    """
    state_dir = os.path.dirname(STATE_FILE)
    if state_dir:
        os.makedirs(state_dir, exist_ok=True)
    tmp_path = f"{STATE_FILE}.tmp"
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(state, f, indent=2)
    os.replace(tmp_path, STATE_FILE)


def extract_text_from_content(content):
    """
    Extract displayable text from a message's content.

    Args:
        content: either a plain string or a list of blocks. List items may be
            strings (kept as-is) or dicts; only dicts whose ``type`` is
            ``"text"`` contribute their ``text`` value.

    Returns:
        The kept pieces joined by a single space and stripped. An empty
        string when nothing qualifies or *content* is neither str nor list.
    """
    # A bare string must not be iterated: that would join its characters
    # with spaces.
    if isinstance(content, str):
        return content.strip()
    if not isinstance(content, (list, tuple)):
        return ""

    text_parts = []
    for block in content:
        if isinstance(block, str):
            text_parts.append(block)
        elif isinstance(block, dict) and block.get("type", "") == "text":
            text = block.get("text", "")
            # Guard against null / non-string payloads, which would make
            # str.join raise.
            if isinstance(text, str):
                text_parts.append(text)
        # Every other block type (thinking, toolCall, toolResult, ...) is
        # internal plumbing and is skipped.
    return " ".join(text_parts).strip()


def _read_messages(jsonl_path):
    """Read *jsonl_path* and return ``[{"role": ..., "text": ...}, ...]``.

    Only records whose ``type`` is ``"message"`` and that carry a truthy role
    and truthy content are kept. Blank lines, lines that are not valid JSON,
    and records with an unexpected shape are skipped individually.

    Raises:
        OSError, UnicodeDecodeError: when the file cannot be read.
    """
    messages = []
    with open(jsonl_path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                record = json.loads(line)
            except ValueError:  # json.JSONDecodeError is a ValueError
                continue
            if not isinstance(record, dict) or record.get("type") != "message":
                continue
            payload = record.get("message")
            if not isinstance(payload, dict):
                continue
            role = payload.get("role")
            content = payload.get("content", [])
            if not role or not content:
                continue
            messages.append(
                {"role": role, "text": extract_text_from_content(content)}
            )
    return messages


def _is_usable_reply(message):
    """Return True when *message* is an assistant turn worth exporting."""
    text = message["text"]
    return (
        message["role"] == "assistant"
        and len(text) > _MIN_ASSISTANT_CHARS
        and not text.startswith(_ASSISTANT_SKIP_PREFIXES)
    )


def _pair_turns(messages):
    """Pair each qualifying user turn with the next usable assistant turn.

    The look-ahead does not stop at intervening messages: anything between
    the user turn and the chosen reply (including other user turns) is
    skipped, and scanning resumes right after the reply.
    """
    conversations = []
    total = len(messages)
    i = 0
    while i < total:
        user_text = messages[i]["text"]
        if (
            messages[i]["role"] != "user"
            or len(user_text) < _MIN_USER_CHARS
            or user_text.startswith(_USER_SKIP_PREFIXES)
        ):
            i += 1
            continue

        reply_index = next(
            (j for j in range(i + 1, total) if _is_usable_reply(messages[j])),
            None,
        )
        if reply_index is None:
            # No usable reply after this turn means none after any later
            # turn either, so further scanning cannot add pairs.
            break

        conversations.append({"from": "human", "value": user_text})
        conversations.append(
            {"from": "gpt", "value": messages[reply_index]["text"]}
        )
        i = reply_index + 1
    return conversations


def extract_conversations(jsonl_path):
    """
    Extract clean user/assistant turn pairs from a session JSONL file.

    Strategy: collect all messages in order. For each user turn with enough
    text, look ahead to the next assistant turn that has enough visible text
    and is not a heartbeat/no-reply marker, and pair them.

    Returns:
        A flat list alternating ``{"from": "human", ...}`` and
        ``{"from": "gpt", ...}`` entries. An empty list when the file cannot
        be read (the error is printed).
    """
    try:
        messages = _read_messages(jsonl_path)
    except (OSError, UnicodeDecodeError) as e:
        print(f" Error reading {jsonl_path}: {e}")
        return []
    return _pair_turns(messages)


def session_to_example(jsonl_path):
    """Build one ShareGPT-style example from a session file.

    Returns:
        ``None`` when the session yields fewer than two conversation entries;
        otherwise a dict with ``conversations`` (system prompt first),
        ``source`` (the file's base name) and ``exported_at`` (UTC, ISO 8601,
        no offset suffix).
    """
    convs = extract_conversations(jsonl_path)
    if len(convs) < 2:
        return None
    return {
        "conversations": [{"from": "system", "value": SYSTEM_PROMPT}] + convs,
        "source": os.path.basename(jsonl_path),
        "exported_at": _utc_now().isoformat(),
    }


def main():
    """Export every not-yet-exported session into one new JSONL file.

    Sessions are read from SESSIONS_DIR (files whose name contains
    ``.reset.`` are ignored). Sessions with fewer than four conversation
    entries, system prompt included, are reported as too short and are not
    recorded as exported. The state file is updated only after the output
    file has been written.
    """
    state = load_state()
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    session_files = sorted(
        path
        for path in glob.glob(os.path.join(SESSIONS_DIR, "*.jsonl"))
        # Test the file name only, so a parent directory containing
        # ".reset." cannot exclude every session.
        if ".reset." not in os.path.basename(path)
    )

    # Set membership keeps the skip check O(1) per session.
    already_exported = set(state["exported_sessions"])
    new_examples = []
    new_sessions = []

    for sf in session_files:
        sid = os.path.basename(sf).replace(".jsonl", "")
        if sid in already_exported:
            continue

        print(f"Processing: {sid}")
        example = session_to_example(sf)
        if example and len(example["conversations"]) >= 4:
            # Subtract the system prompt, then count human/gpt pairs.
            turns = (len(example["conversations"]) - 1) // 2
            print(f" → {turns} turns")
            new_examples.append(example)
            new_sessions.append(sid)
        else:
            print(" → skipped (too short)")

    if not new_examples:
        print("No new sessions to export.")
        return

    now = _utc_now()
    timestamp = now.strftime("%Y%m%d_%H%M%S")
    output_file = os.path.join(OUTPUT_DIR, f"grace_training_{timestamp}.jsonl")

    with open(output_file, "w", encoding="utf-8") as f:
        f.writelines(json.dumps(ex) + "\n" for ex in new_examples)

    state["exported_sessions"].extend(new_sessions)
    state["last_run"] = now.isoformat()
    state["total_examples"] = state.get("total_examples", 0) + len(new_examples)
    save_state(state)

    print(f"\nWrote {len(new_examples)} examples → {output_file}")
    print(f"Total examples to date: {state['total_examples']}")


if __name__ == "__main__":
    main()