#!/usr/bin/env python3
"""
Export OpenClaw session JSONL files to training data format.
Uses session-logs skill structure to read raw sessions.
Outputs ShareGPT format JSONL for axolotl/unsloth fine-tuning.

Storage: grace@192.168.20.87:~/training-data/jsonl/
"""

import json
import os
import glob
import subprocess
from datetime import datetime

# Directory that main() scans for per-session "*.jsonl" log files.
SESSIONS_DIR = os.path.expanduser("~/.openclaw/agents/main/sessions/")
# Directory where main() writes each timestamped grace_training_*.jsonl export.
OUTPUT_DIR = "/mnt/ai-storage/grace/training-data/jsonl"
# JSON bookkeeping file read by load_state() and written by save_state().
STATE_FILE = os.path.expanduser("~/self-improving/export-state.json")

# System turn that session_to_example() prepends to every exported conversation.
SYSTEM_PROMPT = """You are Grace, a Culture Mind-class AI assistant and cognitive partner for Maxwell. You are warm, direct, witty, and proactive. You support Maxwell's ADHD executive function, manage his homelab, track job search tasks, and operate local AI infrastructure. You speak plainly — no corporate pleasantries, no hedging. You use tools proactively and report real results."""

def load_state():
    """
    Load the export bookkeeping state from STATE_FILE.

    Returns a dict with the keys "exported_sessions" (list of session ids
    already exported), "last_run" (ISO timestamp string or None) and
    "total_examples" (running count). When the state file does not exist,
    the defaults are returned. Keys missing from an existing state file are
    backfilled with their defaults so callers can index them directly.

    Malformed JSON is deliberately NOT swallowed: silently resetting the
    state would cause every session to be exported again.
    """
    state = {"exported_sessions": [], "last_run": None, "total_examples": 0}
    try:
        # EAFP: open directly instead of exists()+open(), which is racy.
        with open(STATE_FILE, encoding="utf-8") as f:
            saved = json.load(f)
    except FileNotFoundError:
        return state
    # Saved values win; defaults only fill in keys the file lacks.
    state.update(saved)
    return state

def save_state(state):
    """
    Persist the export bookkeeping state to STATE_FILE as indented JSON.

    The parent directory is created if it does not exist, and the file is
    written to a temporary sibling first and then moved into place with
    os.replace(), so an interrupted run cannot leave a truncated state file
    behind.
    """
    state_dir = os.path.dirname(STATE_FILE)
    if state_dir:
        os.makedirs(state_dir, exist_ok=True)

    tmp_path = STATE_FILE + ".tmp"
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(state, f, indent=2)
    # Swap the finished file in; readers see either the old or the new state.
    os.replace(tmp_path, STATE_FILE)

def extract_text_from_content(content):
    """
    Extract the visible text from a message's content.

    Accepted shapes:
      * a plain string            -> returned stripped
      * a single block dict       -> treated as a one-element block list
      * an iterable of blocks     -> bare strings and {"type": "text"} blocks
                                     are collected; every other block type
                                     (thinking, toolCall, toolResult, ...) is
                                     ignored as internal plumbing
      * None / empty              -> ""

    Returns the collected parts joined by single spaces and stripped.
    Empty parts and non-string "text" values are dropped.
    """
    # A bare string must not be iterated: that would yield single characters
    # and produce "h e l l o" after the join below.
    if isinstance(content, str):
        return content.strip()
    # Iterating a dict would yield its keys, so wrap a lone block in a list.
    if isinstance(content, dict):
        content = [content]
    if not content:
        return ""

    text_parts = []
    for block in content:
        if isinstance(block, str):
            text_parts.append(block)
        elif isinstance(block, dict) and block.get("type", "") == "text":
            text = block.get("text", "")
            # Guard against "text": null, which would break str.join.
            if isinstance(text, str):
                text_parts.append(text)
    # Drop empty parts so they do not leave doubled spaces in the output.
    return " ".join(part for part in text_parts if part).strip()


def extract_conversations(jsonl_path):
    """
    Extract clean user/assistant turn pairs from a session JSONL file.

    Reading: every non-blank line is parsed as JSON. Lines that are not
    valid JSON, are not objects, are not of type "message", or lack a role
    or content are skipped. The visible text of each remaining message is
    obtained with extract_text_from_content().

    Pairing: for each user message with at least 15 characters of text that
    does not start with a housekeeping phrase, the next assistant message
    with more than 30 characters of text is taken as the reply (the final
    answer after any tool calls). Everything between the two is skipped.
    If that reply starts with a housekeeping phrase the pair is discarded.
    Scanning resumes after the reply either way.

    Returns a flat list of {"from": "human"|"gpt", "value": text} dicts in
    alternating order. On any error while reading the file, a message is
    printed and an empty list is returned.
    """
    # Housekeeping traffic that must never become a training turn.
    skip_user_prefixes = ("HEARTBEAT_OK", "NO_REPLY", "Read HEARTBEAT.md",
                          "A new session was started", "[Queued messages")
    skip_assistant_prefixes = ("HEARTBEAT_OK", "NO_REPLY")

    messages = []

    try:
        with open(jsonl_path, encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    msg = json.loads(line)
                except (ValueError, RecursionError):
                    # One corrupt line should not cost us the whole session.
                    continue

                # Valid JSON that is not an object (list, string, number)
                # cannot be a message record.
                if not isinstance(msg, dict) or msg.get("type") != "message":
                    continue

                payload = msg.get("message")
                if not isinstance(payload, dict):
                    continue

                role = payload.get("role")
                content = payload.get("content")
                if not role or not content:
                    continue

                text = extract_text_from_content(content)
                messages.append({"role": role, "text": text})

    except Exception as e:
        # Best-effort boundary: an unreadable session is reported and skipped
        # rather than aborting the whole export run.
        print(f"  Error reading {jsonl_path}: {e}")
        return []

    # Build user→assistant pairs
    conversations = []
    total = len(messages)
    i = 0
    while i < total:
        msg = messages[i]
        user_text = msg["text"]

        # Only user turns with real, non-housekeeping text start a pair.
        if (msg["role"] != "user"
                or len(user_text) < 15
                or user_text.startswith(skip_user_prefixes)):
            i += 1
            continue

        # Look ahead for the next assistant turn with substantial text.
        reply_index = next(
            (j for j in range(i + 1, total)
             if messages[j]["role"] == "assistant"
             and len(messages[j]["text"]) > 30),
            None,
        )
        if reply_index is None:
            # No substantial assistant reply remains after this point, so no
            # later user turn can be paired either; stop scanning.
            break

        asst_text = messages[reply_index]["text"]
        if not asst_text.startswith(skip_assistant_prefixes):
            conversations.append({"from": "human", "value": user_text})
            conversations.append({"from": "gpt", "value": asst_text})
        i = reply_index + 1

    return conversations

def session_to_example(jsonl_path):
    """
    Turn one session JSONL file into a single ShareGPT-style example.

    Returns None when the session yields fewer than two conversation
    entries. Otherwise returns a dict holding the conversation (with the
    system prompt as its first turn), the source file name, and the UTC
    export timestamp in ISO format.
    """
    turns = extract_conversations(jsonl_path)
    if len(turns) >= 2:
        system_turn = {"from": "system", "value": SYSTEM_PROMPT}
        return {
            "conversations": [system_turn, *turns],
            "source": os.path.basename(jsonl_path),
            "exported_at": datetime.utcnow().isoformat(),
        }
    return None

def main():
    """
    Export every not-yet-exported session to a new training JSONL file.

    Steps:
      1. Load the bookkeeping state and collect the session files in
         SESSIONS_DIR, ignoring files whose name contains ".reset.".
      2. Convert each new session; keep it only if it has at least four
         conversation entries including the system turn.
      3. Write the kept examples, one JSON object per line, to a
         timestamped file in OUTPUT_DIR.
      4. Record the exported session ids, run time and running total in
         the state file.

    Sessions that are skipped as too short are not recorded, so they are
    reconsidered on the next run.
    """
    state = load_state()
    # Tolerate a state file that predates (or lost) this key.
    exported = state.setdefault("exported_sessions", [])
    # Set for O(1) membership tests; the list in `state` keeps export order.
    already_exported = set(exported)
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # Filter on the file name only, so a ".reset." somewhere in the directory
    # path cannot exclude every session.
    session_files = sorted(
        path for path in glob.glob(os.path.join(SESSIONS_DIR, "*.jsonl"))
        if ".reset." not in os.path.basename(path)
    )

    new_examples = []
    new_sessions = []

    for sf in session_files:
        # Same id derivation as before, so existing state entries still match.
        sid = os.path.basename(sf).replace(".jsonl", "")
        if sid in already_exported:
            continue

        print(f"Processing: {sid}")
        example = session_to_example(sf)

        if example and len(example["conversations"]) >= 4:
            # Subtract the system turn, then count human/gpt pairs.
            turns = (len(example["conversations"]) - 1) // 2
            print(f"  → {turns} turns")
            new_examples.append(example)
            new_sessions.append(sid)
        else:
            print("  → skipped (too short)")

    if not new_examples:
        print("No new sessions to export.")
        return

    timestamp = datetime.utcnow().strftime("%Y%m%d_%H%M%S")
    output_file = os.path.join(OUTPUT_DIR, f"grace_training_{timestamp}.jsonl")

    with open(output_file, "w", encoding="utf-8") as f:
        f.writelines(json.dumps(ex) + "\n" for ex in new_examples)

    exported.extend(new_sessions)
    state["last_run"] = datetime.utcnow().isoformat()
    state["total_examples"] = state.get("total_examples", 0) + len(new_examples)
    save_state(state)

    print(f"\nWrote {len(new_examples)} examples → {output_file}")
    print(f"Total examples to date: {state['total_examples']}")

# Run the export only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()