feat: initial import of all helper scripts from ~/scripts/
- Training data pipeline: convert, export, extract, load-to-db - Infra tooling: infra-audit, infra-gitea-link - RAG pipeline: rag-ingest, rag-query - Fine-tuning: finetune-lora, overnight-qwen3, install-unsloth - Transcripts: export-transcripts - Updated README with script index and token reduction strategy
This commit is contained in:
270
extract-sessions.py
Normal file
270
extract-sessions.py
Normal file
@@ -0,0 +1,270 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Improved session extractor that handles tool-interleaved conversations.
|
||||
|
||||
Session structure in OpenClaw JSONL:
|
||||
user: question (text)
|
||||
assistant: narration + toolCall blocks <- intermediate, not training data
|
||||
user: toolResult blocks <- intermediate
|
||||
assistant: final answer (text) <- THIS is what we want
|
||||
|
||||
Strategy: for each user text turn, find the LAST assistant text turn
|
||||
before the next user text turn. That's the real response.
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import glob
|
||||
from datetime import datetime, timezone
|
||||
|
||||
# Noise patterns to strip from raw user messages
NOISE_PATTERNS = [
    re.compile(r'Conversation info \(untrusted metadata\):.*?^\s*\}\s*\n', re.DOTALL | re.MULTILINE),
    re.compile(r'Sender \(untrusted metadata\):.*?^\s*\}\s*\n', re.DOTALL | re.MULTILINE),
    re.compile(r'<relevant-memories>.*?</relevant-memories>', re.DOTALL),
    re.compile(r'\[media attached:.*?\]'),
    re.compile(r'```json\s*\{.*?"schema".*?\}\s*```', re.DOTALL),
    re.compile(r'Replied message \(untrusted.*?\}\s*\n\s*\}\s*\n', re.DOTALL),
    re.compile(r'\[Queued messages while agent was busy\].*?(?=\n\n|\Z)', re.DOTALL),
    re.compile(r'---\s*\nQueued #\d+\s*\n'),
    re.compile(r'^```\s*\n```\s*\n', re.MULTILINE),  # empty code blocks
]


def clean_user_text(text: str) -> str:
    """Scrub injected metadata/tooling noise from a raw user message.

    Applies every pattern in NOISE_PATTERNS, then normalizes whitespace
    so the cleaned text reads like a plain human turn.
    """
    cleaned = text
    for noise in NOISE_PATTERNS:
        cleaned = noise.sub("", cleaned)
    # Collapse runs of 3+ newlines down to a single blank line.
    cleaned = re.sub(r'\n{3,}', '\n\n', cleaned).strip()
    # Drop leading whitespace/backtick remnants left behind by the stripping.
    return re.sub(r'^[\s`]+', '', cleaned).strip()
# Where OpenClaw writes per-session JSONL transcripts for the "main" agent.
SESSIONS_DIR = os.path.expanduser("~/.openclaw/agents/main/sessions/")
# Destination directory for exported training JSONL files.
OUTPUT_DIR = "/mnt/ai-storage/grace/training-data/jsonl"
# Tracks which session ids have already been exported (enables incremental runs).
STATE_FILE = os.path.expanduser("~/self-improving/export-state.json")

# System prompt prepended as the first turn of every exported conversation.
SYSTEM_PROMPT = (
    "You are Grace, a Culture Mind-class AI assistant and trusted companion for Maxwell. "
    "You are warm, direct, witty, and proactive. You support Maxwell's ADHD executive function, "
    "manage his homelab, help with job searching, and operate local AI infrastructure. "
    "You speak plainly — no corporate pleasantries, no hedging. "
    "You use exec and local tools proactively and return real results. Never fabricate output."
)

# User turns starting with any of these prefixes are automation/heartbeat
# noise, not real questions — they are skipped entirely.
SKIP_USER = (
    "HEARTBEAT_OK", "NO_REPLY", "Read HEARTBEAT.md",
    "A new session was started", "[Queued messages",
    "Run your Session Startup", "Conversation info (untrusted",
)

# Assistant turns starting with any of these prefixes are boilerplate
# acknowledgements, not answers worth training on.
SKIP_ASSISTANT = (
    "HEARTBEAT_OK", "NO_REPLY",
    "✅ New session started",
)

# Minimum character counts for a turn to count as real content.
MIN_USER_CHARS = 15
MIN_ASST_CHARS = 40
||||
def get_text(content: list) -> str:
    """Concatenate the plain-text pieces of a message's content blocks.

    Dict blocks of type "text" contribute their "text" field and bare
    strings contribute themselves; every other block kind (tool calls,
    tool results, media) is ignored.
    """
    def _as_text(item):
        # Returns the textual payload of a block, or None if it has none.
        if isinstance(item, str):
            return item
        if isinstance(item, dict) and item.get("type") == "text":
            return item.get("text", "")
        return None

    pieces = [t for t in map(_as_text, content) if t is not None]
    return " ".join(pieces).strip()
def has_tool_result(content: list) -> bool:
    """Return True when this message carries tool output rather than human text."""
    for block in content:
        if not isinstance(block, dict):
            continue
        kind = block.get("type", "")
        # Explicit tool-result blocks (both naming conventions appear in logs).
        if kind in ("toolResult", "tool_result"):
            return True
        # Heuristic: text blocks that open like serialized tool/exec JSON —
        # catches the pattern where the whole content is a tool result object.
        if kind == "text" and block.get("text", "").startswith(('{\n "url":', '{\n "error":')):
            return True
    return False
def is_tool_result_message(content: list) -> bool:
    """Heuristically detect user messages that are purely tool output.

    Such messages carry raw tool return values (JSON blobs, fetched
    documents, exec output) rather than real human input, so they must
    not become training data.
    """
    raw = get_text(content)

    # JSON blob opening with a quoted key and a "url" field near the top.
    if raw.startswith('{\n "') and '"url"' in raw[:50]:
        return True

    # Front-matter documents, or long markdown that starts with a heading.
    # (Parenthesization mirrors Python's and/or precedence in the original:
    # the length check applies only to the "# " case.)
    if raw.startswith("---\nname:") or (raw.startswith("# ") and len(raw) > 500):
        return True

    # Several lines with no lowercase letters up front reads as exec output,
    # not conversation.
    if len(raw.strip().split('\n')) > 3 and not any(ch.isalpha() and ch.islower() for ch in raw[:30]):
        return True

    return False
def extract_pairs(path: str):
    """
    Parse a session JSONL file, return a list of (user_text, assistant_text) pairs.

    Each pair = real human question + final assistant answer (after tool use).
    Tool-interleaved turns (assistant toolCall narration, user toolResult
    echoes) are intermediate and are skipped; for each real user turn the
    LAST qualifying assistant text before the next real user turn wins.

    Returns [] on any file-level read error (best-effort by design).
    """
    # Phase 1: load every {"type": "message"} record that has a role and content.
    messages = []
    try:
        with open(path) as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    msg = json.loads(line)
                except Exception:
                    # Malformed JSONL line — skip it, keep the rest of the session.
                    continue
                if msg.get("type") != "message":
                    continue
                role = msg.get("message", {}).get("role")
                content = msg.get("message", {}).get("content", [])
                if role and content:
                    messages.append({"role": role, "content": content})
    except Exception as e:
        # File-level failure (unreadable, permissions, ...): report and bail.
        print(f" Error: {e}")
        return []

    # Phase 2: group into segments. Each segment starts with a user text turn.
    # A "user text turn" is a user message with actual text (not just toolResults).
    pairs = []
    i = 0
    while i < len(messages):
        msg = messages[i]

        # Find a user turn with real text.
        if msg["role"] != "user":
            i += 1
            continue

        user_text = get_text(msg["content"])
        is_tool_result = has_tool_result(msg["content"])

        if is_tool_result:
            # Tool output echoed back as a "user" message — not human input.
            i += 1
            continue

        # Clean metadata noise from the user text before judging its length.
        user_text = clean_user_text(user_text)

        if len(user_text) < MIN_USER_CHARS:
            i += 1
            continue

        if any(user_text.startswith(s) for s in SKIP_USER):
            i += 1
            continue

        # Look forward: scan until the next real user turn. The LAST
        # non-empty assistant text seen before it is the final answer.
        j = i + 1
        last_asst_text = None
        next_real_user = None

        while j < len(messages):
            m = messages[j]
            if m["role"] == "user":
                u_text = get_text(m["content"])
                u_is_tool = has_tool_result(m["content"])
                # NOTE(review): this length check uses the raw (uncleaned) text,
                # unlike the outer loop which cleans first — confirm intended.
                if not u_is_tool and len(u_text) >= MIN_USER_CHARS and not any(u_text.startswith(s) for s in SKIP_USER):
                    next_real_user = j
                    break
            elif m["role"] == "assistant":
                t = get_text(m["content"])
                if len(t) >= MIN_ASST_CHARS and not any(t.startswith(s) for s in SKIP_ASSISTANT):
                    # Keep overwriting: later assistant text supersedes earlier narration.
                    last_asst_text = t
            j += 1

        if last_asst_text:
            pairs.append((user_text, last_asst_text))

        # Jump to the next real user turn (or past the end if none found).
        i = next_real_user if next_real_user is not None else j

    return pairs
def load_state():
    """Load the incremental-export state file, or a fresh default state."""
    if not os.path.exists(STATE_FILE):
        # First run: nothing exported yet.
        return {"exported_sessions": [], "last_run": None, "total_examples": 0}
    with open(STATE_FILE) as fh:
        return json.load(fh)
def save_state(state):
    """Persist the export state as pretty-printed JSON."""
    serialized = json.dumps(state, indent=2)
    with open(STATE_FILE, "w") as fh:
        fh.write(serialized)
def main():
    """Incrementally export new sessions as ShareGPT-style training JSONL.

    Skips sessions already recorded in the state file, converts each new
    session into one example (system turn + alternating human/gpt turns),
    writes a timestamped JSONL file, and updates the state file.
    """
    state = load_state()
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # Deterministic processing order; ".reset." snapshot files are excluded.
    session_files = sorted([
        f for f in glob.glob(os.path.join(SESSIONS_DIR, "*.jsonl"))
        if ".reset." not in f
    ])

    new_examples = []
    new_sessions = []

    for sf in session_files:
        sid = os.path.basename(sf).replace(".jsonl", "")
        if sid in state["exported_sessions"]:
            # Already exported on a previous run.
            continue

        pairs = extract_pairs(sf)
        if len(pairs) < 2:
            # Too short to be useful — mark as seen so it is never re-scanned.
            print(f" {sid[:8]}: skipped ({len(pairs)} pairs)")
            state["exported_sessions"].append(sid)
            continue

        # ShareGPT-style conversation: system turn, then alternating human/gpt.
        conversations = [{"from": "system", "value": SYSTEM_PROMPT}]
        for user_text, asst_text in pairs:
            conversations.append({"from": "human", "value": user_text})
            conversations.append({"from": "gpt", "value": asst_text})

        new_examples.append({
            "conversations": conversations,
            "source": os.path.basename(sf),
            "exported_at": datetime.now(timezone.utc).isoformat(),
        })
        new_sessions.append(sid)
        print(f" {sid[:8]}: {len(pairs)} pairs ✓")

    if not new_examples:
        print("No new sessions to export.")
        # Still persist: skipped-session ids were appended to state above.
        state["exported_sessions"].extend(new_sessions)
        save_state(state)
        return

    # One output file per run, named by UTC timestamp.
    timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
    output_file = os.path.join(OUTPUT_DIR, f"grace_training_{timestamp}.jsonl")
    with open(output_file, "w") as f:
        for ex in new_examples:
            f.write(json.dumps(ex) + "\n")

    # Record what was exported so the next run starts where this one ended.
    state["exported_sessions"].extend(new_sessions)
    state["last_run"] = datetime.now(timezone.utc).isoformat()
    state["total_examples"] = state.get("total_examples", 0) + len(new_examples)
    save_state(state)

    print(f"\nWrote {len(new_examples)} examples → {output_file}")
    print(f"Total examples to date: {state['total_examples']}")
||||
# Script entry point.
if __name__ == "__main__":
    main()
Reference in New Issue
Block a user