- Training data pipeline: convert, export, extract, load-to-db
- Infra tooling: infra-audit, infra-gitea-link
- RAG pipeline: rag-ingest, rag-query
- Fine-tuning: finetune-lora, overnight-qwen3, install-unsloth
- Transcripts: export-transcripts
- Updated README with script index and token reduction strategy
238 lines
7.7 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
|
|
Verbatim transcript exporter for Grace training data.
|
|
|
|
Reads all OpenClaw session JSONL files and saves clean, full-conversation
|
|
transcripts to NFS. Strips metadata envelopes from user messages but
|
|
preserves all content verbatim.
|
|
|
|
Output format: ShareGPT JSONL (system + alternating human/gpt turns)
|
|
Storage: /mnt/ai-storage/grace/training-data/transcripts/
|
|
"""
|
|
|
|
import json
|
|
import os
|
|
import re
|
|
import glob
|
|
from datetime import datetime, timezone
|
|
|
|
# Where OpenClaw stores per-session JSONL logs for the "main" agent.
SESSIONS_DIR = os.path.expanduser("~/.openclaw/agents/main/sessions/")
# NFS destination for the exported ShareGPT-format transcripts.
OUTPUT_DIR = "/mnt/ai-storage/grace/training-data/transcripts"
# Tracks which session IDs have already been exported (enables incremental runs).
STATE_FILE = os.path.expanduser("~/self-improving/transcript-state.json")

# System turn prepended to every exported conversation.
SYSTEM_PROMPT = (
    "You are Grace, a Culture Mind-class AI assistant and trusted companion for Maxwell Burton. "
    "You are warm, direct, witty, and proactive. You support Maxwell's ADHD executive function, "
    "manage his homelab, help with job searching, and operate local AI infrastructure. "
    "You speak plainly — no corporate pleasantries, no hedging. "
    "You use exec and local tools proactively and return real results. Never fabricate output."
)

# Noise to strip from user turns (metadata envelopes, not content).
# Each pattern is deleted wholesale by clean_user(); DOTALL lets ".*?" span
# lines, MULTILINE anchors "^" at line starts where a closing brace is matched.
NOISE_PATTERNS = [
    re.compile(r'Conversation info \(untrusted metadata\):.*?^\s*\}\s*\n', re.DOTALL | re.MULTILINE),
    re.compile(r'Sender \(untrusted metadata\):.*?^\s*\}\s*\n', re.DOTALL | re.MULTILINE),
    re.compile(r'<relevant-memories>.*?</relevant-memories>\s*\n?', re.DOTALL),
    re.compile(r'\[media attached:.*?\]\s*\n?'),
    re.compile(r'To send an image back, prefer the message tool.*?\n', re.DOTALL),
    re.compile(r'```json\s*\{\s*"schema"\s*:.*?\}\s*```\s*\n?', re.DOTALL),
    re.compile(r'Replied message \(untrusted.*?\}\s*\n\s*\}\s*\n', re.DOTALL),
    re.compile(r'\[Queued messages while agent was busy\]\s*\n', re.DOTALL),
    re.compile(r'---\s*\nQueued #\d+\s*\n'),
    re.compile(r'^```\s*\n```\s*\n?', re.MULTILINE),
]

# Full user turns to skip entirely (exact match / prefix match).
SKIP_EXACT = {"HEARTBEAT_OK", "NO_REPLY"}
SKIP_STARTSWITH = (
    "A new session was started via /new or /reset.",
    "Read HEARTBEAT.md if it exists",
    "Run your Session Startup",
)

# Skip assistant turns that are just internal narration with no value.
SKIP_ASST_EXACT = {"HEARTBEAT_OK", "NO_REPLY"}
SKIP_ASST_STARTSWITH = (
    "✅ New session started",
)
|
def clean_user(text: str) -> str:
    """Return *text* with metadata-envelope noise removed, content intact."""
    cleaned = text
    for pattern in NOISE_PATTERNS:
        cleaned = pattern.sub("", cleaned)
    # Collapse runs of 3+ newlines, then drop leading whitespace/backtick litter.
    cleaned = re.sub(r'\n{3,}', '\n\n', cleaned)
    cleaned = re.sub(r'^[\s`]+', '', cleaned)
    return cleaned.strip()
|
|
|
|
|
|
def get_text(content: list) -> str:
    """Join the text portions of a message content list into one string.

    Accepts both dict blocks ({"type": "text", "text": ...}) and bare
    strings; any other block shape (tool calls, media, ...) is ignored.
    """
    chunks = []
    for item in content:
        if isinstance(item, dict):
            if item.get("type") == "text":
                stripped = item.get("text", "").strip()
                if stripped:
                    chunks.append(stripped)
        elif isinstance(item, str):
            stripped = item.strip()
            if stripped:
                chunks.append(stripped)
    return "\n".join(chunks).strip()
|
|
|
|
|
|
def is_tool_result_msg(content: list) -> bool:
    """True if this user message is a tool result, not a human turn."""
    for block in content:
        if not isinstance(block, dict):
            continue
        if block.get("type") in ("toolResult", "tool_result"):
            return True
    return False
|
|
|
|
|
|
def extract_transcript(path: str) -> list:
    """
    Extract full verbatim conversation as list of {from, value} dicts.

    Preserves all turns — doesn't filter by quality or length.
    Only removes metadata noise and skips tool result messages.
    """
    raw_turns = []

    try:
        with open(path) as fh:
            for raw_line in fh:
                raw_line = raw_line.strip()
                if not raw_line:
                    continue

                try:
                    record = json.loads(raw_line)
                except Exception:
                    # Malformed JSONL line — ignore and keep scanning.
                    continue

                if record.get("type") != "message":
                    continue

                envelope = record.get("message", {})
                role = envelope.get("role")
                content = envelope.get("content", [])
                if not role or not content:
                    continue

                if role == "user":
                    # Tool results are transport, not a human turn.
                    if is_tool_result_msg(content):
                        continue
                    text = clean_user(get_text(content))
                    if (
                        not text
                        or text in SKIP_EXACT
                        or text.startswith(SKIP_STARTSWITH)
                    ):
                        continue
                    raw_turns.append({"from": "human", "value": text})

                elif role == "assistant":
                    text = get_text(content)
                    if (
                        not text
                        or text in SKIP_ASST_EXACT
                        or text.startswith(SKIP_ASST_STARTSWITH)
                    ):
                        continue
                    raw_turns.append({"from": "gpt", "value": text})

    except Exception as e:
        print(f" Error reading {path}: {e}")
        return []

    # Enforce alternation: consecutive assistant turns collapse to the later
    # (more complete) one; consecutive human turns are all kept, since they
    # represent queued messages.
    alternating = []
    for turn in raw_turns:
        same_role = bool(alternating) and alternating[-1]["from"] == turn["from"]
        if same_role and turn["from"] == "gpt":
            alternating[-1] = turn
        else:
            alternating.append(turn)

    return alternating
|
|
|
|
|
|
def load_state():
    """Load exporter state from STATE_FILE, or a fresh default if absent."""
    if not os.path.exists(STATE_FILE):
        return {"exported_sessions": [], "last_run": None, "total_turns": 0}
    with open(STATE_FILE) as fh:
        return json.load(fh)
|
|
|
|
|
|
def save_state(state):
    """Persist exporter *state* to STATE_FILE as pretty-printed JSON."""
    serialized = json.dumps(state, indent=2)
    with open(STATE_FILE, "w") as fh:
        fh.write(serialized)
|
|
|
|
|
|
def main():
    """Export every not-yet-exported session to one timestamped JSONL file.

    Reads session JSONL files from SESSIONS_DIR, skips sessions already
    recorded in STATE_FILE, writes new ShareGPT examples to OUTPUT_DIR,
    and updates the state file afterwards.
    """
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    state = load_state()

    # Fix: membership was tested against the ever-growing exported_sessions
    # *list* once per session (O(n) each). Snapshot it into a set for O(1)
    # lookups; state itself keeps the list so it stays JSON-serializable.
    already_exported = set(state["exported_sessions"])

    # ".reset." files are session snapshots, not real conversations.
    session_files = sorted(
        f for f in glob.glob(os.path.join(SESSIONS_DIR, "*.jsonl"))
        if ".reset." not in f
    )

    new_examples = []
    new_sessions = []
    total_new_turns = 0

    for sf in session_files:
        sid = os.path.basename(sf).replace(".jsonl", "")
        if sid in already_exported:
            continue

        turns = extract_transcript(sf)

        # Need at least 4 turns (2 exchanges) to be useful
        if len(turns) < 4:
            print(f" {sid[:8]}: skipped ({len(turns)} turns)")
            # Mark as exported so short sessions aren't re-scanned next run.
            state["exported_sessions"].append(sid)
            continue

        example = {
            "conversations": [{"from": "system", "value": SYSTEM_PROMPT}] + turns,
            "source": os.path.basename(sf),
            "turn_count": len(turns),
            "exported_at": datetime.now(timezone.utc).isoformat(),
        }
        new_examples.append(example)
        new_sessions.append(sid)
        total_new_turns += len(turns)
        print(f" {sid[:8]}: {len(turns)} turns ✓")

    if not new_examples:
        print("No new sessions to export.")
        # Still persist: short sessions above were marked exported in-place.
        state["exported_sessions"].extend(new_sessions)
        save_state(state)
        return

    timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
    output_file = os.path.join(OUTPUT_DIR, f"grace_transcript_{timestamp}.jsonl")

    with open(output_file, "w") as f:
        for ex in new_examples:
            f.write(json.dumps(ex) + "\n")

    state["exported_sessions"].extend(new_sessions)
    state["last_run"] = datetime.now(timezone.utc).isoformat()
    state["total_turns"] = state.get("total_turns", 0) + total_new_turns
    save_state(state)

    print(f"\nWrote {len(new_examples)} transcripts ({total_new_turns} turns) → {output_file}")
    print(f"Total turns to date: {state['total_turns']}")
|
|
|
|
|
|
# Script entry point: run the incremental export when invoked directly.
if __name__ == "__main__":
    main()
|