feat: initial import of all helper scripts from ~/scripts/
- Training data pipeline: convert, export, extract, load-to-db
- Infra tooling: infra-audit, infra-gitea-link
- RAG pipeline: rag-ingest, rag-query
- Fine-tuning: finetune-lora, overnight-qwen3, install-unsloth
- Transcripts: export-transcripts
- Updated README with script index and token reduction strategy
This commit is contained in:
237
export-transcripts.py
Executable file
237
export-transcripts.py
Executable file
@@ -0,0 +1,237 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Verbatim transcript exporter for Grace training data.
|
||||
|
||||
Reads all OpenClaw session JSONL files and saves clean, full-conversation
|
||||
transcripts to NFS. Strips metadata envelopes from user messages but
|
||||
preserves all content verbatim.
|
||||
|
||||
Output format: ShareGPT JSONL (system + alternating human/gpt turns)
|
||||
Storage: /mnt/ai-storage/grace/training-data/transcripts/
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import glob
|
||||
from datetime import datetime, timezone
|
||||
|
||||
# Source: one JSONL file per OpenClaw session.
SESSIONS_DIR = os.path.expanduser("~/.openclaw/agents/main/sessions/")
# Destination on NFS for the exported ShareGPT transcripts.
OUTPUT_DIR = "/mnt/ai-storage/grace/training-data/transcripts"
# Tracks already-exported session ids so repeat runs are incremental.
STATE_FILE = os.path.expanduser("~/self-improving/transcript-state.json")
|
||||
|
||||
# System prompt prepended to every exported conversation as the ShareGPT
# "system" turn (see main()). Runtime data — do not reword casually.
SYSTEM_PROMPT = (
    "You are Grace, a Culture Mind-class AI assistant and trusted companion for Maxwell Burton. "
    "You are warm, direct, witty, and proactive. You support Maxwell's ADHD executive function, "
    "manage his homelab, help with job searching, and operate local AI infrastructure. "
    "You speak plainly — no corporate pleasantries, no hedging. "
    "You use exec and local tools proactively and return real results. Never fabricate output."
)
|
||||
|
||||
# Noise to strip from user turns (metadata envelopes, not content)
NOISE_PATTERNS = [
    # Injected metadata envelopes: everything up to the closing "}" line.
    re.compile(r'Conversation info \(untrusted metadata\):.*?^\s*\}\s*\n', re.DOTALL | re.MULTILINE),
    re.compile(r'Sender \(untrusted metadata\):.*?^\s*\}\s*\n', re.DOTALL | re.MULTILINE),
    # Memory-retrieval block injected into the prompt.
    re.compile(r'<relevant-memories>.*?</relevant-memories>\s*\n?', re.DOTALL),
    # Media-attachment placeholder.
    re.compile(r'\[media attached:.*?\]\s*\n?'),
    # Runtime hint appended after image attachments.
    re.compile(r'To send an image back, prefer the message tool.*?\n', re.DOTALL),
    # Fenced JSON blocks whose object starts with a "schema" key.
    re.compile(r'```json\s*\{\s*"schema"\s*:.*?\}\s*```\s*\n?', re.DOTALL),
    # Quoted-reply metadata; matches through two closing braces (nested object).
    re.compile(r'Replied message \(untrusted.*?\}\s*\n\s*\}\s*\n', re.DOTALL),
    # Banners and separators around queued user messages.
    re.compile(r'\[Queued messages while agent was busy\]\s*\n', re.DOTALL),
    re.compile(r'---\s*\nQueued #\d+\s*\n'),
    # Empty fenced code blocks left behind after the strips above.
    re.compile(r'^```\s*\n```\s*\n?', re.MULTILINE),
]
|
||||
|
||||
# Full turns to skip entirely
SKIP_EXACT = {"HEARTBEAT_OK", "NO_REPLY"}
# User turns injected by the session runtime rather than typed by a person.
SKIP_STARTSWITH = (
    "A new session was started via /new or /reset.",
    "Read HEARTBEAT.md if it exists",
    "Run your Session Startup",
)

# Skip assistant turns that are just internal narration with no value
SKIP_ASST_EXACT = {"HEARTBEAT_OK", "NO_REPLY"}
SKIP_ASST_STARTSWITH = (
    "✅ New session started",
)
|
||||
|
||||
|
||||
def clean_user(text: str) -> str:
    """Strip metadata-envelope noise from a user turn, keeping real content."""
    cleaned = text
    for pattern in NOISE_PATTERNS:
        cleaned = pattern.sub("", cleaned)
    # Collapse runs of blank lines, then drop leading whitespace/backticks
    # left over from removed fenced blocks.
    cleaned = re.sub(r'\n{3,}', '\n\n', cleaned)
    cleaned = re.sub(r'^[\s`]+', '', cleaned)
    return cleaned.strip()
|
||||
|
||||
|
||||
def get_text(content: list) -> str:
    """Join the non-empty textual pieces of a message content list.

    Accepts a mix of plain strings and {"type": "text", "text": ...} blocks;
    anything else (tool results, images, ...) is ignored.
    """
    pieces = []
    for item in content:
        if isinstance(item, dict):
            if item.get("type") != "text":
                continue
            piece = item.get("text", "").strip()
        elif isinstance(item, str):
            piece = item.strip()
        else:
            continue
        if piece:
            pieces.append(piece)
    return "\n".join(pieces).strip()
|
||||
|
||||
|
||||
def is_tool_result_msg(content: list) -> bool:
    """True if this user message is a tool result, not a human turn."""
    for block in content:
        if not isinstance(block, dict):
            continue
        # Both spellings appear in session logs.
        if block.get("type") in ("toolResult", "tool_result"):
            return True
    return False
|
||||
|
||||
|
||||
def _turn_from_message(msg: dict):
    """Convert one parsed session record into a {from, value} turn, or None.

    Returns None for anything that is not a conversational turn: missing
    role/content, tool-result messages, noise-only text, and known
    runtime boilerplate (see SKIP_* constants).
    """
    role = msg.get("message", {}).get("role")
    content = msg.get("message", {}).get("content", [])
    if not role or not content:
        return None

    if role == "user":
        # Tool results arrive with role "user"; they are not human turns.
        if is_tool_result_msg(content):
            return None
        text = clean_user(get_text(content))
        if not text or text in SKIP_EXACT:
            return None
        if any(text.startswith(s) for s in SKIP_STARTSWITH):
            return None
        return {"from": "human", "value": text}

    if role == "assistant":
        text = get_text(content)
        if not text or text in SKIP_ASST_EXACT:
            return None
        if any(text.startswith(s) for s in SKIP_ASST_STARTSWITH):
            return None
        return {"from": "gpt", "value": text}

    # Other roles are not part of the exported transcript.
    return None


def _enforce_alternation(turns: list) -> list:
    """Collapse consecutive assistant turns, keeping the last one.

    The later assistant turn is assumed to be the more complete one.
    Consecutive human turns are all kept (queued messages).
    """
    clean = []
    for turn in turns:
        if clean and clean[-1]["from"] == turn["from"] == "gpt":
            # Replace with the later (more complete) assistant turn
            clean[-1] = turn
        else:
            clean.append(turn)
    return clean


def extract_transcript(path: str) -> list:
    """
    Extract full verbatim conversation as list of {from, value} dicts.
    Preserves all turns — doesn't filter by quality or length.
    Only removes metadata noise and skips tool result messages.
    """
    turns = []

    try:
        # Session files are JSONL; read as UTF-8 explicitly so decoding does
        # not depend on the locale's default encoding.
        with open(path, encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    msg = json.loads(line)
                except Exception:
                    # Tolerate an occasional corrupt line rather than
                    # discarding the whole session.
                    continue

                if msg.get("type") != "message":
                    continue

                turn = _turn_from_message(msg)
                if turn is not None:
                    turns.append(turn)

    except Exception as e:
        print(f" Error reading {path}: {e}")
        return []

    return _enforce_alternation(turns)
|
||||
|
||||
|
||||
def load_state():
    """Load export state from STATE_FILE, or return a fresh default.

    Returns a dict with keys: "exported_sessions" (list of session ids),
    "last_run" (ISO timestamp or None), "total_turns" (int).
    """
    # EAFP instead of exists()+open(): avoids the check/use race and does
    # a single filesystem operation. Other errors (permissions, bad JSON)
    # still propagate, same as before.
    try:
        with open(STATE_FILE, encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError:
        return {"exported_sessions": [], "last_run": None, "total_turns": 0}
|
||||
|
||||
|
||||
def save_state(state):
    """Persist the export-state dict to STATE_FILE as pretty-printed JSON."""
    serialized = json.dumps(state, indent=2)
    with open(STATE_FILE, "w") as f:
        f.write(serialized)
|
||||
|
||||
|
||||
def main():
    """Export all not-yet-exported sessions to one timestamped ShareGPT JSONL.

    Incremental: session ids recorded in STATE_FILE are skipped on later
    runs. Sessions with fewer than 4 usable turns are marked exported but
    never written.
    """
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    state = load_state()
    # Membership tests via a set: the exported list grows without bound, and
    # a list scan per session would make this loop quadratic over time.
    already_exported = set(state["exported_sessions"])

    session_files = sorted(
        f for f in glob.glob(os.path.join(SESSIONS_DIR, "*.jsonl"))
        if ".reset." not in f  # skip archived pre-reset copies
    )

    new_examples = []
    new_sessions = []
    total_new_turns = 0

    for sf in session_files:
        sid = os.path.basename(sf).replace(".jsonl", "")
        if sid in already_exported:
            continue

        turns = extract_transcript(sf)

        # Need at least 4 turns (2 exchanges) to be useful
        if len(turns) < 4:
            print(f" {sid[:8]}: skipped ({len(turns)} turns)")
            # Mark it exported anyway so short sessions aren't re-scanned.
            state["exported_sessions"].append(sid)
            continue

        new_examples.append({
            "conversations": [{"from": "system", "value": SYSTEM_PROMPT}] + turns,
            "source": os.path.basename(sf),
            "turn_count": len(turns),
            "exported_at": datetime.now(timezone.utc).isoformat(),
        })
        new_sessions.append(sid)
        total_new_turns += len(turns)
        print(f" {sid[:8]}: {len(turns)} turns ✓")

    if not new_examples:
        print("No new sessions to export.")
        # Still persist the short-session skips recorded above.
        state["exported_sessions"].extend(new_sessions)
        save_state(state)
        return

    timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
    output_file = os.path.join(OUTPUT_DIR, f"grace_transcript_{timestamp}.jsonl")

    # One JSON object per line (JSONL); explicit UTF-8 for portability.
    with open(output_file, "w", encoding="utf-8") as f:
        for ex in new_examples:
            f.write(json.dumps(ex) + "\n")

    state["exported_sessions"].extend(new_sessions)
    state["last_run"] = datetime.now(timezone.utc).isoformat()
    state["total_turns"] = state.get("total_turns", 0) + total_new_turns
    save_state(state)

    print(f"\nWrote {len(new_examples)} transcripts ({total_new_turns} turns) → {output_file}")
    print(f"Total turns to date: {state['total_turns']}")
|
||||
|
||||
|
||||
# Script entry point.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user