feat: initial import of all helper scripts from ~/scripts/

- Training data pipeline: convert, export, extract, load-to-db
- Infra tooling: infra-audit, infra-gitea-link
- RAG pipeline: rag-ingest, rag-query
- Fine-tuning: finetune-lora, overnight-qwen3, install-unsloth
- Transcripts: export-transcripts
- Updated README with script index and token reduction strategy
This commit is contained in:
Grace
2026-03-16 22:32:48 -07:00
parent 462f5298e6
commit 014ec8bd5c
15 changed files with 2979 additions and 1 deletions

237
export-transcripts.py Executable file
View File

@@ -0,0 +1,237 @@
#!/usr/bin/env python3
"""
Verbatim transcript exporter for Grace training data.
Reads all OpenClaw session JSONL files and saves clean, full-conversation
transcripts to NFS. Strips metadata envelopes from user messages but
preserves all content verbatim.
Output format: ShareGPT JSONL (system + alternating human/gpt turns)
Storage: /mnt/ai-storage/grace/training-data/transcripts/
"""
import json
import os
import re
import glob
from datetime import datetime, timezone
# Location of raw OpenClaw session logs (one JSONL file per session).
SESSIONS_DIR = os.path.expanduser("~/.openclaw/agents/main/sessions/")
# NFS destination directory for exported transcript batches.
OUTPUT_DIR = "/mnt/ai-storage/grace/training-data/transcripts"
# Incremental-run bookkeeping: which session IDs were already exported.
STATE_FILE = os.path.expanduser("~/self-improving/transcript-state.json")

# System turn prepended to every exported conversation (ShareGPT "system" role).
SYSTEM_PROMPT = (
    "You are Grace, a Culture Mind-class AI assistant and trusted companion for Maxwell Burton. "
    "You are warm, direct, witty, and proactive. You support Maxwell's ADHD executive function, "
    "manage his homelab, help with job searching, and operate local AI infrastructure. "
    "You speak plainly — no corporate pleasantries, no hedging. "
    "You use exec and local tools proactively and return real results. Never fabricate output."
)

# Noise to strip from user turns (metadata envelopes, not content)
NOISE_PATTERNS = [
    # "Conversation info" / "Sender" envelopes: match through the closing brace line.
    re.compile(r'Conversation info \(untrusted metadata\):.*?^\s*\}\s*\n', re.DOTALL | re.MULTILINE),
    re.compile(r'Sender \(untrusted metadata\):.*?^\s*\}\s*\n', re.DOTALL | re.MULTILINE),
    # <relevant-memories>...</relevant-memories> blocks, including inner newlines.
    re.compile(r'<relevant-memories>.*?</relevant-memories>\s*\n?', re.DOTALL),
    # "[media attached: ...]" placeholders (single-line: no DOTALL here).
    re.compile(r'\[media attached:.*?\]\s*\n?'),
    re.compile(r'To send an image back, prefer the message tool.*?\n', re.DOTALL),
    # Fenced ```json blocks whose object starts with a "schema" key.
    re.compile(r'```json\s*\{\s*"schema"\s*:.*?\}\s*```\s*\n?', re.DOTALL),
    # "Replied message (untrusted..." envelopes: match through two closing braces.
    re.compile(r'Replied message \(untrusted.*?\}\s*\n\s*\}\s*\n', re.DOTALL),
    # Queue markers inserted when messages arrived while the agent was busy.
    re.compile(r'\[Queued messages while agent was busy\]\s*\n', re.DOTALL),
    re.compile(r'---\s*\nQueued #\d+\s*\n'),
    # Empty fenced code blocks left behind after other strips.
    re.compile(r'^```\s*\n```\s*\n?', re.MULTILINE),
]

# Full turns to skip entirely (housekeeping strings, not dialogue).
SKIP_EXACT = {"HEARTBEAT_OK", "NO_REPLY"}
SKIP_STARTSWITH = (
    "A new session was started via /new or /reset.",
    "Read HEARTBEAT.md if it exists",
    "Run your Session Startup",
)

# Skip assistant turns that are just internal narration with no value
SKIP_ASST_EXACT = {"HEARTBEAT_OK", "NO_REPLY"}
SKIP_ASST_STARTSWITH = (
    "✅ New session started",
)
def clean_user(text: str) -> str:
    """Remove metadata-envelope noise from a user turn, leaving content verbatim."""
    cleaned = text
    for pattern in NOISE_PATTERNS:
        cleaned = pattern.sub("", cleaned)
    # Collapse the runs of blank lines that the removals leave behind.
    cleaned = re.sub(r'\n{3,}', '\n\n', cleaned)
    # Drop leading whitespace/backtick residue from stripped fences.
    cleaned = re.sub(r'^[\s`]+', '', cleaned)
    return cleaned.strip()
def get_text(content: list) -> str:
    """Join the text payloads of a content-block list into one stripped string.

    Accepts both dict blocks ({"type": "text", "text": ...}) and bare strings;
    all other block shapes are ignored.
    """
    pieces: list = []
    for item in content:
        if isinstance(item, dict):
            if item.get("type") == "text":
                chunk = item.get("text", "").strip()
                if chunk:
                    pieces.append(chunk)
        elif isinstance(item, str):
            chunk = item.strip()
            if chunk:
                pieces.append(chunk)
    return "\n".join(pieces).strip()
def is_tool_result_msg(content: list) -> bool:
    """True if this user message is a tool result, not a human turn."""
    for block in content:
        if not isinstance(block, dict):
            continue
        if block.get("type") in ("toolResult", "tool_result"):
            return True
    return False
def _enforce_alternation(turns: list) -> list:
    """Collapse consecutive same-role turns so the transcript alternates.

    Consecutive assistant ("gpt") turns are replaced by the later one, which
    is taken to be the more complete response. Consecutive human turns are
    all kept — they represent genuinely separate queued messages.
    """
    ordered: list = []
    for turn in turns:
        if ordered and ordered[-1]["from"] == turn["from"]:
            if turn["from"] == "gpt":
                # Replace with the later (more complete) assistant turn.
                ordered[-1] = turn
            else:
                # For consecutive user turns, keep both (queued messages).
                ordered.append(turn)
        else:
            ordered.append(turn)
    return ordered


def extract_transcript(path: str) -> list:
    """
    Extract full verbatim conversation as list of {from, value} dicts.

    Preserves all turns — doesn't filter by quality or length. Only removes
    metadata noise and skips tool-result messages. Returns [] if the session
    file cannot be read at all; individual malformed JSONL lines are skipped.
    """
    turns: list = []
    try:
        # Session logs are JSONL; read as UTF-8 explicitly so behavior does
        # not depend on the locale encoding (cron/systemd environments).
        with open(path, encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    msg = json.loads(line)
                except Exception:
                    # Tolerate truncated/corrupt lines rather than losing the session.
                    continue
                if msg.get("type") != "message":
                    continue
                payload = msg.get("message", {})
                role = payload.get("role")
                content = payload.get("content", [])
                if not role or not content:
                    continue
                if role == "user":
                    # Skip tool result messages — they are not human turns.
                    if is_tool_result_msg(content):
                        continue
                    text = clean_user(get_text(content))
                    if not text or text in SKIP_EXACT:
                        continue
                    if any(text.startswith(s) for s in SKIP_STARTSWITH):
                        continue
                    turns.append({"from": "human", "value": text})
                elif role == "assistant":
                    text = get_text(content)
                    if not text or text in SKIP_ASST_EXACT:
                        continue
                    if any(text.startswith(s) for s in SKIP_ASST_STARTSWITH):
                        continue
                    turns.append({"from": "gpt", "value": text})
    except Exception as e:
        print(f" Error reading {path}: {e}")
        return []
    return _enforce_alternation(turns)
def load_state():
    """Return the persisted export state, or a fresh default when none exists."""
    try:
        with open(STATE_FILE) as f:
            return json.load(f)
    except FileNotFoundError:
        # First run (or state wiped): start from an empty export history.
        return {"exported_sessions": [], "last_run": None, "total_turns": 0}
def save_state(state) -> None:
    """Persist the export state to STATE_FILE as pretty-printed JSON."""
    serialized = json.dumps(state, indent=2)
    with open(STATE_FILE, "w") as handle:
        handle.write(serialized)
def main():
    """Export every not-yet-exported session as a ShareGPT transcript batch.

    Sessions shorter than four turns are marked exported but skipped; state
    is persisted even when nothing new was written.
    """
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    state = load_state()

    # All session logs, oldest-first by filename; ".reset." copies excluded.
    session_files = sorted(
        path for path in glob.glob(os.path.join(SESSIONS_DIR, "*.jsonl"))
        if ".reset." not in path
    )

    new_examples: list = []
    new_sessions: list = []
    total_new_turns = 0

    for sf in session_files:
        sid = os.path.basename(sf).replace(".jsonl", "")
        if sid in state["exported_sessions"]:
            continue
        turns = extract_transcript(sf)
        if len(turns) < 4:
            # Need at least 4 turns (2 exchanges) to be useful; mark done anyway.
            print(f" {sid[:8]}: skipped ({len(turns)} turns)")
            state["exported_sessions"].append(sid)
            continue
        new_examples.append({
            "conversations": [{"from": "system", "value": SYSTEM_PROMPT}] + turns,
            "source": os.path.basename(sf),
            "turn_count": len(turns),
            "exported_at": datetime.now(timezone.utc).isoformat(),
        })
        new_sessions.append(sid)
        total_new_turns += len(turns)
        print(f" {sid[:8]}: {len(turns)} turns ✓")

    if not new_examples:
        print("No new sessions to export.")
        state["exported_sessions"].extend(new_sessions)
        save_state(state)
        return

    timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
    output_file = os.path.join(OUTPUT_DIR, f"grace_transcript_{timestamp}.jsonl")
    with open(output_file, "w") as f:
        f.writelines(json.dumps(ex) + "\n" for ex in new_examples)

    state["exported_sessions"].extend(new_sessions)
    state["last_run"] = datetime.now(timezone.utc).isoformat()
    state["total_turns"] = state.get("total_turns", 0) + total_new_turns
    save_state(state)

    print(f"\nWrote {len(new_examples)} transcripts ({total_new_turns} turns) → {output_file}")
    print(f"Total turns to date: {state['total_turns']}")


if __name__ == "__main__":
    main()