feat: initial import of all helper scripts from ~/scripts/

- Training data pipeline: convert, export, extract, load-to-db
- Infra tooling: infra-audit, infra-gitea-link
- RAG pipeline: rag-ingest, rag-query
- Fine-tuning: finetune-lora, overnight-qwen3, install-unsloth
- Transcripts: export-transcripts
- Updated README with script index and token reduction strategy
This commit is contained in:
Grace
2026-03-16 22:32:48 -07:00
parent 462f5298e6
commit 014ec8bd5c
15 changed files with 2979 additions and 1 deletions

180
export-training-data.py Executable file
View File

@@ -0,0 +1,180 @@
#!/usr/bin/env python3
"""
Export OpenClaw session JSONL files to training data format.
Uses session-logs skill structure to read raw sessions.
Outputs ShareGPT format JSONL for axolotl/unsloth fine-tuning.
Storage: grace@192.168.20.87:~/training-data/jsonl/
"""
import glob
import json
import os
import subprocess
from datetime import datetime, timezone
# Raw OpenClaw session logs, one JSONL file per session.
SESSIONS_DIR = os.path.expanduser("~/.openclaw/agents/main/sessions/")
# Destination for exported ShareGPT-format JSONL files (mounted AI storage).
OUTPUT_DIR = "/mnt/ai-storage/grace/training-data/jsonl"
# Records which session IDs were already exported, so reruns are incremental.
STATE_FILE = os.path.expanduser("~/self-improving/export-state.json")
# System turn prepended to every exported training example.
SYSTEM_PROMPT = """You are Grace, a Culture Mind-class AI assistant and cognitive partner for Maxwell. You are warm, direct, witty, and proactive. You support Maxwell's ADHD executive function, manage his homelab, track job search tasks, and operate local AI infrastructure. You speak plainly — no corporate pleasantries, no hedging. You use tools proactively and report real results."""
def load_state():
    """Load the persisted export state, or return a fresh default.

    Returns a dict with keys: "exported_sessions" (list of session IDs),
    "last_run" (ISO timestamp or None), "total_examples" (int).
    """
    # No state file yet → first run, start from an empty state.
    if not os.path.exists(STATE_FILE):
        return {"exported_sessions": [], "last_run": None, "total_examples": 0}
    with open(STATE_FILE) as fh:
        return json.load(fh)
def save_state(state):
    """Persist *state* to STATE_FILE as pretty-printed JSON."""
    serialized = json.dumps(state, indent=2)
    with open(STATE_FILE, "w") as fh:
        fh.write(serialized)
def extract_text_from_content(content):
    """
    Extract displayable text from a content block list.
    Handles: text blocks and bare strings; ignores thinking/toolCall/toolResult.
    Returns the visible text joined with single spaces, stripped.
    """
    pieces = []
    for item in content:
        if isinstance(item, str):
            # Some sessions carry plain strings instead of block dicts.
            pieces.append(item)
        elif isinstance(item, dict) and item.get("type", "") == "text":
            pieces.append(item.get("text", ""))
        # Everything else (thinking, toolCall, toolResult, non-dict junk)
        # is internal plumbing — dropped.
    return " ".join(pieces).strip()
def extract_conversations(jsonl_path):
    """
    Extract clean user/assistant turn pairs from a session JSONL file.

    Strategy: collect all messages in order. For each user turn with text,
    look ahead to find the next assistant turn that has a text block
    (the final reply after any tool calls). Pair them.

    Returns a flat list of ShareGPT-style dicts alternating
    {"from": "human", ...} / {"from": "gpt", ...}; empty list on read error.
    """
    messages = []
    try:
        with open(jsonl_path) as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    msg = json.loads(line)
                except json.JSONDecodeError:
                    # Was a bare `except:` — narrowed so it only skips
                    # corrupt/truncated log lines, not real errors.
                    continue
                if msg.get("type") != "message":
                    continue
                role = msg.get("message", {}).get("role")
                content = msg.get("message", {}).get("content", [])
                if not role or not content:
                    continue
                text = extract_text_from_content(content)
                messages.append({"role": role, "text": text})
    except Exception as e:
        # Best-effort: one unreadable session must not abort the export run.
        print(f"  Error reading {jsonl_path}: {e}")
        return []
    # Build user→assistant pairs
    conversations = []
    i = 0
    while i < len(messages):
        msg = messages[i]
        # Find a user turn with real text
        if msg["role"] == "user":
            user_text = msg["text"]
            # Automation chatter (heartbeats, queue notices) would teach noise.
            skip_phrases = ("HEARTBEAT_OK", "NO_REPLY", "Read HEARTBEAT.md",
                            "A new session was started", "[Queued messages")
            if len(user_text) < 15 or user_text.startswith(skip_phrases):
                i += 1
                continue
            # Look ahead for the next assistant turn with substantive text
            j = i + 1
            while j < len(messages):
                if messages[j]["role"] == "assistant" and len(messages[j]["text"]) > 30:
                    asst_text = messages[j]["text"]
                    bad = ("HEARTBEAT_OK", "NO_REPLY")
                    if not asst_text.startswith(bad):
                        conversations.append({"from": "human", "value": user_text})
                        conversations.append({"from": "gpt", "value": asst_text})
                        # Resume scanning after the paired assistant turn.
                        i = j + 1
                        break
                j += 1
            else:
                # while-else: no usable assistant reply found; move past user turn.
                i += 1
        else:
            i += 1
    return conversations
def session_to_example(jsonl_path):
    """
    Convert one session file into a single ShareGPT training example.

    Returns a dict with "conversations" (system turn + extracted pairs),
    "source" (session filename) and "exported_at" (UTC ISO timestamp),
    or None when the session yields fewer than one human/gpt pair.
    """
    convs = extract_conversations(jsonl_path)
    if len(convs) < 2:
        # Need at least one full human/gpt exchange to be worth keeping.
        return None
    return {
        "conversations": [{"from": "system", "value": SYSTEM_PROMPT}] + convs,
        "source": os.path.basename(jsonl_path),
        # datetime.utcnow() is deprecated (3.12+); use an aware UTC timestamp.
        "exported_at": datetime.now(timezone.utc).isoformat()
    }
def main():
    """
    Incrementally export new sessions to a timestamped ShareGPT JSONL file.

    Skips sessions already recorded in the state file, converts each new
    session via session_to_example, writes one JSON object per line to
    OUTPUT_DIR, then updates and persists the state.
    """
    state = load_state()
    # Ensure the destination exists once, up front. (A redundant second
    # makedirs call before the write was removed.)
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    # ".reset." files are truncated session remnants — not exportable.
    session_files = sorted([
        f for f in glob.glob(os.path.join(SESSIONS_DIR, "*.jsonl"))
        if ".reset." not in f
    ])
    new_examples = []
    new_sessions = []
    for sf in session_files:
        sid = os.path.basename(sf).replace(".jsonl", "")
        if sid in state["exported_sessions"]:
            continue
        print(f"Processing: {sid}")
        example = session_to_example(sf)
        # Require >= 4 conversation entries: the system turn plus at least
        # two human/gpt pairs (pairs are always appended together).
        if example and len(example["conversations"]) >= 4:
            turns = (len(example["conversations"]) - 1) // 2
            print(f"{turns} turns")
            new_examples.append(example)
            new_sessions.append(sid)
        else:
            print(f"  → skipped (too short)")
    if not new_examples:
        print("No new sessions to export.")
        return
    # datetime.utcnow() is deprecated (3.12+); aware UTC gives the same
    # strftime output for this filename pattern.
    timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
    output_file = os.path.join(OUTPUT_DIR, f"grace_training_{timestamp}.jsonl")
    with open(output_file, "w") as f:
        for ex in new_examples:
            f.write(json.dumps(ex) + "\n")
    # Record what was exported so the next run is incremental.
    state["exported_sessions"].extend(new_sessions)
    state["last_run"] = datetime.now(timezone.utc).isoformat()
    state["total_examples"] = state.get("total_examples", 0) + len(new_examples)
    save_state(state)
    print(f"\nWrote {len(new_examples)} examples → {output_file}")
    print(f"Total examples to date: {state['total_examples']}")
# Script entry point: run the incremental export when executed directly.
if __name__ == "__main__":
    main()