feat: initial import of all helper scripts from ~/scripts/

- Training data pipeline: convert, export, extract, load-to-db
- Infra tooling: infra-audit, infra-gitea-link
- RAG pipeline: rag-ingest, rag-query
- Fine-tuning: finetune-lora, overnight-qwen3, install-unsloth
- Transcripts: export-transcripts
- Updated README with script index and token reduction strategy
This commit is contained in:
Grace
2026-03-16 22:32:48 -07:00
parent 462f5298e6
commit 014ec8bd5c
15 changed files with 2979 additions and 1 deletions

240
convert-training-data.py Executable file
View File

@@ -0,0 +1,240 @@
#!/usr/bin/env python3
"""
Convert raw Grace session exports to clean unsloth/axolotl-ready JSONL.
Input: ~/training-data/jsonl/grace_training_*.jsonl (ShareGPT format, raw)
Output: ~/training-data/cleaned/grace_clean_YYYYMMDD.jsonl (clean ShareGPT)
~/training-data/dpo/grace_dpo_YYYYMMDD.jsonl (DPO pairs, if any)
Storage: copies to grace@192.168.20.87:~/training-data/cleaned/
"""
import json
import os
import re
import glob
import subprocess
from datetime import datetime, timezone
# All paths live on shared NFS storage, so output written here is immediately
# visible to every machine in the lab — no copy step required (see scp_file).
NFS_BASE = "/mnt/ai-storage/grace/training-data"
INPUT_DIR = os.path.join(NFS_BASE, "jsonl")    # raw ShareGPT exports (input)
CLEAN_DIR = os.path.join(NFS_BASE, "cleaned")  # cleaned SFT examples (output)
DPO_DIR = os.path.join(NFS_BASE, "dpo")        # DPO preference pairs (output)
# Records which input files were already converted, so re-runs are idempotent.
STATE_FILE = os.path.expanduser("~/self-improving/convert-state.json")
# Noise patterns to strip from user turns
NOISE_PATTERNS = [
    # Metadata blocks that run until a closing "}" on its own line.
    re.compile(r'Conversation info \(untrusted metadata\):.*?^\s*\}\s*\n', re.DOTALL | re.MULTILINE),
    re.compile(r'Sender \(untrusted metadata\):.*?^\s*\}\s*\n', re.DOTALL | re.MULTILINE),
    # Injected memory-retrieval context.
    re.compile(r'<relevant-memories>.*?</relevant-memories>', re.DOTALL),
    # Attachment placeholders.
    re.compile(r'\[media attached:.*?\]'),
    # Fenced JSON blobs containing a "schema" key.
    re.compile(r'```json\s*\{.*?"schema".*?\}\s*```', re.DOTALL),
]
# Minimum quality thresholds
MIN_TURNS = 2  # minimum user/assistant exchanges
MIN_ASSISTANT_CHARS = 80  # skip very short assistant replies
MAX_ASSISTANT_CHARS = 8000  # skip extremely long tool-dump responses
# Strings that indicate a low-quality assistant turn
BAD_PATTERNS = [
    "HEARTBEAT_OK", "NO_REPLY",
    "<tool_call>",  # raw tool call leaked into response
    "Let me check",  # placeholder with no follow-through
]
# Canonical system prompt prepended to every cleaned training example.
SYSTEM_PROMPT = (
    "You are Grace, a Culture Mind-class AI assistant and trusted companion for Maxwell. "
    "You are warm, direct, witty, and proactive. You support Maxwell's ADHD executive function, "
    "manage his homelab, help with job searching, and operate local AI infrastructure. "
    "You speak plainly — no corporate pleasantries, no hedging. "
    "You use exec and local tools proactively and return real results. Never fabricate output."
)
def load_state() -> dict:
    """Load conversion progress from STATE_FILE.

    Returns:
        The persisted state dict ({"converted_files", "total_clean",
        "total_dpo"}), or a fresh default on the first run.
    """
    # EAFP: attempt the read and handle FileNotFoundError, instead of the
    # original exists()-then-open pattern, which is two lookups and races
    # if the file disappears between the check and the open.
    try:
        with open(STATE_FILE) as f:
            return json.load(f)
    except FileNotFoundError:
        return {"converted_files": [], "total_clean": 0, "total_dpo": 0}
def save_state(state: dict) -> None:
    """Persist the conversion-progress dict to STATE_FILE as indented JSON."""
    with open(STATE_FILE, "w") as fh:
        fh.write(json.dumps(state, indent=2))
def clean_text(text: str) -> str:
    """Return *text* with known metadata noise stripped and whitespace tidied."""
    cleaned = text
    for noise in NOISE_PATTERNS:
        cleaned = noise.sub("", cleaned)
    # Squash runs of three-or-more newlines down to a single blank line,
    # then trim leading/trailing whitespace.
    cleaned = re.sub(r'\n{3,}', '\n\n', cleaned)
    return cleaned.strip()
def is_good_assistant_turn(text: str) -> bool:
    """Quality gate for an assistant reply.

    A turn passes when its length lies within
    [MIN_ASSISTANT_CHARS, MAX_ASSISTANT_CHARS] and it contains none of the
    low-quality markers in BAD_PATTERNS.
    """
    if not (MIN_ASSISTANT_CHARS <= len(text) <= MAX_ASSISTANT_CHARS):
        return False
    return not any(marker in text for marker in BAD_PATTERNS)
def extract_dpo_pairs(conversations: list) -> list:
    """Mine DPO preference pairs from correction moments in a conversation.

    A pair is found at the pattern: assistant turn (rejected), then a user
    turn containing a disagreement signal, then a better assistant turn
    (chosen). The preceding user turn, if any, becomes the prompt.

    Returns a list of {"prompt", "chosen", "rejected", "context"} dicts.
    """
    signals = ("no,", "actually", "that's wrong", "you should have",
               "remember that", "i told you", "stop doing", "wrong")
    # Keep only human/gpt turns so indexing reflects the actual exchange.
    turns = [t for t in conversations if t["from"] in ("human", "gpt")]
    pairs = []
    # Interior turns only: a correction needs a neighbor on each side.
    for idx in range(1, len(turns) - 1):
        turn = turns[idx]
        if turn["from"] != "human":
            continue
        lowered = turn["value"].lower()
        if all(sig not in lowered for sig in signals):
            continue
        prev_turn, next_turn = turns[idx - 1], turns[idx + 1]
        if prev_turn["from"] != "gpt" or next_turn["from"] != "gpt":
            continue
        rejected = prev_turn["value"]
        chosen = next_turn["value"]
        # Only worth keeping when the replacement is substantive and
        # actually differs from what it replaces.
        if len(chosen) <= 50 or chosen == rejected:
            continue
        pairs.append({
            "prompt": turns[idx - 2]["value"] if idx >= 2 else "",
            "chosen": chosen,
            "rejected": rejected,
            "context": turn["value"],  # the correction itself
        })
    return pairs
def process_file(path: str):
    """
    Process one raw export JSONL file.

    Each line is expected to be a ShareGPT-style record:
    {"conversations": [{"from": ..., "value": ...}, ...]}.
    Pipeline per record: drop the original system turn, clean human turns,
    quality-filter assistant turns, keep only strict human→gpt pairs, then
    attach the canonical SYSTEM_PROMPT.

    Returns (clean_examples, dpo_pairs).
    """
    clean_examples = []
    dpo_pairs = []
    with open(path) as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                example = json.loads(line)
            except json.JSONDecodeError:
                # Best-effort ingest: one corrupt line must not abort the file.
                continue
            raw_convs = example.get("conversations", [])
            # Separate system prompt from turns (original system turn is
            # discarded; SYSTEM_PROMPT is re-attached below).
            turns = [c for c in raw_convs if c.get("from") != "system"]
            # Clean user turns; drop turns that become trivially short (<5
            # chars) after noise stripping, and assistant turns that fail
            # the quality gate.
            cleaned = []
            for turn in turns:
                if turn["from"] == "human":
                    text = clean_text(turn["value"])
                    if len(text) < 5:
                        continue
                    cleaned.append({"from": "human", "value": text})
                elif turn["from"] == "gpt":
                    text = turn["value"].strip()
                    if not is_good_assistant_turn(text):
                        continue
                    cleaned.append({"from": "gpt", "value": text})
            # Build valid human/gpt pairs only: walk the cleaned turns and
            # keep strictly alternating human→gpt adjacencies, skipping any
            # orphaned turn left behind by the filters above.
            valid_turns = []
            i = 0
            while i < len(cleaned) - 1:
                if cleaned[i]["from"] == "human" and cleaned[i+1]["from"] == "gpt":
                    valid_turns.append(cleaned[i])
                    valid_turns.append(cleaned[i+1])
                    i += 2
                else:
                    i += 1
            # MIN_TURNS counts exchanges; valid_turns holds 2 entries each.
            if len(valid_turns) < MIN_TURNS * 2:
                continue
            clean_examples.append({
                "conversations": [{"from": "system", "value": SYSTEM_PROMPT}] + valid_turns,
                "source": os.path.basename(path),
                "converted_at": datetime.now(timezone.utc).isoformat(),
            })
            # Extract DPO pairs from this example.
            # NOTE(review): this runs on the quality-FILTERED turns, so a
            # "rejected" assistant reply that failed is_good_assistant_turn
            # (or its correction turn) may already be gone — some correction
            # pairs are silently lost. Confirm whether raw turns were meant.
            dpo_pairs.extend(extract_dpo_pairs(valid_turns))
    return clean_examples, dpo_pairs
def scp_file(local_path: str, remote_subdir: str):
    """Legacy transfer hook, now a logging no-op.

    Output is written directly to shared NFS, so there is nothing left to
    copy. *remote_subdir* is unused but retained so existing call sites
    keep working.
    """
    message = f"  Written to NFS: {local_path}"
    print(message)
def main():
    """Convert any not-yet-processed raw JSONL exports into clean SFT and
    DPO files, then record progress in STATE_FILE so re-runs skip them."""
    os.makedirs(CLEAN_DIR, exist_ok=True)
    os.makedirs(DPO_DIR, exist_ok=True)
    state = load_state()
    input_files = sorted(glob.glob(os.path.join(INPUT_DIR, "*.jsonl")))
    # Idempotency: only files not already listed in the saved state.
    new_files = [f for f in input_files if os.path.basename(f) not in state["converted_files"]]
    if not new_files:
        print("No new files to convert.")
        return
    all_clean = []
    all_dpo = []
    for f in new_files:
        print(f"Processing: {os.path.basename(f)}")
        clean, dpo = process_file(f)
        print(f"{len(clean)} clean examples, {len(dpo)} DPO pairs")
        all_clean.extend(clean)
        all_dpo.extend(dpo)
        # Mark converted immediately; state is only persisted at the end,
        # so a crash mid-run reprocesses everything next time.
        state["converted_files"].append(os.path.basename(f))
    timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
    if all_clean:
        clean_file = os.path.join(CLEAN_DIR, f"grace_clean_{timestamp}.jsonl")
        # NOTE: `f` here shadows the loop variable above — harmless, since
        # the loop has finished, but worth renaming eventually.
        with open(clean_file, "w") as f:
            for ex in all_clean:
                f.write(json.dumps(ex) + "\n")
        print(f"\nWrote {len(all_clean)} clean examples → {clean_file}")
        scp_file(clean_file, "cleaned")
        state["total_clean"] = state.get("total_clean", 0) + len(all_clean)
    if all_dpo:
        dpo_file = os.path.join(DPO_DIR, f"grace_dpo_{timestamp}.jsonl")
        with open(dpo_file, "w") as f:
            for pair in all_dpo:
                f.write(json.dumps(pair) + "\n")
        print(f"Wrote {len(all_dpo)} DPO pairs → {dpo_file}")
        scp_file(dpo_file, "dpo")
        state["total_dpo"] = state.get("total_dpo", 0) + len(all_dpo)
    save_state(state)
    print(f"\nTotals to date: {state['total_clean']} clean, {state['total_dpo']} DPO pairs")
if __name__ == "__main__":
    main()