feat: initial import of all helper scripts from ~/scripts/
- Training data pipeline: convert, export, extract, load-to-db
- Infra tooling: infra-audit, infra-gitea-link
- RAG pipeline: rag-ingest, rag-query
- Fine-tuning: finetune-lora, overnight-qwen3, install-unsloth
- Transcripts: export-transcripts
- Updated README with script index and token-reduction strategy
This commit is contained in:
240
convert-training-data.py
Executable file
240
convert-training-data.py
Executable file
@@ -0,0 +1,240 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Convert raw Grace session exports to clean unsloth/axolotl-ready JSONL.
|
||||
|
||||
Input: ~/training-data/jsonl/grace_training_*.jsonl (ShareGPT format, raw)
|
||||
Output: ~/training-data/cleaned/grace_clean_YYYYMMDD.jsonl (clean ShareGPT)
|
||||
~/training-data/dpo/grace_dpo_YYYYMMDD.jsonl (DPO pairs, if any)
|
||||
|
||||
Storage: copies to grace@192.168.20.87:~/training-data/cleaned/
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import glob
|
||||
import subprocess
|
||||
from datetime import datetime, timezone
|
||||
|
||||
# All training data lives on the NFS mount shared with the GPU host.
NFS_BASE = "/mnt/ai-storage/grace/training-data"
INPUT_DIR = os.path.join(NFS_BASE, "jsonl")    # raw ShareGPT exports (input)
CLEAN_DIR = os.path.join(NFS_BASE, "cleaned")  # cleaned SFT output
DPO_DIR = os.path.join(NFS_BASE, "dpo")        # DPO pair output
# Tracks which input files were already converted, plus running totals.
STATE_FILE = os.path.expanduser("~/self-improving/convert-state.json")

# Noise patterns to strip from user turns
NOISE_PATTERNS = [
    # Injected metadata blocks terminated by a closing brace on its own line.
    re.compile(r'Conversation info \(untrusted metadata\):.*?^\s*\}\s*\n', re.DOTALL | re.MULTILINE),
    re.compile(r'Sender \(untrusted metadata\):.*?^\s*\}\s*\n', re.DOTALL | re.MULTILINE),
    # Retrieved-memory context injected into the prompt.
    re.compile(r'<relevant-memories>.*?</relevant-memories>', re.DOTALL),
    # Attachment placeholders.
    re.compile(r'\[media attached:.*?\]'),
    # Fenced JSON blobs containing a "schema" key.
    re.compile(r'```json\s*\{.*?"schema".*?\}\s*```', re.DOTALL),
]

# Minimum quality thresholds
MIN_TURNS = 2  # minimum user/assistant exchanges
MIN_ASSISTANT_CHARS = 80  # skip very short assistant replies
MAX_ASSISTANT_CHARS = 8000  # skip extremely long tool-dump responses

# Strings that indicate a low-quality assistant turn
BAD_PATTERNS = [
    "HEARTBEAT_OK", "NO_REPLY",
    "<tool_call>",  # raw tool call leaked into response
    "Let me check",  # placeholder with no follow-through
]

# System prompt injected as the first turn of every cleaned example
# (any system turn present in the raw export is discarded in process_file).
SYSTEM_PROMPT = (
    "You are Grace, a Culture Mind-class AI assistant and trusted companion for Maxwell. "
    "You are warm, direct, witty, and proactive. You support Maxwell's ADHD executive function, "
    "manage his homelab, help with job searching, and operate local AI infrastructure. "
    "You speak plainly — no corporate pleasantries, no hedging. "
    "You use exec and local tools proactively and return real results. Never fabricate output."
)
|
||||
|
||||
|
||||
def load_state():
    """Load the persisted conversion state, or return a fresh default."""
    try:
        with open(STATE_FILE) as fh:
            return json.load(fh)
    except FileNotFoundError:
        # First run: nothing converted yet.
        return {"converted_files": [], "total_clean": 0, "total_dpo": 0}
|
||||
|
||||
|
||||
def save_state(state):
    """Persist the conversion state to STATE_FILE as indented JSON."""
    payload = json.dumps(state, indent=2)
    with open(STATE_FILE, "w") as fh:
        fh.write(payload)
|
||||
|
||||
|
||||
def clean_text(text: str) -> str:
    """Strip metadata noise from a user turn and tidy residual whitespace."""
    stripped = text
    for noise in NOISE_PATTERNS:
        stripped = noise.sub("", stripped)
    # Squeeze runs of 3+ newlines down to one blank line, trim the edges.
    return re.sub(r'\n{3,}', '\n\n', stripped).strip()
|
||||
|
||||
|
||||
def is_good_assistant_turn(text: str) -> bool:
    """Return True when an assistant reply passes length and content filters."""
    length = len(text)
    if not (MIN_ASSISTANT_CHARS <= length <= MAX_ASSISTANT_CHARS):
        return False
    # Reject turns containing any known low-quality marker.
    return not any(marker in text for marker in BAD_PATTERNS)
|
||||
|
||||
|
||||
def extract_dpo_pairs(conversations: list) -> list:
    """Mine DPO (chosen/rejected) pairs from correction exchanges.

    A correction pair is the three-turn sequence:
        gpt turn (rejected) -> human turn containing a correction signal
        -> gpt turn (chosen)
    The human turn two positions before the correction (if any) is used
    as the prompt, otherwise the prompt is "".

    Signals: "no,", "actually", "that's wrong", "you should", "remember", ...

    Returns a list of dicts with keys "prompt", "chosen", "rejected" and
    "context" (the correction message itself).
    """
    correction_signals = ["no,", "actually", "that's wrong", "you should have",
                          "remember that", "i told you", "stop doing", "wrong"]
    pairs = []
    # Keep only real dialogue turns; .get tolerates malformed entries that
    # lack a "from" key (the old c["from"] raised KeyError on those).
    convs = [c for c in conversations if c.get("from") in ("human", "gpt")]

    # i indexes the candidate correction (human) turn. The loop bounds
    # guarantee convs[i-1] and convs[i+1] exist, so the previous
    # `if i < 1` / `if i + 1 >= len(convs)` guards were dead code.
    for i in range(1, len(convs) - 1):
        if convs[i]["from"] != "human":
            continue
        user_text = convs[i]["value"].lower()
        if not any(sig in user_text for sig in correction_signals):
            continue
        if convs[i-1]["from"] != "gpt" or convs[i+1]["from"] != "gpt":
            continue

        rejected = convs[i-1]["value"]
        chosen = convs[i+1]["value"]

        # Only worth it if the chosen is meaningfully different
        if len(chosen) > 50 and chosen != rejected:
            pairs.append({
                "prompt": convs[i-2]["value"] if i >= 2 else "",
                "chosen": chosen,
                "rejected": rejected,
                "context": convs[i]["value"],  # the correction itself
            })
    return pairs
|
||||
|
||||
|
||||
def process_file(path: str):
    """Convert one raw export JSONL file into training examples.

    Each input line is a JSON object with a ShareGPT-style
    "conversations" list. User turns are noise-stripped, assistant turns
    are quality-filtered, and only strict human->gpt pairs are kept.
    Examples with fewer than MIN_TURNS exchanges are dropped.

    Returns (clean_examples, dpo_pairs).
    """
    clean_examples = []
    dpo_pairs = []

    # Explicit encoding: exports are JSON text; don't depend on the
    # platform's locale default.
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                example = json.loads(line)
            except json.JSONDecodeError:
                continue  # tolerate truncated/corrupt lines

            raw_convs = example.get("conversations", [])

            # Drop any embedded system prompt; we inject our own below.
            turns = [c for c in raw_convs if c.get("from") != "system"]

            # Clean user turns; filter assistant turns for quality.
            cleaned = []
            for turn in turns:
                # .get avoids KeyError on malformed entries missing "from"
                # (the rest of this function already used .get defensively).
                role = turn.get("from")
                if role == "human":
                    text = clean_text(turn["value"])
                    if len(text) < 5:
                        continue  # nothing useful left after stripping
                    cleaned.append({"from": "human", "value": text})
                elif role == "gpt":
                    text = turn["value"].strip()
                    if not is_good_assistant_turn(text):
                        continue
                    cleaned.append({"from": "gpt", "value": text})

            # Build valid human/gpt pairs only (strict alternation).
            valid_turns = []
            i = 0
            while i < len(cleaned) - 1:
                if cleaned[i]["from"] == "human" and cleaned[i+1]["from"] == "gpt":
                    valid_turns.append(cleaned[i])
                    valid_turns.append(cleaned[i+1])
                    i += 2
                else:
                    i += 1

            # Each exchange is 2 turns, hence the * 2.
            if len(valid_turns) < MIN_TURNS * 2:
                continue

            clean_examples.append({
                "conversations": [{"from": "system", "value": SYSTEM_PROMPT}] + valid_turns,
                "source": os.path.basename(path),
                "converted_at": datetime.now(timezone.utc).isoformat(),
            })

            # Extract DPO pairs from this example
            dpo_pairs.extend(extract_dpo_pairs(valid_turns))

    return clean_examples, dpo_pairs
|
||||
|
||||
|
||||
def scp_file(local_path: str, remote_subdir: str):
    """Legacy copy hook: output now lands directly on NFS, so no transfer
    happens here. `remote_subdir` is kept for call-site compatibility and
    is ignored.
    """
    print(" Written to NFS: {}".format(local_path))
|
||||
|
||||
|
||||
def main():
    """Convert any not-yet-processed raw exports and update state.

    Scans INPUT_DIR for *.jsonl files not yet recorded in the state file,
    converts them, writes timestamped clean/DPO output files, and
    persists the updated running totals.
    """
    os.makedirs(CLEAN_DIR, exist_ok=True)
    os.makedirs(DPO_DIR, exist_ok=True)

    state = load_state()
    input_files = sorted(glob.glob(os.path.join(INPUT_DIR, "*.jsonl")))
    new_files = [f for f in input_files if os.path.basename(f) not in state["converted_files"]]

    if not new_files:
        print("No new files to convert.")
        return

    all_clean = []
    all_dpo = []

    for f in new_files:
        print(f"Processing: {os.path.basename(f)}")
        clean, dpo = process_file(f)
        print(f" → {len(clean)} clean examples, {len(dpo)} DPO pairs")
        all_clean.extend(clean)
        all_dpo.extend(dpo)
        # Record the file even when it yields nothing, so it is never
        # reprocessed on later runs.
        state["converted_files"].append(os.path.basename(f))

    timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")

    if all_clean:
        clean_file = os.path.join(CLEAN_DIR, f"grace_clean_{timestamp}.jsonl")
        with open(clean_file, "w") as f:
            for ex in all_clean:
                f.write(json.dumps(ex) + "\n")
        print(f"\nWrote {len(all_clean)} clean examples → {clean_file}")
        scp_file(clean_file, "cleaned")
        state["total_clean"] = state.get("total_clean", 0) + len(all_clean)

    if all_dpo:
        dpo_file = os.path.join(DPO_DIR, f"grace_dpo_{timestamp}.jsonl")
        with open(dpo_file, "w") as f:
            for pair in all_dpo:
                f.write(json.dumps(pair) + "\n")
        print(f"Wrote {len(all_dpo)} DPO pairs → {dpo_file}")
        scp_file(dpo_file, "dpo")
        state["total_dpo"] = state.get("total_dpo", 0) + len(all_dpo)

    save_state(state)
    # .get here, not direct indexing: a state file written by an older
    # version may lack these keys, and the branches above only set them
    # when this run produced output — direct indexing could KeyError.
    print(f"\nTotals to date: {state.get('total_clean', 0)} clean, "
          f"{state.get('total_dpo', 0)} DPO pairs")
|
||||
|
||||
|
||||
# Script entry point: convert new raw exports when run directly.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user