- Training data pipeline: convert, export, extract, load-to-db - Infra tooling: infra-audit, infra-gitea-link - RAG pipeline: rag-ingest, rag-query - Fine-tuning: finetune-lora, overnight-qwen3, install-unsloth - Transcripts: export-transcripts - Updated README with script index and token reduction strategy
241 lines
8.1 KiB
Python
Executable File
#!/usr/bin/env python3
"""
Convert raw Grace session exports to clean unsloth/axolotl-ready JSONL.

Input:   ~/training-data/jsonl/grace_training_*.jsonl   (ShareGPT format, raw)
Output:  ~/training-data/cleaned/grace_clean_YYYYMMDD.jsonl  (clean ShareGPT)
         ~/training-data/dpo/grace_dpo_YYYYMMDD.jsonl        (DPO pairs, if any)

Storage: output is written directly to the NFS mount (see scp_file: the
         former SCP copy to grace@192.168.20.87:~/training-data/cleaned/
         is no longer performed).

The script is incremental: already-converted input files are recorded in
STATE_FILE and skipped on subsequent runs.
"""

import json
import os
import re
import glob
import subprocess  # NOTE(review): unused since the SCP step was retired — confirm safe to drop
from datetime import datetime, timezone
|
|
|
|
# All pipeline I/O lives under this shared NFS mount.
NFS_BASE = "/mnt/ai-storage/grace/training-data"
INPUT_DIR = os.path.join(NFS_BASE, "jsonl")    # raw ShareGPT export files
CLEAN_DIR = os.path.join(NFS_BASE, "cleaned")  # cleaned SFT-ready JSONL
DPO_DIR = os.path.join(NFS_BASE, "dpo")        # extracted DPO preference pairs
# Records which input files were already converted plus running totals.
STATE_FILE = os.path.expanduser("~/self-improving/convert-state.json")
|
|
|
|
# Noise patterns to strip from user turns
NOISE_PATTERNS = [
    # Injected metadata headers; each runs until a closing brace alone on a line.
    re.compile(r'Conversation info \(untrusted metadata\):.*?^\s*\}\s*\n', re.DOTALL | re.MULTILINE),
    re.compile(r'Sender \(untrusted metadata\):.*?^\s*\}\s*\n', re.DOTALL | re.MULTILINE),
    # Memory-recall block injected by the agent runtime.
    re.compile(r'<relevant-memories>.*?</relevant-memories>', re.DOTALL),
    # Attachment placeholders, e.g. "[media attached: photo.jpg]".
    re.compile(r'\[media attached:.*?\]'),
    # Fenced JSON blobs that carry a "schema" key (tool/config payloads).
    re.compile(r'```json\s*\{.*?"schema".*?\}\s*```', re.DOTALL),
]
|
|
|
|
# Minimum quality thresholds
MIN_TURNS = 2               # minimum user/assistant exchanges per conversation
MIN_ASSISTANT_CHARS = 80    # skip very short assistant replies
MAX_ASSISTANT_CHARS = 8000  # skip extremely long tool-dump responses

# Strings that indicate a low-quality assistant turn
BAD_PATTERNS = [
    "HEARTBEAT_OK", "NO_REPLY",  # agent-loop control tokens, not real replies
    "<tool_call>",   # raw tool call leaked into response
    "Let me check",  # placeholder with no follow-through
]
|
|
|
|
# Injected as the system turn of every cleaned conversation; process_file
# strips whatever system turn the raw export carried and prepends this.
SYSTEM_PROMPT = (
    "You are Grace, a Culture Mind-class AI assistant and trusted companion for Maxwell. "
    "You are warm, direct, witty, and proactive. You support Maxwell's ADHD executive function, "
    "manage his homelab, help with job searching, and operate local AI infrastructure. "
    "You speak plainly — no corporate pleasantries, no hedging. "
    "You use exec and local tools proactively and return real results. Never fabricate output."
)
|
|
|
|
|
|
def load_state():
    """Load incremental-conversion state from STATE_FILE.

    Returns a dict with keys:
      converted_files: list of input basenames already processed
      total_clean / total_dpo: running totals across all runs

    Keys missing from an older state file are backfilled with defaults,
    because main() indexes state["converted_files"] and the totals directly.
    """
    defaults = {"converted_files": [], "total_clean": 0, "total_dpo": 0}
    try:
        # EAFP: avoids the exists()/open() race of the LBYL form.
        with open(STATE_FILE) as f:
            state = json.load(f)
    except FileNotFoundError:
        return defaults
    for key, value in defaults.items():
        state.setdefault(key, value)
    return state
|
|
|
|
|
|
def save_state(state):
    """Persist the conversion-state dict to STATE_FILE as indented JSON."""
    serialized = json.dumps(state, indent=2)
    with open(STATE_FILE, "w") as fh:
        fh.write(serialized)
|
|
|
|
|
|
def clean_text(text: str) -> str:
    """Strip metadata noise from a user turn and tidy the whitespace."""
    stripped = text
    for noise in NOISE_PATTERNS:
        stripped = noise.sub("", stripped)
    # Squeeze runs of 3+ newlines down to one blank line, trim the ends.
    return re.sub(r'\n{3,}', '\n\n', stripped).strip()
|
|
|
|
|
|
def is_good_assistant_turn(text: str) -> bool:
    """Return True only for assistant turns within the configured length
    bounds and free of every known low-quality marker."""
    if not MIN_ASSISTANT_CHARS <= len(text) <= MAX_ASSISTANT_CHARS:
        return False
    # A single bad marker disqualifies the whole turn.
    return not any(marker in text for marker in BAD_PATTERNS)
|
|
|
|
|
|
def extract_dpo_pairs(conversations: list) -> list:
    """Mine DPO (chosen/rejected) pairs from correction exchanges.

    A correction pair is: an assistant turn, followed by a user message
    that signals disagreement, followed by a better assistant turn. The
    earlier assistant turn becomes "rejected", the later one "chosen",
    and the user turn before the rejected answer (if any) is the prompt.

    Signals (substring match, case-insensitive): "no,", "actually",
    "that's wrong", "you should have", "remember that", "i told you",
    "stop doing", "wrong".

    Args:
        conversations: ShareGPT-style turn dicts with "from"/"value" keys.
    Returns:
        List of dicts with keys: prompt, chosen, rejected, context.
    """
    correction_signals = ["no,", "actually", "that's wrong", "you should have",
                          "remember that", "i told you", "stop doing", "wrong"]
    pairs = []
    convs = [c for c in conversations if c["from"] in ("human", "gpt")]

    # Range keeps i-1 and i+1 in bounds, so no extra boundary checks needed.
    for i in range(1, len(convs) - 1):
        if convs[i]["from"] != "human":
            continue
        user_text = convs[i]["value"].lower()
        if not any(sig in user_text for sig in correction_signals):
            continue
        # Need assistant turns on both sides of the correction message.
        if convs[i - 1]["from"] != "gpt":
            continue
        if convs[i + 1]["from"] != "gpt":
            continue

        rejected = convs[i - 1]["value"]
        chosen = convs[i + 1]["value"]

        # Only worth keeping if the retry is substantial and actually differs.
        if len(chosen) > 50 and chosen != rejected:
            pairs.append({
                "prompt": convs[i - 2]["value"] if i >= 2 else "",
                "chosen": chosen,
                "rejected": rejected,
                "context": convs[i]["value"],  # the correction itself
            })
    return pairs
|
|
|
|
|
|
def process_file(path: str):
    """
    Process one raw export JSONL file (one ShareGPT JSON object per line).

    Per example:
      1. Drop the original system turn (a fresh SYSTEM_PROMPT is injected).
      2. Clean user turns (clean_text) and filter assistant turns
         (is_good_assistant_turn).
      3. Keep only strictly alternating human->gpt pairs, in order.
      4. Discard conversations with fewer than MIN_TURNS exchanges.
      5. Mine DPO pairs from the surviving turns.

    Returns (clean_examples, dpo_pairs).
    """
    clean_examples = []
    dpo_pairs = []

    with open(path) as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                example = json.loads(line)
            except json.JSONDecodeError:
                # Skip malformed lines rather than failing the whole file.
                continue

            raw_convs = example.get("conversations", [])

            # Separate system prompt from turns
            turns = [c for c in raw_convs if c.get("from") != "system"]

            # Clean user turns
            cleaned = []
            for turn in turns:
                if turn["from"] == "human":
                    text = clean_text(turn["value"])
                    # Noise stripping can leave an essentially empty message.
                    if len(text) < 5:
                        continue
                    cleaned.append({"from": "human", "value": text})
                elif turn["from"] == "gpt":
                    text = turn["value"].strip()
                    if not is_good_assistant_turn(text):
                        continue
                    cleaned.append({"from": "gpt", "value": text})
                # NOTE(review): any other role (e.g. tool turns) is dropped silently.

            # Build valid human/gpt pairs only
            valid_turns = []
            i = 0
            while i < len(cleaned) - 1:
                if cleaned[i]["from"] == "human" and cleaned[i+1]["from"] == "gpt":
                    valid_turns.append(cleaned[i])
                    valid_turns.append(cleaned[i+1])
                    i += 2
                else:
                    # Misaligned turn (e.g. a filtered-out reply): skip one, resync.
                    i += 1

            # MIN_TURNS counts exchanges; valid_turns has 2 entries per exchange.
            if len(valid_turns) < MIN_TURNS * 2:
                continue

            clean_examples.append({
                "conversations": [{"from": "system", "value": SYSTEM_PROMPT}] + valid_turns,
                "source": os.path.basename(path),
                "converted_at": datetime.now(timezone.utc).isoformat(),
            })

            # Extract DPO pairs from this example
            dpo_pairs.extend(extract_dpo_pairs(valid_turns))

    return clean_examples, dpo_pairs
|
|
|
|
|
|
def scp_file(local_path: str, remote_subdir: str):
    """Legacy hook from the SCP era; output already lands on NFS directly.

    Kept so callers need not change; it only reports the final location.
    """
    message = f" Written to NFS: {local_path}"
    print(message)
|
|
|
|
|
|
def main():
    """Convert all new raw export files, write clean/DPO JSONL, update state.

    Incremental: input files whose basename is already recorded in the state
    are skipped; totals accumulate across runs.
    """
    os.makedirs(CLEAN_DIR, exist_ok=True)
    os.makedirs(DPO_DIR, exist_ok=True)

    state = load_state()
    input_files = sorted(glob.glob(os.path.join(INPUT_DIR, "*.jsonl")))
    new_files = [f for f in input_files if os.path.basename(f) not in state["converted_files"]]

    if not new_files:
        print("No new files to convert.")
        return

    all_clean = []
    all_dpo = []

    for f in new_files:
        print(f"Processing: {os.path.basename(f)}")
        clean, dpo = process_file(f)
        print(f" → {len(clean)} clean examples, {len(dpo)} DPO pairs")
        all_clean.extend(clean)
        all_dpo.extend(dpo)
        # NOTE(review): file is marked converted before output is written; a
        # crash below would skip these files next run — confirm acceptable.
        state["converted_files"].append(os.path.basename(f))

    # One timestamped output file per run, shared by clean and DPO outputs.
    timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")

    if all_clean:
        clean_file = os.path.join(CLEAN_DIR, f"grace_clean_{timestamp}.jsonl")
        with open(clean_file, "w") as f:
            for ex in all_clean:
                f.write(json.dumps(ex) + "\n")
        print(f"\nWrote {len(all_clean)} clean examples → {clean_file}")
        scp_file(clean_file, "cleaned")
        state["total_clean"] = state.get("total_clean", 0) + len(all_clean)

    if all_dpo:
        dpo_file = os.path.join(DPO_DIR, f"grace_dpo_{timestamp}.jsonl")
        with open(dpo_file, "w") as f:
            for pair in all_dpo:
                f.write(json.dumps(pair) + "\n")
        print(f"Wrote {len(all_dpo)} DPO pairs → {dpo_file}")
        scp_file(dpo_file, "dpo")
        state["total_dpo"] = state.get("total_dpo", 0) + len(all_dpo)

    # Persist progress only after all outputs were written successfully.
    save_state(state)
    print(f"\nTotals to date: {state['total_clean']} clean, {state['total_dpo']} DPO pairs")
|
|
# Script entry point: convert any new raw exports, then persist state.
if __name__ == "__main__":
    main()
|