feat: initial import of all helper scripts from ~/scripts/

- Training data pipeline: convert, export, extract, load-to-db
- Infra tooling: infra-audit, infra-gitea-link
- RAG pipeline: rag-ingest, rag-query
- Fine-tuning: finetune-lora, overnight-qwen3, install-unsloth
- Transcripts: export-transcripts
- Updated README with script index and token reduction strategy
This commit is contained in:
Grace
2026-03-16 22:32:48 -07:00
parent 462f5298e6
commit 014ec8bd5c
15 changed files with 2979 additions and 1 deletions

139
load-transcripts-to-db.py Normal file
View File

@@ -0,0 +1,139 @@
#!/usr/bin/env python3
"""
Load Grace transcript JSONL files into PostgreSQL on DB VM (192.168.20.87).
Tables:
sessions — one row per conversation session
turns — one row per turn (system/human/gpt)
session_tags — optional labels
Run nightly after export-transcripts.py.
"""
import json
import os
import glob
import psycopg2
from datetime import datetime, timezone
# Directory where export-transcripts.py drops one JSONL file per session.
TRANSCRIPTS_DIR = "/mnt/ai-storage/grace/training-data/transcripts"
# Local state file recording which transcript files were already loaded,
# plus running totals — makes the nightly run idempotent.
STATE_FILE = os.path.expanduser("~/self-improving/db-load-state.json")
# Connection parameters for the PostgreSQL instance on the DB VM.
# SECURITY NOTE(review): password is hardcoded in source — consider moving it
# to an environment variable or ~/.pgpass so it stays out of version control.
DB_CONFIG = {
    "host": "192.168.20.87",
    "port": 5432,
    "dbname": "grace_training",
    "user": "grace_ai",
    "password": "grace_training_2026",
}
def load_state():
    """Return the persisted loader state, or a fresh default if none exists.

    The state dict tracks which transcript files were already loaded and the
    running session/turn totals across all runs.
    """
    try:
        with open(STATE_FILE) as fh:
            return json.load(fh)
    except FileNotFoundError:
        return {"loaded_files": [], "total_sessions": 0, "total_turns": 0}
def save_state(state):
    """Persist *state* to STATE_FILE as pretty-printed JSON."""
    serialized = json.dumps(state, indent=2)
    with open(STATE_FILE, "w") as fh:
        fh.write(serialized)
def load_file(conn, path: str) -> tuple[int, int]:
    """Load one transcript JSONL file. Returns (sessions_added, turns_added).

    Each JSONL line is one exported session: a dict with "source",
    "turn_count", "exported_at" and a "conversations" list of
    {"from": role, "value": text} turns. Sessions already present in the
    database (same session_id) are skipped entirely. The whole file is
    committed as a single transaction at the end, so a mid-file failure
    leaves the DB unchanged after the caller rolls back.

    Args:
        conn: open psycopg2 connection to the grace_training database.
        path: absolute path to the transcript JSONL file.

    Raises:
        psycopg2 errors propagate to the caller, which rolls back.
    """
    sessions_added = 0
    turns_added = 0
    # Explicit UTF-8: transcript exports are UTF-8 and the platform default
    # encoding must not be trusted. One cursor serves the whole file instead
    # of opening a new one per line.
    with open(path, encoding="utf-8") as f, conn.cursor() as cur:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                example = json.loads(line)
            except json.JSONDecodeError:
                # Malformed lines are skipped; the rest of the file still loads.
                continue
            source_file = example.get("source", os.path.basename(path))
            session_id = source_file.replace(".jsonl", "")
            turn_count = example.get("turn_count", 0)
            exported_at = example.get("exported_at", datetime.now(timezone.utc).isoformat())
            conversations = example.get("conversations", [])
            # Upsert session (skip if already loaded): RETURNING yields a row
            # only when the INSERT actually happened.
            cur.execute("""
                INSERT INTO sessions (session_id, source_file, turn_count, exported_at)
                VALUES (%s, %s, %s, %s)
                ON CONFLICT (session_id) DO NOTHING
                RETURNING id
            """, (session_id, source_file, turn_count, exported_at))
            row = cur.fetchone()
            if not row:
                # Already exists, skip
                continue
            sessions_added += 1
            # Insert all turns. turn_index keeps the original position even
            # when empty-content turns are skipped, preserving ordering.
            for i, turn in enumerate(conversations):
                role = turn.get("from", "")
                content = turn.get("value", "")
                if not content:
                    continue
                cur.execute("""
                    INSERT INTO turns (session_id, turn_index, role, content)
                    VALUES (%s, %s, %s, %s)
                """, (session_id, i, role, content))
                turns_added += 1
    conn.commit()
    return sessions_added, turns_added
def main():
    """Load every not-yet-loaded transcript JSONL file into PostgreSQL.

    Reads the local state file to find new files, loads each in its own
    transaction (per-file errors are printed and rolled back without
    stopping the run), then persists updated totals. The connection is
    closed and state is saved even if the loop aborts unexpectedly.
    """
    state = load_state()
    transcript_files = sorted(glob.glob(os.path.join(TRANSCRIPTS_DIR, "*.jsonl")))
    new_files = [f for f in transcript_files if os.path.basename(f) not in state["loaded_files"]]
    if not new_files:
        print("No new transcript files to load.")
        return
    try:
        conn = psycopg2.connect(**DB_CONFIG)
    except Exception as e:
        # Best-effort nightly job: report and exit rather than crash.
        print(f"DB connection failed: {e}")
        return
    total_sessions = 0
    total_turns = 0
    try:
        for f in new_files:
            print(f"Loading: {os.path.basename(f)}")
            try:
                s, t = load_file(conn, f)
                print(f"{s} sessions, {t} turns")
                total_sessions += s
                total_turns += t
                # Mark loaded only after a successful commit inside load_file.
                state["loaded_files"].append(os.path.basename(f))
            except Exception as e:
                print(f" Error: {e}")
                conn.rollback()
    finally:
        # Always release the connection, even if rollback itself raised.
        conn.close()
    state["total_sessions"] = state.get("total_sessions", 0) + total_sessions
    state["total_turns"] = state.get("total_turns", 0) + total_turns
    save_state(state)
    print(f"\nLoaded {total_sessions} sessions, {total_turns} turns")
    print(f"DB totals: {state['total_sessions']} sessions, {state['total_turns']} turns")

if __name__ == "__main__":
    main()