feat: initial import of all helper scripts from ~/scripts/

- Training data pipeline: convert, export, extract, load-to-db
- Infra tooling: infra-audit, infra-gitea-link
- RAG pipeline: rag-ingest, rag-query
- Fine-tuning: finetune-lora, overnight-qwen3, install-unsloth
- Transcripts: export-transcripts
- Updated README with script index and token reduction strategy
This commit is contained in:
319
infra-gitea-link.py
Executable file
319
infra-gitea-link.py
Executable file
@@ -0,0 +1,319 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Gitea → MongoDB infrastructure linker.
|
||||
|
||||
For each VM/LXC/service in homelab_infra MongoDB, finds matching Gitea issues
|
||||
and recent commits, attaches them as gitea_links.
|
||||
|
||||
Matching strategy:
|
||||
- Extracts keywords from component name (strips common words)
|
||||
- Searches issue titles and commit messages for those keywords
|
||||
- Requires strong keyword overlap to avoid false positives
|
||||
- Repos searched: infra/grace, infra/homelab (primary infra repos only)
|
||||
- Also checks Grace/homelab-ai-agent and projects/* for service-level matches
|
||||
|
||||
Runs nightly. Safe to re-run — always overwrites gitea_links field.
|
||||
|
||||
Usage:
|
||||
python3 ~/scripts/infra-gitea-link.py
|
||||
python3 ~/scripts/infra-gitea-link.py --component caddy
|
||||
python3 ~/scripts/infra-gitea-link.py --dry-run
|
||||
"""
|
||||
|
||||
import argparse
import json
import os
import re
import urllib.parse
import urllib.request
from datetime import datetime, timezone
|
||||
|
||||
# ── Configuration ─────────────────────────────────────────────────────────────
# Endpoints and credentials. Each can be overridden via an environment
# variable; the hard-coded fallbacks preserve the original zero-config
# behavior for the nightly cron job.
# SECURITY: the fallback token below is committed to source control — it
# should be rotated and supplied exclusively via GITEA_TOKEN.
GITEA_BASE = os.environ.get("GITEA_BASE", "http://192.168.20.132:3000/api/v1")
GITEA_TOKEN = os.environ.get("GITEA_TOKEN", "dc5381bb236a820c2278da5199f379acda4bca93")
MONGO_HOST = os.environ.get("MONGO_HOST", "192.168.20.87")
MONGO_DB = "homelab_infra"

# Repos to search — ordered by relevance
INFRA_REPOS = ["infra/homelab", "infra/grace"]
SERVICE_REPOS = ["Grace/homelab-ai-agent", "projects/homelab-dashy"]
ALL_REPOS = INFRA_REPOS + SERVICE_REPOS

# Words too generic to use as match keywords (all lowercase — compare
# against lowercased tokens only)
STOPWORDS = {
    "vm", "lxc", "server", "node", "local", "home", "main", "new", "old",
    "the", "and", "for", "with", "this", "that", "from", "into", "onto",
    "setup", "install", "config", "update", "add", "fix", "test", "run",
    "docker", "container", "service", "system", "data", "base", "storage",
    "port", "host", "ip", "api", "http", "https", "move", "migrate",
    "enable", "disable", "check", "get", "set", "use", "via",
}

# Component → repo affinity: which repos are most relevant per component type.
# Components whose type is not listed here fall back to searching ALL_REPOS.
REPO_AFFINITY = {
    "ai_inference": ["infra/grace", "Grace/homelab-ai-agent"],
    "ai_embeddings": ["infra/grace", "Grace/homelab-ai-agent"],
    "vector_db": ["infra/grace"],
    "reverse_proxy": ["infra/homelab"],
    "hypervisor": ["infra/homelab"],
    "nas": ["infra/homelab"],
    "firewall": ["infra/homelab"],
    "chat": ["infra/homelab"],
    "git": ["infra/homelab"],
    "monitoring": ["infra/homelab"],
    "database": ["infra/grace", "infra/homelab"],
    "proxmox_node": ["infra/homelab"],
    "qemu": ["infra/homelab"],
    "lxc": ["infra/homelab"],
}
|
||||
|
||||
|
||||
# ── Gitea API helpers ─────────────────────────────────────────────────────────
|
||||
|
||||
def gitea_get(path, params=None):
    """GET a Gitea API endpoint and return the decoded JSON, or None on failure.

    Args:
        path: API path relative to GITEA_BASE, without a leading slash
            (e.g. "repos/infra/homelab/issues").
        params: optional dict of query parameters.

    Returns:
        Parsed JSON (list or dict) on success; None on any network, HTTP,
        or decode error. Callers treat None as "no data" — this script is
        a best-effort nightly job and must not abort on a single bad call.
    """
    url = f"{GITEA_BASE}/{path}"
    if params:
        # urlencode percent-escapes reserved characters; the previous manual
        # "&".join(f"{k}={v}") broke on values containing '&', '=' or spaces.
        url += "?" + urllib.parse.urlencode(params)
    req = urllib.request.Request(url, headers={"Authorization": f"token {GITEA_TOKEN}"})
    try:
        with urllib.request.urlopen(req, timeout=10) as r:
            return json.loads(r.read())
    except Exception:
        # Deliberate best-effort swallow: one failed call degrades to
        # "no matches" for the affected repo instead of crashing the run.
        return None
|
||||
|
||||
|
||||
def get_all_issues(repo, state="open"):
    """Fetch every issue in *repo* with the given state, following pagination."""
    page_size = 50
    collected = []
    page = 1
    while True:
        batch = gitea_get(
            f"repos/{repo}/issues",
            {"type": "issues", "state": state, "limit": page_size, "page": page},
        )
        # A None (API error) or empty page both mean we're done.
        if not batch:
            return collected
        collected.extend(batch)
        # A short page is the last page — no need for one more round-trip.
        if len(batch) < page_size:
            return collected
        page += 1
|
||||
|
||||
|
||||
def get_recent_commits(repo, limit=20):
    """Return up to *limit* recent commits from the repo's default branch."""
    result = gitea_get(f"repos/{repo}/commits", {"limit": limit, "page": 1})
    if result:
        return result
    # API error or empty repo — normalize to an empty list for callers.
    return []
|
||||
|
||||
|
||||
# ── Keyword extraction ────────────────────────────────────────────────────────
|
||||
|
||||
def extract_keywords(component: dict) -> set[str]:
    """
    Derive match keywords for a component document.

    Concatenates the name, type, notes and host fields, lowercases the
    result, tokenizes on non-alphanumerics, then keeps tokens of length
    >= 4 that are not stopwords. Finally expands known service aliases
    so synonyms in issue/commit text still match.
    """
    source_fields = ("name", "type", "notes", "host")
    blob = " ".join(component.get(field, "") for field in source_fields).lower()

    keywords = set()
    for token in re.split(r'[^a-z0-9]+', blob):
        if len(token) >= 4 and token not in STOPWORDS:
            keywords.add(token)

    # Alias groups: when the trigger word survives filtering, pull in its
    # whole synonym group (e.g. "truenas" also matches "zfs"/"pool" text).
    alias_groups = {
        "proxmox": {"pve", "proxmox"},
        "caddy": {"caddy", "reverse", "proxy", "caddyfile"},
        "matrix": {"matrix", "synapse", "element"},
        "nextcloud": {"nextcloud", "drive"},
        "gitea": {"gitea", "git"},
        "truenas": {"truenas", "tank", "zfs", "pool"},
        "opnsense": {"opnsense", "firewall", "router", "unbound"},
        "grafana": {"grafana", "dashboard"},
        "qdrant": {"qdrant", "vector"},
        "ollama": {"ollama", "llama"},
        "qwen": {"qwen", "llm", "inference"},
        "mongodb": {"mongodb", "mongo"},
    }
    for trigger, group in alias_groups.items():
        if trigger in keywords:
            keywords.update(group)

    return keywords
|
||||
|
||||
|
||||
def score_match(text: str, keywords: set[str]) -> int:
    """Count how many keywords occur as substrings of *text* (case-insensitive)."""
    haystack = text.lower()
    hits = 0
    for keyword in keywords:
        if keyword in haystack:
            hits += 1
    return hits
|
||||
|
||||
|
||||
def find_matches(component: dict, all_issues: dict, all_commits: dict) -> dict:
    """
    Find matching Gitea issues and commits for a component.

    Args:
        component: MongoDB component document (name/type/notes/host fields).
        all_issues: mapping of repo full-name -> pre-fetched issue dicts.
        all_commits: mapping of repo full-name -> pre-fetched commit dicts.

    Returns:
        A gitea_links payload: {"issues": [...], "commits": [...],
        "keywords_used": [...], "linked_at": <iso timestamp>}; or {}
        when no usable keywords could be extracted from the component.
    """
    keywords = extract_keywords(component)
    if not keywords:
        return {}

    comp_type = component.get("type", "")
    comp_name = component.get("name", "")

    # Determine which repos to search based on type affinity
    preferred_repos = REPO_AFFINITY.get(comp_type, ALL_REPOS)

    matched_issues = []
    matched_commits = []

    # Minimum score threshold — require at least 2 keyword hits
    # UNLESS the component name itself is a strong unique identifier (>=6 chars).
    # BUGFIX: STOPWORDS is all-lowercase, so compare the lowercased name;
    # the original case-sensitive check let capitalized generic names through.
    name_is_unique = len(comp_name) >= 6 and comp_name.lower() not in STOPWORDS
    min_score = 1 if name_is_unique else 2

    for repo in preferred_repos:
        issues = all_issues.get(repo, [])
        commits = all_commits.get(repo, [])

        for issue in issues:
            title = issue.get("title", "")
            body = issue.get("body", "") or ""
            # Title hits count full weight; body hits (first 500 chars) count half.
            score = score_match(title, keywords) + score_match(body[:500], keywords) * 0.5
            if score >= min_score:
                matched_issues.append({
                    "repo": repo,
                    "number": issue["number"],
                    "title": title,
                    "state": issue.get("state"),
                    "url": issue.get("html_url"),
                    "score": score,
                    "updated_at": issue.get("updated_at"),
                })

        for commit in commits:
            # Match against the first line of the commit message only.
            msg = commit.get("commit", {}).get("message", "").split("\n")[0]
            if score_match(msg, keywords) >= min_score:
                matched_commits.append({
                    "repo": repo,
                    "sha": commit.get("sha", "")[:8],
                    "message": msg,
                    "url": commit.get("html_url"),
                    "date": commit.get("commit", {}).get("author", {}).get("date"),
                })

    # Top 3 issues by score desc; top 3 commits newest-first.
    matched_issues = sorted(matched_issues, key=lambda x: -x["score"])[:3]
    # BUGFIX: "date" is always present but may be None; coalesce to "" so
    # sorted() doesn't raise TypeError comparing None with str. The old
    # x.get("date", "") default never applied because the key exists.
    matched_commits = sorted(matched_commits, key=lambda x: x.get("date") or "", reverse=True)[:3]

    # Strip internal score field before storing
    for i in matched_issues:
        i.pop("score", None)

    return {
        "issues": matched_issues,
        "commits": matched_commits,
        "keywords_used": list(keywords)[:10],
        "linked_at": datetime.now(timezone.utc).isoformat(),
    }
|
||||
|
||||
|
||||
# ── MongoDB helpers ───────────────────────────────────────────────────────────

# Lazily-created pymongo client, shared across all calls in one run.
_mongo_client = None


def get_mongo():
    """Return the homelab_infra database handle, creating the client on first use."""
    global _mongo_client
    if _mongo_client is not None:
        return _mongo_client[MONGO_DB]
    # Deferred import: keeps the script importable even where pymongo
    # isn't installed, until a DB operation is actually needed.
    import pymongo
    _mongo_client = pymongo.MongoClient(MONGO_HOST, 27017, serverSelectionTimeoutMS=5000)
    return _mongo_client[MONGO_DB]
|
||||
|
||||
|
||||
def mongo_query_all(collection):
    """Return every document in *collection* (without _id), or [] on any error."""
    try:
        cursor = get_mongo()[collection].find({}, {"_id": 0})
        return list(cursor)
    except Exception as e:
        # Best-effort: report and continue with the remaining collections.
        print(f" Mongo query error: {e}")
        return []
|
||||
|
||||
|
||||
def mongo_update_gitea_links(collection, name_value, links):
    """Overwrite the gitea_links field on the document named *name_value*.

    Returns True on success, False on any Mongo error (logged, not raised).
    """
    update_doc = {"$set": {"gitea_links": links}}
    try:
        get_mongo()[collection].update_one({"name": name_value}, update_doc)
    except Exception as e:
        print(f" Mongo update error: {e}")
        return False
    return True
|
||||
|
||||
|
||||
# ── Main ──────────────────────────────────────────────────────────────────────
|
||||
|
||||
def main():
    """Entry point: pre-fetch Gitea data, then link every MongoDB component.

    CLI flags:
        --component NAME  only process components whose name contains NAME
        --dry-run         print matches instead of writing to MongoDB
    """
    parser = argparse.ArgumentParser(description="Link Gitea issues to homelab_infra MongoDB")
    parser.add_argument("--component", type=str, help="Only process this component name")
    parser.add_argument("--dry-run", action="store_true", help="Show matches without writing to DB")
    args = parser.parse_args()

    print("=== Gitea → MongoDB infrastructure linker ===")
    print(f"Started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

    # Pre-fetch all issues and commits from all repos (batch to minimize API calls)
    print("\nFetching Gitea data...")
    all_issues = {}
    all_commits = {}

    for repo in ALL_REPOS:
        # Both open and closed issues are searched — closed issues still
        # document which repo a component belongs to.
        open_issues = get_all_issues(repo, state="open")
        closed_issues = get_all_issues(repo, state="closed")
        all_issues[repo] = open_issues + closed_issues
        all_commits[repo] = get_recent_commits(repo, limit=30)
        print(f" {repo}: {len(all_issues[repo])} issues, {len(all_commits[repo])} commits")

    # Process each collection
    total_linked = 0
    collections = ["services", "vms", "nodes", "storage"]

    for collection in collections:
        docs = mongo_query_all(collection)
        if not docs:
            continue

        print(f"\n[{collection}] {len(docs)} components...")

        for doc in docs:
            name = doc.get("name", "")
            if not name:
                continue
            # --component is a case-insensitive substring filter.
            if args.component and args.component.lower() not in name.lower():
                continue

            links = find_matches(doc, all_issues, all_commits)

            issue_count = len(links.get("issues", []))
            commit_count = len(links.get("commits", []))

            if issue_count or commit_count:
                print(f" {name}: {issue_count} issues, {commit_count} commits")
                if args.dry_run:
                    # Dry run: show what would be stored, write nothing.
                    for i in links.get("issues", []):
                        print(f" ISSUE #{i['number']}: {i['title']} [{i['state']}]")
                    for c in links.get("commits", []):
                        print(f" COMMIT {c['sha']}: {c['message'][:60]}")
                else:
                    mongo_update_gitea_links(collection, name, links)
                    total_linked += 1
            else:
                # Still write empty links to clear stale data
                if not args.dry_run:
                    mongo_update_gitea_links(collection, name, {
                        "issues": [], "commits": [],
                        "keywords_used": list(extract_keywords(doc))[:10],
                        "linked_at": datetime.now(timezone.utc).isoformat(),
                    })

    if not args.dry_run:
        print(f"\n{total_linked} components linked to Gitea data")
    print("Done.")


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user