#!/usr/bin/env python3
"""
Gitea → MongoDB infrastructure linker.

For each VM/LXC/service in homelab_infra MongoDB, finds matching Gitea issues
and recent commits, attaches them as gitea_links.

Matching strategy:
- Extracts keywords from component name (strips common words)
- Searches issue titles and commit messages for those keywords
- Requires strong keyword overlap to avoid false positives
- Repos searched: infra/grace, infra/homelab (primary infra repos only)
- Also checks Grace/homelab-ai-agent and projects/* for service-level matches

Runs nightly. Safe to re-run — always overwrites gitea_links field.

Usage:
    python3 ~/scripts/infra-gitea-link.py
    python3 ~/scripts/infra-gitea-link.py --component caddy
    python3 ~/scripts/infra-gitea-link.py --dry-run
"""
import argparse
import json
import os
import re
import urllib.parse
import urllib.request
from datetime import datetime, timezone

GITEA_BASE = "http://192.168.20.132:3000/api/v1"
# SECURITY NOTE(review): an API token was previously hardcoded here. It is now
# read from the GITEA_TOKEN environment variable; the literal fallback keeps
# existing cron invocations working. Rotate the token and drop the fallback.
GITEA_TOKEN = os.environ.get("GITEA_TOKEN", "dc5381bb236a820c2278da5199f379acda4bca93")
MONGO_HOST = "192.168.20.87"
MONGO_DB = "homelab_infra"

# Repos to search — ordered by relevance
INFRA_REPOS = ["infra/homelab", "infra/grace"]
SERVICE_REPOS = ["Grace/homelab-ai-agent", "projects/homelab-dashy"]
ALL_REPOS = INFRA_REPOS + SERVICE_REPOS

# Words too generic to use as match keywords
STOPWORDS = {
    "vm", "lxc", "server", "node", "local", "home", "main", "new", "old",
    "the", "and", "for", "with", "this", "that", "from", "into", "onto",
    "setup", "install", "config", "update", "add", "fix", "test", "run",
    "docker", "container", "service", "system", "data", "base", "storage",
    "port", "host", "ip", "api", "http", "https", "move", "migrate",
    "enable", "disable", "check", "get", "set", "use", "via",
}

# Component → repo affinity: which repos are most relevant per component type
REPO_AFFINITY = {
    "ai_inference": ["infra/grace", "Grace/homelab-ai-agent"],
    "ai_embeddings": ["infra/grace", "Grace/homelab-ai-agent"],
    "vector_db": ["infra/grace"],
    "reverse_proxy": ["infra/homelab"],
    "hypervisor": ["infra/homelab"],
    "nas": ["infra/homelab"],
    "firewall": ["infra/homelab"],
    "chat": ["infra/homelab"],
    "git": ["infra/homelab"],
    "monitoring": ["infra/homelab"],
    "database": ["infra/grace", "infra/homelab"],
    "proxmox_node": ["infra/homelab"],
    "qemu": ["infra/homelab"],
    "lxc": ["infra/homelab"],
}


# ── Gitea API helpers ─────────────────────────────────────────────────────────

def gitea_get(path, params=None):
    """GET a Gitea API path and return parsed JSON, or None on any failure.

    FIX: query parameters are now URL-encoded via urllib.parse.urlencode —
    the original hand-joined ``f"{k}={v}"`` pairs, which breaks on values
    containing spaces or reserved characters.
    """
    url = f"{GITEA_BASE}/{path}"
    if params:
        url += "?" + urllib.parse.urlencode(params)
    req = urllib.request.Request(url, headers={"Authorization": f"token {GITEA_TOKEN}"})
    try:
        with urllib.request.urlopen(req, timeout=10) as r:
            return json.loads(r.read())
    except Exception:
        # Best-effort nightly job: network/API errors degrade to "no data"
        # rather than crashing the whole run.
        return None


def get_all_issues(repo, state="open"):
    """Get all issues from a repo (paginated, 50 per page)."""
    issues = []
    page = 1
    while True:
        batch = gitea_get(
            f"repos/{repo}/issues",
            {"type": "issues", "state": state, "limit": 50, "page": page},
        )
        if not batch:
            break
        issues.extend(batch)
        if len(batch) < 50:
            # Short page ⇒ last page; avoids one extra empty request.
            break
        page += 1
    return issues


def get_recent_commits(repo, limit=20):
    """Get recent commits from default branch."""
    commits = gitea_get(f"repos/{repo}/commits", {"limit": limit, "page": 1})
    return commits or []


# ── Keyword extraction ────────────────────────────────────────────────────────

def extract_keywords(component: dict) -> set[str]:
    """
    Extract meaningful match keywords from a component document.
    Pulls from: name, type, notes, host fields.
    """
    raw_text = " ".join([
        component.get("name", ""),
        component.get("type", ""),
        component.get("notes", ""),
        component.get("host", ""),
    ]).lower()
    # Split on non-alphanumeric and filter stopwords + short (<4 char) tokens
    tokens = re.split(r'[^a-z0-9]+', raw_text)
    keywords = {t for t in tokens if len(t) >= 4 and t not in STOPWORDS}
    # Special: expand known aliases so e.g. "caddy" also matches "Caddyfile"
    aliases = {
        "proxmox": {"pve", "proxmox"},
        "caddy": {"caddy", "reverse", "proxy", "caddyfile"},
        "matrix": {"matrix", "synapse", "element"},
        "nextcloud": {"nextcloud", "drive"},
        "gitea": {"gitea", "git"},
        "truenas": {"truenas", "tank", "zfs", "pool"},
        "opnsense": {"opnsense", "firewall", "router", "unbound"},
        "grafana": {"grafana", "dashboard"},
        "qdrant": {"qdrant", "vector"},
        "ollama": {"ollama", "llama"},
        "qwen": {"qwen", "llm", "inference"},
        "mongodb": {"mongodb", "mongo"},
    }
    for key, alias_set in aliases.items():
        if key in keywords:
            keywords |= alias_set
    return keywords


def score_match(text: str, keywords: set[str]) -> int:
    """Score how many keywords appear in text (case-insensitive substring).
    Returns match count."""
    text_lower = text.lower()
    return sum(1 for kw in keywords if kw in text_lower)


def find_matches(component: dict, all_issues: dict, all_commits: dict) -> dict:
    """
    Find matching issues and commits for a component.

    all_issues / all_commits map repo name → pre-fetched list of Gitea API
    dicts (see main()). Returns {} when no keywords can be extracted,
    otherwise a gitea_links dict with top-3 issues (by score) and top-3
    commits (by date), plus the keywords used and a linked_at timestamp.
    """
    keywords = extract_keywords(component)
    if not keywords:
        return {}

    comp_type = component.get("type", "")
    comp_name = component.get("name", "")

    # Determine which repos to search based on type affinity
    preferred_repos = REPO_AFFINITY.get(comp_type, ALL_REPOS)

    matched_issues = []
    matched_commits = []

    # Minimum score threshold — require at least 2 keyword hits
    # UNLESS the component name itself is a strong unique identifier (>=6 chars).
    # FIX: compare the lowercased name against STOPWORDS (the set is all
    # lowercase); the original compared the raw name, so capitalized generic
    # names always passed the uniqueness check.
    name_is_unique = len(comp_name) >= 6 and comp_name.lower() not in STOPWORDS
    min_score = 1 if name_is_unique else 2

    for repo in preferred_repos:
        issues = all_issues.get(repo, [])
        commits = all_commits.get(repo, [])

        for issue in issues:
            title = issue.get("title", "")
            body = issue.get("body", "") or ""
            # Body hits count half a title hit; only the first 500 chars of
            # the body are scanned to keep long issues from dominating.
            score = score_match(title, keywords) + score_match(body[:500], keywords) * 0.5
            if score >= min_score:
                matched_issues.append({
                    "repo": repo,
                    "number": issue["number"],
                    "title": title,
                    "state": issue.get("state"),
                    "url": issue.get("html_url"),
                    "score": score,
                    "updated_at": issue.get("updated_at"),
                })

        for commit in commits:
            # Only the commit subject line (first line of message) is matched.
            msg = commit.get("commit", {}).get("message", "").split("\n")[0]
            if score_match(msg, keywords) >= min_score:
                matched_commits.append({
                    "repo": repo,
                    "sha": commit.get("sha", "")[:8],
                    "message": msg,
                    "url": commit.get("html_url"),
                    "date": commit.get("commit", {}).get("author", {}).get("date"),
                })

    # Sort issues by score desc, commits by date desc; take top 3 of each
    matched_issues = sorted(matched_issues, key=lambda x: -x["score"])[:3]
    matched_commits = sorted(matched_commits, key=lambda x: x.get("date", ""), reverse=True)[:3]

    # Strip internal score field before storing
    for i in matched_issues:
        i.pop("score", None)

    return {
        "issues": matched_issues,
        "commits": matched_commits,
        "keywords_used": list(keywords)[:10],
        "linked_at": datetime.now(timezone.utc).isoformat(),
    }


# ── MongoDB helpers ───────────────────────────────────────────────────────────

_mongo_client = None  # lazily-created, shared pymongo.MongoClient


def get_mongo():
    """Return the homelab_infra database handle (lazy singleton client).

    pymongo is imported here, not at module top, so the script can run
    keyword/matching logic without pymongo installed.
    """
    global _mongo_client
    if _mongo_client is None:
        import pymongo
        _mongo_client = pymongo.MongoClient(MONGO_HOST, 27017, serverSelectionTimeoutMS=5000)
    return _mongo_client[MONGO_DB]


def mongo_query_all(collection):
    """Return all docs in a collection (without _id); [] on any error."""
    try:
        return list(get_mongo()[collection].find({}, {"_id": 0}))
    except Exception as e:
        print(f" Mongo query error: {e}")
        return []


def mongo_update_gitea_links(collection, name_value, links):
    """Overwrite the gitea_links field on the doc matching name. True on success."""
    try:
        get_mongo()[collection].update_one({"name": name_value}, {"$set": {"gitea_links": links}})
        return True
    except Exception as e:
        print(f" Mongo update error: {e}")
        return False


# ── Main ──────────────────────────────────────────────────────────────────────

def main():
    parser = argparse.ArgumentParser(description="Link Gitea issues to homelab_infra MongoDB")
    parser.add_argument("--component", type=str, help="Only process this component name")
    parser.add_argument("--dry-run", action="store_true", help="Show matches without writing to DB")
    args = parser.parse_args()

    print("=== Gitea → MongoDB infrastructure linker ===")
    print(f"Started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

    # Pre-fetch all issues and commits from all repos (batch to minimize API calls)
    print("\nFetching Gitea data...")
    all_issues = {}
    all_commits = {}
    for repo in ALL_REPOS:
        open_issues = get_all_issues(repo, state="open")
        closed_issues = get_all_issues(repo, state="closed")
        all_issues[repo] = open_issues + closed_issues
        all_commits[repo] = get_recent_commits(repo, limit=30)
        print(f" {repo}: {len(all_issues[repo])} issues, {len(all_commits[repo])} commits")

    # Process each collection
    total_linked = 0
    collections = ["services", "vms", "nodes", "storage"]
    for collection in collections:
        docs = mongo_query_all(collection)
        if not docs:
            continue
        print(f"\n[{collection}] {len(docs)} components...")
        for doc in docs:
            name = doc.get("name", "")
            if not name:
                continue
            if args.component and args.component.lower() not in name.lower():
                continue

            links = find_matches(doc, all_issues, all_commits)
            issue_count = len(links.get("issues", []))
            commit_count = len(links.get("commits", []))

            if issue_count or commit_count:
                print(f" {name}: {issue_count} issues, {commit_count} commits")
                if args.dry_run:
                    for i in links.get("issues", []):
                        print(f" ISSUE #{i['number']}: {i['title']} [{i['state']}]")
                    for c in links.get("commits", []):
                        print(f" COMMIT {c['sha']}: {c['message'][:60]}")
                else:
                    mongo_update_gitea_links(collection, name, links)
                    total_linked += 1
            else:
                # Still write empty links to clear stale data from prior runs
                if not args.dry_run:
                    mongo_update_gitea_links(collection, name, {
                        "issues": [],
                        "commits": [],
                        "keywords_used": list(extract_keywords(doc))[:10],
                        "linked_at": datetime.now(timezone.utc).isoformat(),
                    })

    if not args.dry_run:
        print(f"\n{total_linked} components linked to Gitea data")
    print("Done.")


if __name__ == "__main__":
    main()