feat: initial import of all helper scripts from ~/scripts/

- Training data pipeline: convert, export, extract, load-to-db
- Infra tooling: infra-audit, infra-gitea-link
- RAG pipeline: rag-ingest, rag-query
- Fine-tuning: finetune-lora, overnight-qwen3, install-unsloth
- Transcripts: export-transcripts
- Updated README with script index and token reduction strategy
This commit is contained in:
319
infra-gitea-link.py
Executable file
319
infra-gitea-link.py
Executable file
@@ -0,0 +1,319 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Gitea → MongoDB infrastructure linker.
|
||||
|
||||
For each VM/LXC/service in homelab_infra MongoDB, finds matching Gitea issues
|
||||
and recent commits, attaches them as gitea_links.
|
||||
|
||||
Matching strategy:
|
||||
- Extracts keywords from component name (strips common words)
|
||||
- Searches issue titles and commit messages for those keywords
|
||||
- Requires strong keyword overlap to avoid false positives
|
||||
- Repos searched: infra/grace, infra/homelab (primary infra repos only)
|
||||
- Also checks Grace/homelab-ai-agent and projects/* for service-level matches
|
||||
|
||||
Runs nightly. Safe to re-run — always overwrites gitea_links field.
|
||||
|
||||
Usage:
|
||||
python3 ~/scripts/infra-gitea-link.py
|
||||
python3 ~/scripts/infra-gitea-link.py --component caddy
|
||||
python3 ~/scripts/infra-gitea-link.py --dry-run
|
||||
"""
|
||||
|
||||
import argparse
import json
import os
import re
import urllib.parse
import urllib.request
from datetime import datetime, timezone
|
||||
|
||||
# ── Configuration ─────────────────────────────────────────────────────────────
# Endpoints and credentials. Each can be overridden via an environment
# variable; the hard-coded fallbacks preserve the original zero-config
# behavior for the nightly cron job.
# SECURITY: the fallback token below is committed to source control — it
# should be rotated and supplied exclusively via GITEA_TOKEN.
GITEA_BASE = os.environ.get("GITEA_BASE", "http://192.168.20.132:3000/api/v1")
GITEA_TOKEN = os.environ.get("GITEA_TOKEN", "dc5381bb236a820c2278da5199f379acda4bca93")
MONGO_HOST = os.environ.get("MONGO_HOST", "192.168.20.87")
MONGO_DB = "homelab_infra"

# Repos to search — ordered by relevance
INFRA_REPOS = ["infra/homelab", "infra/grace"]
SERVICE_REPOS = ["Grace/homelab-ai-agent", "projects/homelab-dashy"]
ALL_REPOS = INFRA_REPOS + SERVICE_REPOS

# Words too generic to use as match keywords (all lowercase — compare
# against lowercased tokens only)
STOPWORDS = {
    "vm", "lxc", "server", "node", "local", "home", "main", "new", "old",
    "the", "and", "for", "with", "this", "that", "from", "into", "onto",
    "setup", "install", "config", "update", "add", "fix", "test", "run",
    "docker", "container", "service", "system", "data", "base", "storage",
    "port", "host", "ip", "api", "http", "https", "move", "migrate",
    "enable", "disable", "check", "get", "set", "use", "via",
}

# Component → repo affinity: which repos are most relevant per component type.
# Components whose type is not listed here fall back to searching ALL_REPOS.
REPO_AFFINITY = {
    "ai_inference": ["infra/grace", "Grace/homelab-ai-agent"],
    "ai_embeddings": ["infra/grace", "Grace/homelab-ai-agent"],
    "vector_db": ["infra/grace"],
    "reverse_proxy": ["infra/homelab"],
    "hypervisor": ["infra/homelab"],
    "nas": ["infra/homelab"],
    "firewall": ["infra/homelab"],
    "chat": ["infra/homelab"],
    "git": ["infra/homelab"],
    "monitoring": ["infra/homelab"],
    "database": ["infra/grace", "infra/homelab"],
    "proxmox_node": ["infra/homelab"],
    "qemu": ["infra/homelab"],
    "lxc": ["infra/homelab"],
}
|
||||
|
||||
|
||||
# ── Gitea API helpers ─────────────────────────────────────────────────────────
|
||||
|
||||
def gitea_get(path, params=None):
    """GET a Gitea API endpoint and return the decoded JSON, or None on failure.

    Args:
        path: API path relative to GITEA_BASE, without a leading slash
            (e.g. "repos/infra/homelab/issues").
        params: optional dict of query parameters.

    Returns:
        Parsed JSON (list or dict) on success; None on any network, HTTP,
        or decode error. Callers treat None as "no data" — this script is
        a best-effort nightly job and must not abort on a single bad call.
    """
    url = f"{GITEA_BASE}/{path}"
    if params:
        # urlencode percent-escapes reserved characters; the previous manual
        # "&".join(f"{k}={v}") broke on values containing '&', '=' or spaces.
        url += "?" + urllib.parse.urlencode(params)
    req = urllib.request.Request(url, headers={"Authorization": f"token {GITEA_TOKEN}"})
    try:
        with urllib.request.urlopen(req, timeout=10) as r:
            return json.loads(r.read())
    except Exception:
        # Deliberate best-effort swallow: one failed call degrades to
        # "no matches" for the affected repo instead of crashing the run.
        return None
|
||||
|
||||
|
||||
def get_all_issues(repo, state="open"):
    """Fetch every issue in *repo* with the given state, following pagination."""
    page_size = 50
    collected = []
    page = 1
    while True:
        batch = gitea_get(
            f"repos/{repo}/issues",
            {"type": "issues", "state": state, "limit": page_size, "page": page},
        )
        # A None (API error) or empty page both mean we're done.
        if not batch:
            return collected
        collected.extend(batch)
        # A short page is the last page — no need for one more round-trip.
        if len(batch) < page_size:
            return collected
        page += 1
|
||||
|
||||
|
||||
def get_recent_commits(repo, limit=20):
    """Return up to *limit* recent commits from the repo's default branch."""
    result = gitea_get(f"repos/{repo}/commits", {"limit": limit, "page": 1})
    if result:
        return result
    # API error or empty repo — normalize to an empty list for callers.
    return []
|
||||
|
||||
|
||||
# ── Keyword extraction ────────────────────────────────────────────────────────
|
||||
|
||||
def extract_keywords(component: dict) -> set[str]:
    """
    Derive match keywords for a component document.

    Concatenates the name, type, notes and host fields, lowercases the
    result, tokenizes on non-alphanumerics, then keeps tokens of length
    >= 4 that are not stopwords. Finally expands known service aliases
    so synonyms in issue/commit text still match.
    """
    source_fields = ("name", "type", "notes", "host")
    blob = " ".join(component.get(field, "") for field in source_fields).lower()

    keywords = set()
    for token in re.split(r'[^a-z0-9]+', blob):
        if len(token) >= 4 and token not in STOPWORDS:
            keywords.add(token)

    # Alias groups: when the trigger word survives filtering, pull in its
    # whole synonym group (e.g. "truenas" also matches "zfs"/"pool" text).
    alias_groups = {
        "proxmox": {"pve", "proxmox"},
        "caddy": {"caddy", "reverse", "proxy", "caddyfile"},
        "matrix": {"matrix", "synapse", "element"},
        "nextcloud": {"nextcloud", "drive"},
        "gitea": {"gitea", "git"},
        "truenas": {"truenas", "tank", "zfs", "pool"},
        "opnsense": {"opnsense", "firewall", "router", "unbound"},
        "grafana": {"grafana", "dashboard"},
        "qdrant": {"qdrant", "vector"},
        "ollama": {"ollama", "llama"},
        "qwen": {"qwen", "llm", "inference"},
        "mongodb": {"mongodb", "mongo"},
    }
    for trigger, group in alias_groups.items():
        if trigger in keywords:
            keywords.update(group)

    return keywords
|
||||
|
||||
|
||||
def score_match(text: str, keywords: set[str]) -> int:
    """Count how many keywords occur as substrings of *text* (case-insensitive)."""
    haystack = text.lower()
    hits = 0
    for keyword in keywords:
        if keyword in haystack:
            hits += 1
    return hits
|
||||
|
||||
|
||||
def find_matches(component: dict, all_issues: dict, all_commits: dict) -> dict:
    """
    Find matching Gitea issues and commits for a component.

    Args:
        component: MongoDB component document (name/type/notes/host fields).
        all_issues: mapping of repo full-name -> pre-fetched issue dicts.
        all_commits: mapping of repo full-name -> pre-fetched commit dicts.

    Returns:
        A gitea_links payload: {"issues": [...], "commits": [...],
        "keywords_used": [...], "linked_at": <iso timestamp>}; or {}
        when no usable keywords could be extracted from the component.
    """
    keywords = extract_keywords(component)
    if not keywords:
        return {}

    comp_type = component.get("type", "")
    comp_name = component.get("name", "")

    # Determine which repos to search based on type affinity
    preferred_repos = REPO_AFFINITY.get(comp_type, ALL_REPOS)

    matched_issues = []
    matched_commits = []

    # Minimum score threshold — require at least 2 keyword hits
    # UNLESS the component name itself is a strong unique identifier (>=6 chars).
    # BUGFIX: STOPWORDS is all-lowercase, so compare the lowercased name;
    # the original case-sensitive check let capitalized generic names through.
    name_is_unique = len(comp_name) >= 6 and comp_name.lower() not in STOPWORDS
    min_score = 1 if name_is_unique else 2

    for repo in preferred_repos:
        issues = all_issues.get(repo, [])
        commits = all_commits.get(repo, [])

        for issue in issues:
            title = issue.get("title", "")
            body = issue.get("body", "") or ""
            # Title hits count full weight; body hits (first 500 chars) count half.
            score = score_match(title, keywords) + score_match(body[:500], keywords) * 0.5
            if score >= min_score:
                matched_issues.append({
                    "repo": repo,
                    "number": issue["number"],
                    "title": title,
                    "state": issue.get("state"),
                    "url": issue.get("html_url"),
                    "score": score,
                    "updated_at": issue.get("updated_at"),
                })

        for commit in commits:
            # Match against the first line of the commit message only.
            msg = commit.get("commit", {}).get("message", "").split("\n")[0]
            if score_match(msg, keywords) >= min_score:
                matched_commits.append({
                    "repo": repo,
                    "sha": commit.get("sha", "")[:8],
                    "message": msg,
                    "url": commit.get("html_url"),
                    "date": commit.get("commit", {}).get("author", {}).get("date"),
                })

    # Top 3 issues by score desc; top 3 commits newest-first.
    matched_issues = sorted(matched_issues, key=lambda x: -x["score"])[:3]
    # BUGFIX: "date" is always present but may be None; coalesce to "" so
    # sorted() doesn't raise TypeError comparing None with str. The old
    # x.get("date", "") default never applied because the key exists.
    matched_commits = sorted(matched_commits, key=lambda x: x.get("date") or "", reverse=True)[:3]

    # Strip internal score field before storing
    for i in matched_issues:
        i.pop("score", None)

    return {
        "issues": matched_issues,
        "commits": matched_commits,
        "keywords_used": list(keywords)[:10],
        "linked_at": datetime.now(timezone.utc).isoformat(),
    }
|
||||
|
||||
|
||||
# ── MongoDB helpers ───────────────────────────────────────────────────────────

# Lazily-created pymongo client, shared across all calls in one run.
_mongo_client = None


def get_mongo():
    """Return the homelab_infra database handle, creating the client on first use."""
    global _mongo_client
    if _mongo_client is not None:
        return _mongo_client[MONGO_DB]
    # Deferred import: keeps the script importable even where pymongo
    # isn't installed, until a DB operation is actually needed.
    import pymongo
    _mongo_client = pymongo.MongoClient(MONGO_HOST, 27017, serverSelectionTimeoutMS=5000)
    return _mongo_client[MONGO_DB]
|
||||
|
||||
|
||||
def mongo_query_all(collection):
    """Return every document in *collection* (without _id), or [] on any error."""
    try:
        cursor = get_mongo()[collection].find({}, {"_id": 0})
        return list(cursor)
    except Exception as e:
        # Best-effort: report and continue with the remaining collections.
        print(f" Mongo query error: {e}")
        return []
|
||||
|
||||
|
||||
def mongo_update_gitea_links(collection, name_value, links):
    """Overwrite the gitea_links field on the document named *name_value*.

    Returns True on success, False on any Mongo error (logged, not raised).
    """
    update_doc = {"$set": {"gitea_links": links}}
    try:
        get_mongo()[collection].update_one({"name": name_value}, update_doc)
    except Exception as e:
        print(f" Mongo update error: {e}")
        return False
    return True
|
||||
|
||||
|
||||
# ── Main ──────────────────────────────────────────────────────────────────────
|
||||
|
||||
def main():
    """Entry point: pre-fetch Gitea data, then link every MongoDB component.

    CLI flags:
        --component NAME  only process components whose name contains NAME
        --dry-run         print matches instead of writing to MongoDB
    """
    parser = argparse.ArgumentParser(description="Link Gitea issues to homelab_infra MongoDB")
    parser.add_argument("--component", type=str, help="Only process this component name")
    parser.add_argument("--dry-run", action="store_true", help="Show matches without writing to DB")
    args = parser.parse_args()

    print("=== Gitea → MongoDB infrastructure linker ===")
    print(f"Started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

    # Pre-fetch all issues and commits from all repos (batch to minimize API calls)
    print("\nFetching Gitea data...")
    all_issues = {}
    all_commits = {}

    for repo in ALL_REPOS:
        # Both open and closed issues are searched — closed issues still
        # document which repo a component belongs to.
        open_issues = get_all_issues(repo, state="open")
        closed_issues = get_all_issues(repo, state="closed")
        all_issues[repo] = open_issues + closed_issues
        all_commits[repo] = get_recent_commits(repo, limit=30)
        print(f" {repo}: {len(all_issues[repo])} issues, {len(all_commits[repo])} commits")

    # Process each collection
    total_linked = 0
    collections = ["services", "vms", "nodes", "storage"]

    for collection in collections:
        docs = mongo_query_all(collection)
        if not docs:
            continue

        print(f"\n[{collection}] {len(docs)} components...")

        for doc in docs:
            name = doc.get("name", "")
            if not name:
                continue
            # --component is a case-insensitive substring filter.
            if args.component and args.component.lower() not in name.lower():
                continue

            links = find_matches(doc, all_issues, all_commits)

            issue_count = len(links.get("issues", []))
            commit_count = len(links.get("commits", []))

            if issue_count or commit_count:
                print(f" {name}: {issue_count} issues, {commit_count} commits")
                if args.dry_run:
                    # Dry run: show what would be stored, write nothing.
                    for i in links.get("issues", []):
                        print(f" ISSUE #{i['number']}: {i['title']} [{i['state']}]")
                    for c in links.get("commits", []):
                        print(f" COMMIT {c['sha']}: {c['message'][:60]}")
                else:
                    mongo_update_gitea_links(collection, name, links)
                    total_linked += 1
            else:
                # Still write empty links to clear stale data
                if not args.dry_run:
                    mongo_update_gitea_links(collection, name, {
                        "issues": [], "commits": [],
                        "keywords_used": list(extract_keywords(doc))[:10],
                        "linked_at": datetime.now(timezone.utc).isoformat(),
                    })

    if not args.dry_run:
        print(f"\n{total_linked} components linked to Gitea data")
    print("Done.")


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user