Imports 67 files from VPS (/opt/teleo-eval/) into repo as the single source of truth. Previously only 8 of 67 files existed in repo — the rest were deployed directly to VPS via SCP, causing massive drift. Includes: - pipeline/lib/: 33 Python modules (daemon core, extraction, evaluation, merge, cascade, cross-domain, costs, attribution, etc.) - pipeline/: main daemon (teleo-pipeline.py), reweave.py, batch-extract-50.sh - diagnostics/: 19 files (4-page dashboard, alerting, daily digest, review queue, tier1 metrics) - agent-state/: bootstrap, lib-state, cascade inbox processor, schema - systemd/: service unit files for reference - deploy.sh: rsync-based deploy with --dry-run, syntax checks, dirty-tree gate - research-session.sh: updated with Step 8.5 digest + cascade inbox processing No new code written — all files are exact copies from VPS as of 2026-04-06. From this point forward: edit in repo, commit, then deploy.sh. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
196 lines · 6.3 KiB · Python
"""Claim index generator — structured index of all KB claims.
|
|
|
|
Produces claim-index.json: every claim with title, domain, confidence,
|
|
wiki links (outgoing + incoming counts), created date, word count,
|
|
challenged_by status. Consumed by:
|
|
- Argus (diagnostics dashboard — charts, vital signs)
|
|
- Vida (KB health diagnostics — orphan ratio, linkage density, freshness)
|
|
- Extraction prompt (KB index for dedup — could replace /tmp/kb-indexes/)
|
|
|
|
Generated after each merge (post-merge hook) or on demand.
|
|
Served via GET /claim-index on the health API.
|
|
|
|
Epimetheus owns this module.
|
|
"""
|
|
|
|
import json
import logging
import re
from datetime import date, datetime, timezone
from pathlib import Path

from . import config
|
|
|
|
# Module-scoped logger, namespaced under the pipeline logger hierarchy.
logger = logging.getLogger("pipeline.claim_index")

# Matches [[wiki-link]] spans; group 1 is the link target text.
WIKI_LINK_RE = re.compile(r"\[\[([^\]]+)\]\]")
|
|
|
|
|
|
def _parse_frontmatter(text: str) -> dict | None:
|
|
"""Quick YAML frontmatter parser."""
|
|
if not text.startswith("---"):
|
|
return None
|
|
end = text.find("---", 3)
|
|
if end == -1:
|
|
return None
|
|
raw = text[3:end]
|
|
|
|
try:
|
|
import yaml
|
|
fm = yaml.safe_load(raw)
|
|
return fm if isinstance(fm, dict) else None
|
|
except ImportError:
|
|
pass
|
|
except Exception:
|
|
return None
|
|
|
|
# Fallback parser
|
|
fm = {}
|
|
for line in raw.strip().split("\n"):
|
|
line = line.strip()
|
|
if not line or line.startswith("#"):
|
|
continue
|
|
if ":" not in line:
|
|
continue
|
|
key, _, val = line.partition(":")
|
|
key = key.strip()
|
|
val = val.strip().strip('"').strip("'")
|
|
if val.lower() == "null" or val == "":
|
|
val = None
|
|
fm[key] = val
|
|
return fm if fm else None
|
|
|
|
|
|
def build_claim_index(repo_root: str | None = None) -> dict:
    """Build the full claim index from the repo.

    Scans domains/, core/, foundations/ and decisions/ for ``*.md`` claim
    files, parses their frontmatter, and computes link statistics.

    Args:
        repo_root: Repo root to scan; defaults to config.MAIN_WORKTREE.

    Returns {generated_at, total_claims, claims: [...], domains: {...}}
    plus orphan_count, orphan_ratio and cross_domain_links.
    """
    base = Path(repo_root) if repo_root else config.MAIN_WORKTREE
    claims = []
    all_stems: dict[str, str] = {}  # stem → filepath (for incoming link counting)

    # Phase 1: Collect all claims with outgoing links
    for subdir in ["domains", "core", "foundations", "decisions"]:
        full = base / subdir
        if not full.is_dir():
            continue
        for f in full.rglob("*.md"):
            if f.name.startswith("_"):
                continue  # Underscore-prefixed files are not claims.

            try:
                # KB files are written as UTF-8; unreadable/undecodable
                # files are skipped rather than aborting the whole index.
                content = f.read_text(encoding="utf-8")
            except Exception:
                continue

            fm = _parse_frontmatter(content)
            if fm is None:
                continue

            ftype = fm.get("type")
            if ftype not in ("claim", "framework", None):
                continue  # Skip entities, sources, etc.

            # Extract wiki links from the body (text after the closing fence)
            body_start = content.find("---", 3)
            body = content[body_start + 3:] if body_start > 0 else content
            outgoing_links = [link.strip() for link in WIKI_LINK_RE.findall(body) if link.strip()]

            # Relative path from repo root
            rel_path = str(f.relative_to(base))

            # Word count (body only, not frontmatter; stop before the
            # "Relevant Notes" separator)
            body_text = re.sub(r"^# .+\n", "", body).strip()
            body_text = re.split(r"\n---\n", body_text)[0]
            word_count = len(body_text.split())

            # Check for challenged_by
            has_challenged_by = bool(fm.get("challenged_by"))

            # Created date (YAML parses bare dates as datetime.date —
            # normalize to ISO string so the index is JSON-serializable)
            created = fm.get("created")
            if isinstance(created, date):
                created = created.isoformat()

            claim = {
                "file": rel_path,
                "stem": f.stem,
                "title": f.stem.replace("-", " "),
                # `or` (not a .get default) so an explicit `domain: null`
                # also falls back to the subdirectory name.
                "domain": fm.get("domain") or subdir,
                "confidence": fm.get("confidence"),
                "created": created,
                "outgoing_links": outgoing_links,
                "outgoing_count": len(outgoing_links),
                "incoming_count": 0,  # Computed in phase 2
                "has_challenged_by": has_challenged_by,
                "word_count": word_count,
                "type": ftype or "claim",
            }
            claims.append(claim)
            all_stems[f.stem] = rel_path

    # Phase 2: Count incoming links
    incoming_counts: dict[str, int] = {}
    for claim in claims:
        for link in claim["outgoing_links"]:
            if link in all_stems:
                incoming_counts[link] = incoming_counts.get(link, 0) + 1

    for claim in claims:
        claim["incoming_count"] = incoming_counts.get(claim["stem"], 0)

    # Domain summary
    domain_counts: dict[str, int] = {}
    for claim in claims:
        d = claim["domain"]
        domain_counts[d] = domain_counts.get(d, 0) + 1

    # Orphan detection (0 incoming links)
    orphans = sum(1 for c in claims if c["incoming_count"] == 0)

    # Cross-domain links.  Precompute stem → {domains} once instead of
    # rescanning the whole claim list per link (was O(claims² × links)).
    # A link counts when ANY claim sharing that stem lives in a
    # different domain — same semantics as the original linear scan.
    stem_domains: dict[str, set] = {}
    for claim in claims:
        stem_domains.setdefault(claim["stem"], set()).add(claim["domain"])

    cross_domain_links = 0
    for claim in claims:
        claim_domain = claim["domain"]
        for link in claim["outgoing_links"]:
            domains = stem_domains.get(link)
            if domains and any(d != claim_domain for d in domains):
                cross_domain_links += 1

    index = {
        # Timezone-aware replacement for deprecated datetime.utcnow();
        # emitted format is unchanged ("...Z" suffix).
        "generated_at": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
        "total_claims": len(claims),
        "domains": domain_counts,
        "orphan_count": orphans,
        "orphan_ratio": round(orphans / len(claims), 3) if claims else 0,
        "cross_domain_links": cross_domain_links,
        "claims": claims,
    }

    return index
|
|
|
|
|
|
def write_claim_index(repo_root: str | None = None, output_path: str | None = None) -> str:
    """Build and write claim-index.json. Returns the output path.

    The write is atomic: the index is dumped to a sibling ``.tmp`` file
    in the destination directory and then moved over the target, so
    readers (health API, dashboards) never observe a half-written file.
    """
    index = build_claim_index(repo_root)

    if output_path is None:
        output_path = str(Path.home() / ".pentagon" / "workspace" / "collective" / "claim-index.json")

    Path(output_path).parent.mkdir(parents=True, exist_ok=True)

    # Atomic write.  Path.replace (os.replace) overwrites an existing
    # destination atomically on all platforms, unlike os.rename which
    # raises on Windows when the target already exists.
    tmp = output_path + ".tmp"
    with open(tmp, "w", encoding="utf-8") as f:
        json.dump(index, f, indent=2)
    Path(tmp).replace(output_path)

    logger.info("Wrote claim-index.json: %d claims, %d orphans, %d cross-domain links",
                index["total_claims"], index["orphan_count"], index["cross_domain_links"])

    return output_path
|