From 0eb26327fc9810999600159d7afe5569182c5596 Mon Sep 17 00:00:00 2001
From: m3taversal
Date: Sat, 9 May 2026 17:37:26 +0100
Subject: [PATCH] feat(claims): /api/claims/{slug} canonical detail endpoint
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Implements Ship's claim detail contract — one round-trip, all data
resolved server-side. Replaces the thin domain-only stub with a full
tree walk (domains/ + foundations/ + core/), DB joins for PRs and
reviews, and server-side wikilink resolution to eliminate frontend
N+1 cascades.

Response shape (Ship brief 2026-04-29):
  slug, title, domain, secondary_domains, confidence, description,
  created, last_review, body (raw markdown), sourced_from, reviews,
  prs, edges {supports, challenges, related, depends_on}, wikilinks

Wikilink resolution:
- Builds a title→stem index from the frontmatter title field, with
  fallback to the filename stem normalized via _normalize_for_match
- Returns a flat {link_text: slug_or_null} map; unresolved → null so
  the frontend can render plain text
- Inline normalization (lowercase, hyphen↔space, collapse whitespace,
  strip punctuation). Note: lib/attribution.py exposes only
  normalize_handle today, not the title normalizer Ship referenced.
  If a canonical helper lands later, point at it.

Caches:
- title→slug index: 60s TTL (warm cache <20ms p50 verified)
- list endpoint: 5min TTL (preserved from prior)
- Cold: ~3.3s for tree walk of 1,866 files; warm: 13-17ms

Bug fixed in second pass:
- _resolve_sourced_from defaulted title="", which leaked LIKE '%%' and
  matched every PR. Now requires non-empty title+stem; the handler
  falls back to slug.replace("-", " ") when the frontmatter title is
  missing.

Verified live on VPS:
- AI diagnostic triage claim (no fm.title): sourced_from=1, prs=0
  (correct — Feb claim, pre-description-tracking)
- Recent extract PR claim: sourced_from=1 with URL, prs=1, reviews=1,
  last_review populated, edges 3 supports + 7 related, wikilinks 0
- 404 on missing slug: correct
- Claim with [[maps/...]] wikilink: 5/6 resolved (correct null on map)

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 diagnostics/claims_api.py | 507 ++++++++++++++++++++++++++++++++------
 1 file changed, 428 insertions(+), 79 deletions(-)

diff --git a/diagnostics/claims_api.py b/diagnostics/claims_api.py
index 90bdf73..f18c5d5 100644
--- a/diagnostics/claims_api.py
+++ b/diagnostics/claims_api.py
@@ -1,79 +1,385 @@
-"""Claims API endpoint — serves claim data from the codex filesystem."""
-import os
+"""Claims API — list endpoint + canonical claim detail page.
+
+Owner: Argus
+Routes:
+    GET /api/claims        — list/filter (frontmatter scan, lightweight)
+    GET /api/claims/{slug} — full claim detail (Ship contract)
+    GET /api/domains       — domain rollups for sidebar
+
+The detail endpoint is the canonical /claims/{slug} backend per Ship's
+2026-04-29 brief. One round-trip, no N+1 cascade. Wikilinks resolved
+server-side via title→slug index built from a tree walk.
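+
+Example (illustrative, abridged; slug and field values are hypothetical):
+
+    GET /api/claims/spike-timing-plasticity
+    {
+        "slug": "spike-timing-plasticity",
+        "title": "Spike timing plasticity",
+        "domain": "neuroscience",
+        "confidence": "medium",
+        "edges": {"supports": [...], "challenges": [], ...},
+        "wikilinks": {"Hebbian learning": "hebbian-learning", "stale ref": null}
+    }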
+""" +import json import re +import sqlite3 import time -import yaml from pathlib import Path + +import yaml from aiohttp import web -CODEX_ROOT = Path("/opt/teleo-eval/workspaces/main/domains") -_cache = {"data": None, "ts": 0} -CACHE_TTL = 300 # 5 minutes +# Codex tree roots — claims live in three places (Sourcer Apr 26 fix scope) +CODEX_BASE = Path("/opt/teleo-eval/workspaces/main") +CLAIM_TREES = [CODEX_BASE / "domains", CODEX_BASE / "foundations", CODEX_BASE / "core"] -def _parse_frontmatter(filepath): +# pipeline.db for joins (review_records, prs, sources) +DB_PATH = "/opt/teleo-eval/pipeline/pipeline.db" + +# In-process caches +_list_cache = {"data": None, "ts": 0} +_LIST_CACHE_TTL = 300 # 5 min — list view tolerates staleness + +_index_cache = {"by_title": None, "by_stem": None, "ts": 0} +_INDEX_CACHE_TTL = 60 # 1 min — title→slug index for wikilink resolution + +CORS_HEADERS = {"Access-Control-Allow-Origin": "*"} + +# Wikilink pattern. [[text]] or [[text|alias]] — we keep the link text only. +_WIKILINK_RE = re.compile(r"\[\[([^\]|#]+?)(?:[#|][^\]]*)?\]\]") + + +# ─── Normalization ───────────────────────────────────────────────────────── + +def _normalize_for_match(s): + """Collapse a title or slug to a comparable form. + + Rules (from Ship's brief — match the link-fixer canonicalization): + - lowercase + - hyphen ↔ space tolerant (both → single space) + - collapse runs of whitespace + - strip leading/trailing whitespace + - drop trailing punctuation that gets stripped from filenames + (`.`, `?`, `!`, `:`, `--`) + NOTE: lib/attribution.py exposes only normalize_handle today, not the + title normalizer Ship referenced. Implementing inline; if a canonical + helper lands later we point at it. + """ + if not s: + return "" + s = str(s).lower().strip() + # Treat hyphens as spaces, then collapse whitespace runs + s = s.replace("-", " ").replace("_", " ") + s = re.sub(r"\s+", " ", s) + # Strip ASCII punctuation that filenames drop + s = re.sub(r"[^\w\s]", "", s) + return s.strip() + + +# ─── Frontmatter parse ───────────────────────────────────────────────────── + +def _split_frontmatter(text): + """Return (frontmatter_dict, body_str) or (None, None) if not a claim file.""" + if not text.startswith("---"): + return None, None + try: + end = text.index("\n---", 3) + except ValueError: + return None, None + try: + fm = yaml.safe_load(text[3:end]) + except Exception: + return None, None + if not isinstance(fm, dict): + return None, None + body = text[end + 4:].lstrip() + return fm, body + + +def _read_claim_file(filepath): + """Read a claim file from disk. 

-def _load_all_claims():
-    now = time.time()
-    if _cache["data"] and now - _cache["ts"] < CACHE_TTL:
-        return _cache["data"]
+# ─── Tree walk + indexing ───────────────────────────────────────────────────

-    claims = []
-    for domain_dir in sorted(CODEX_ROOT.iterdir()):
-        if not domain_dir.is_dir():
+def _walk_claim_files():
+    """Yield Path objects for every .md claim file in domains/, foundations/, core/."""
+    for root in CLAIM_TREES:
+        if not root.exists():
             continue
-        for f in sorted(domain_dir.glob("*.md")):
+        for f in root.rglob("*.md"):
             if f.name == "_map.md":
                 continue
-            c = _parse_frontmatter(f)
-            if c:
-                claims.append(c)
+            yield f
+
+
+def _build_indexes():
+    """Build (title→stem, stem→relpath) indexes for wikilink resolution.
+
+    Cached for _INDEX_CACHE_TTL. Built from a filesystem walk of the
+    claim trees; repeat requests inside the TTL window skip the walk.
+    """
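+    # Shape sketch (hypothetical entries):
+    #   by_title: {"spike timing plasticity": "spike-timing-plasticity"}
+    #   by_stem:  {"spike-timing-plasticity": "domains/neuro/spike-timing-plasticity.md"}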
+ """ + now = time.time() + if _index_cache["by_title"] is not None and now - _index_cache["ts"] < _INDEX_CACHE_TTL: + return _index_cache["by_title"], _index_cache["by_stem"] + + by_title = {} + by_stem = {} + for f in _walk_claim_files(): + stem = f.stem + rel = str(f.relative_to(CODEX_BASE)) + by_stem[stem] = rel + # Index by stem-as-normalized too (covers wikilinks that use the slug) + by_title[_normalize_for_match(stem)] = stem + # Also try parsing the title from frontmatter for higher-fidelity matches + fm, _ = _read_claim_file(f) + if fm: + title = fm.get("title") + if title: + key = _normalize_for_match(title) + if key and key not in by_title: + by_title[key] = stem + + _index_cache["by_title"] = by_title + _index_cache["by_stem"] = by_stem + _index_cache["ts"] = now + return by_title, by_stem + + +def _resolve_wikilinks(body, by_title): + """Extract [[link]] occurrences from body, return {link_text: slug_or_null}.""" + out = {} + for match in _WIKILINK_RE.finditer(body or ""): + link_text = match.group(1).strip() + if not link_text or link_text in out: + continue + norm = _normalize_for_match(link_text) + out[link_text] = by_title.get(norm) + return out + + +# ─── Edge extraction from frontmatter ────────────────────────────────────── + +_EDGE_FIELDS = { + "supports": "supports", + "challenges": "challenges", + "challenged_by": "challenges", # canonical: store as challenges direction + "related": "related", + "related_claims": "related", + "depends_on": "depends_on", +} + + +def _extract_edges(fm, by_title, by_stem): + """Return edges dict shaped per Ship's contract. + + Each edge is {slug, title, exists}. Slug resolved through title index. + """ + edges = {"supports": [], "challenges": [], "related": [], "depends_on": []} + + for fm_key, edge_kind in _EDGE_FIELDS.items(): + raw = fm.get(fm_key) + if not raw: + continue + items = raw if isinstance(raw, list) else [raw] + for item in items: + if not isinstance(item, str): + continue + text = item.strip() + # Strip wikilink wrapping if present + text = re.sub(r"^\[\[|\]\]$", "", text) + # Strip pipe annotations: "[[link|alias]]" style or "claim | edge_type | date" + text = text.split("|")[0].strip() + if not text: + continue + # Try title match first, fall back to stem match + slug = by_title.get(_normalize_for_match(text)) + if not slug and text in by_stem: + slug = text + edges[edge_kind].append({ + "slug": slug, + "title": text, + "exists": slug is not None, + }) + + return edges + + +# ─── Source provenance ───────────────────────────────────────────────────── + +def _resolve_sourced_from(conn, claim_filepath, fm, title, stem): + """Build sourced_from list for the claim. + + Strategy: find PRs that produced this claim (via prs.description LIKE + or branch slug match), look at prs.source_path → inbox archive file → + parse that source's frontmatter for title/url. Falls back to the raw + `source` string from the claim's own frontmatter. + + Both `title` and `stem` must be non-empty — caller (handler) already + falls back stem→title; passing empty values would leak `LIKE '%%'` + and match unrelated PRs. + """ + out = [] + seen_paths = set() + pr_rows = [] + if (title or "").strip() and (stem or "").strip(): + try: + pr_rows = conn.execute( + """SELECT DISTINCT source_path + FROM prs + WHERE source_path IS NOT NULL AND source_path != '' + AND (description LIKE ? OR branch LIKE ?) 
+ LIMIT 10""", + (f"%{title}%", f"%{stem}%"), + ).fetchall() + except sqlite3.OperationalError: + pr_rows = [] + + for row in pr_rows: + path = row["source_path"] + if not path or path in seen_paths: + continue + seen_paths.add(path) + out.append(_resolve_source_file(path)) + + # 2. Fallback: parse raw source frontmatter field if no PR match + if not out: + raw = fm.get("source") + if isinstance(raw, str) and raw.strip(): + out.append({"path": None, "title": raw.strip()[:200], "url": None}) + + return out + + +def _resolve_source_file(rel_path): + """Given inbox/archive/... path, parse frontmatter for title+url. Best-effort.""" + full = CODEX_BASE / rel_path + entry = {"path": rel_path, "title": None, "url": None} + if full.exists(): + fm, _ = _read_claim_file(full) + if fm: + entry["title"] = fm.get("title") or fm.get("source") or rel_path + entry["url"] = fm.get("url") + if not entry["title"]: + # Last resort: derive from filename + entry["title"] = Path(rel_path).stem.replace("-", " ") + return entry + + +# ─── Reviews + PRs ───────────────────────────────────────────────────────── + +def _load_pr_history(conn, title, stem): + """Find PRs that touched this claim and their reviews. + + Both title and stem must be non-empty strings — empty leaks `LIKE '%%'` + which matches every PR. Handler already populates a fallback so this + is a defense-in-depth guard. + """ + if not (title or "").strip() or not (stem or "").strip(): + return [], [] + + try: + pr_rows = conn.execute( + """SELECT number, merged_at, commit_type, agent, branch, status + FROM prs + WHERE merged_at IS NOT NULL + AND (description LIKE ? OR branch LIKE ?) + ORDER BY merged_at ASC + LIMIT 50""", + (f"%{title}%", f"%{stem}%"), + ).fetchall() + except sqlite3.OperationalError: + return [], [] + + prs = [ + { + "number": r["number"], + "merged_at": r["merged_at"], + "kind": r["commit_type"] or "unknown", + "agent": r["agent"], + "branch": r["branch"], + } + for r in pr_rows + ] + + pr_numbers = [p["number"] for p in prs] + if not pr_numbers: + return prs, [] + + placeholders = ",".join("?" 
+    try:
+        review_rows = conn.execute(
+            f"""SELECT pr_number, reviewer, reviewer_model, outcome,
+                       rejection_reason, notes, reviewed_at
+                FROM review_records
+                WHERE pr_number IN ({placeholders})
+                ORDER BY reviewed_at ASC""",
+            pr_numbers,
+        ).fetchall()
+    except sqlite3.OperationalError:
+        review_rows = []
+
+    reviews = [
+        {
+            "pr_number": r["pr_number"],
+            "reviewer": r["reviewer"],
+            "model": r["reviewer_model"],
+            "outcome": r["outcome"],
+            "rejection_reason": r["rejection_reason"],
+            "notes": r["notes"],
+            "reviewed_at": r["reviewed_at"],
+        }
+        for r in review_rows
+    ]
+    return prs, reviews
+
+
+# ─── List view (preserved) ──────────────────────────────────────────────────
+
+def _parse_list_entry(filepath):
+    fm, body = _read_claim_file(filepath)
+    if not fm or fm.get("type") != "claim":
+        return None
+    links = _WIKILINK_RE.findall(body or "")
+    paragraphs = [p.strip() for p in (body or "").split("\n\n")
+                  if p.strip() and not p.strip().startswith("#")]
+    summary = paragraphs[0][:300] if paragraphs else ""
+    return {
+        "slug": filepath.stem,
+        "title": fm.get("title", filepath.stem.replace("-", " ")),
+        "domain": fm.get("domain", "unknown"),
+        "confidence": fm.get("confidence", "unknown"),
+        "agent": fm.get("agent"),
+        "scope": fm.get("scope"),
+        "created": str(fm.get("created", "")),
+        "source": fm.get("source", "") if isinstance(fm.get("source"), str) else "",
+        "sourcer": fm.get("sourcer", ""),
+        "wiki_link_count": len(links),
+        "summary": summary,
+        "challenged_by": fm.get("challenged_by"),
+        "related_claims": fm.get("related_claims", []),
+    }
+
+
+def _load_all_claims_list():
+    now = time.time()
+    if _list_cache["data"] and now - _list_cache["ts"] < _LIST_CACHE_TTL:
+        return _list_cache["data"]
+    claims = []
+    for f in _walk_claim_files():
+        entry = _parse_list_entry(f)
+        if entry:
+            claims.append(entry)
+    _list_cache["data"] = claims
+    _list_cache["ts"] = now
     return claims


-async def handle_claims(request):
-    claims = _load_all_claims()
+# ─── Handlers ───────────────────────────────────────────────────────────────
+
+async def handle_claims(request):
+    claims = _load_all_claims_list()

-    # Filters
     domain = request.query.get("domain")
     search = request.query.get("q", "").lower()
     confidence = request.query.get("confidence")
     agent = request.query.get("agent")
-    sort = request.query.get("sort", "recent")  # recent, alpha, domain
+    sort = request.query.get("sort", "recent")

     filtered = claims
     if domain:
@@ -83,9 +389,9 @@ async def handle_claims(request):
     if agent:
         filtered = [c for c in filtered if c["agent"] == agent]
     if search:
-        filtered = [c for c in filtered if search in c["title"].lower() or search in c["summary"].lower()]
+        filtered = [c for c in filtered
+                    if search in c["title"].lower() or search in c["summary"].lower()]

-    # Sort
     if sort == "recent":
         filtered.sort(key=lambda c: c["created"], reverse=True)
     elif sort == "alpha":
@@ -93,12 +399,10 @@
     elif sort == "domain":
         filtered.sort(key=lambda c: (c["domain"], c["title"].lower()))

-    # Pagination
     limit = min(int(request.query.get("limit", "50")), 200)
     offset = int(request.query.get("offset", "0"))
-    page = filtered[offset:offset+limit]
+    page = filtered[offset:offset + limit]

-    # Domain counts for sidebar
     domain_counts = {}
     for c in claims:
         domain_counts[c["domain"]] = domain_counts.get(c["domain"], 0) + 1
@@ -111,31 +415,78 @@
         "domains": dict(sorted(domain_counts.items(), key=lambda x: -x[1])),
         "confidence_levels": sorted(set(c["confidence"] for c in claims)),
         "agents": sorted(set(c["agent"] for c in claims if c["agent"])),
-    }, headers={"Access-Control-Allow-Origin": "*"})
+    }, headers=CORS_HEADERS)
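+
+# Example list query (illustrative):
+#   GET /api/claims?domain=neuroscience&sort=alpha&limit=20
+# q is matched as a lowercase substring over title+summary; results are
+# filtered, then sorted, then paginated.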
sorted(set(c["confidence"] for c in claims)), "agents": sorted(set(c["agent"] for c in claims if c["agent"])), - }, headers={"Access-Control-Allow-Origin": "*"}) + }, headers=CORS_HEADERS) async def handle_claim_detail(request): + """GET /api/claims/{slug} — canonical claim detail page (Ship contract). + + One round-trip, all data resolved server-side. Wikilinks pre-resolved. + """ slug = request.match_info["slug"] - claims = _load_all_claims() - for c in claims: - if c["slug"] == slug: - # Read full body for detail view - for domain_dir in CODEX_ROOT.iterdir(): - if not domain_dir.is_dir(): - continue - f = domain_dir / f"{slug}.md" - if f.exists(): - text = f.read_text(encoding="utf-8") - end = text.index("---", 3) - body = text[end+3:].strip() - c["body"] = body - break - return web.json_response(c, headers={"Access-Control-Allow-Origin": "*"}) - return web.json_response({"error": "claim not found"}, status=404) + by_title, by_stem = _build_indexes() + + rel_path = by_stem.get(slug) + if not rel_path: + return web.json_response({"error": "claim not found", "slug": slug}, + status=404, headers=CORS_HEADERS) + + filepath = CODEX_BASE / rel_path + fm, body = _read_claim_file(filepath) + if not fm: + return web.json_response({"error": "frontmatter parse failed", "slug": slug}, + status=500, headers=CORS_HEADERS) + + # Open read-only DB connection for this request + conn = sqlite3.connect(f"file:{DB_PATH}?mode=ro", uri=True) + conn.row_factory = sqlite3.Row + try: + title = fm.get("title") or slug.replace("-", " ") + prs, reviews = _load_pr_history(conn, title, slug) + sourced_from = _resolve_sourced_from(conn, filepath, fm, title, slug) + finally: + conn.close() + + last_review = None + if reviews: + latest = reviews[-1] + last_review = { + "outcome": latest["outcome"], + "reviewer": latest["reviewer"], + "date": (latest["reviewed_at"] or "")[:10], + } + + # secondary_domains: explicit list, or empty + secondary = fm.get("secondary_domains") or fm.get("cross_domain_links") or [] + if isinstance(secondary, str): + secondary = [secondary] + + description = fm.get("description") or "" + + edges = _extract_edges(fm, by_title, by_stem) + wikilinks = _resolve_wikilinks(body, by_title) + + response = { + "slug": slug, + "title": title, + "domain": fm.get("domain", "unknown"), + "secondary_domains": secondary, + "confidence": fm.get("confidence", "unknown"), + "description": description, + "created": str(fm.get("created", "")), + "last_review": last_review, + "body": body or "", + "sourced_from": sourced_from, + "reviews": reviews, + "prs": prs, + "edges": edges, + "wikilinks": wikilinks, + } + return web.json_response(response, headers=CORS_HEADERS) async def handle_domains(request): - claims = _load_all_claims() + claims = _load_all_claims_list() domains = {} for c in claims: d = c["domain"] @@ -146,13 +497,11 @@ async def handle_domains(request): domains[d]["agents"].add(c["agent"]) conf = c["confidence"] domains[d]["confidence_dist"][conf] = domains[d]["confidence_dist"].get(conf, 0) + 1 - result = [] for d in sorted(domains.values(), key=lambda x: -x["count"]): d["agents"] = sorted(d["agents"]) result.append(d) - - return web.json_response(result, headers={"Access-Control-Allow-Origin": "*"}) + return web.json_response(result, headers=CORS_HEADERS) def register_claims_routes(app):