diff --git a/diagnostics/claims_api.py b/diagnostics/claims_api.py index 712a46b..c5d5c5c 100644 --- a/diagnostics/claims_api.py +++ b/diagnostics/claims_api.py @@ -10,7 +10,9 @@ The detail endpoint is the canonical /claims/{slug} backend per Ship's 2026-04-29 brief. One round-trip, no N+1 cascade. Wikilinks resolved server-side via title→slug index built from a tree walk. """ +import asyncio import json +import logging import re import sqlite3 import time @@ -19,6 +21,8 @@ from pathlib import Path import yaml from aiohttp import web +logger = logging.getLogger("argus.claims") + # Codex tree roots — claims live in three places (Sourcer Apr 26 fix scope) CODEX_BASE = Path("/opt/teleo-eval/workspaces/main") CLAIM_TREES = [CODEX_BASE / "domains", CODEX_BASE / "foundations", CODEX_BASE / "core"] @@ -31,7 +35,14 @@ _list_cache = {"data": None, "ts": 0} _LIST_CACHE_TTL = 300 # 5 min — list view tolerates staleness _index_cache = {"by_title": None, "by_stem": None, "ts": 0} -_INDEX_CACHE_TTL = 60 # 1 min — title→slug index for wikilink resolution +_INDEX_CACHE_TTL = 300 # 5 min — title→slug index for wikilink resolution + +# Minimum normalized-stem length for prefix-fallback resolution. +# Stems shorter than this are too generic to be unambiguous in the prefix +# space (e.g. a "rio" stem would match any request starting with "rio"). +# Proper-prefix matching is much stronger than common-prefix at preventing +# spurious hits, so this can be lower than the original common-prefix anchor. +_PREFIX_ANCHOR_MIN = 16 CORS_HEADERS = {"Access-Control-Allow-Origin": "*"} @@ -71,12 +82,16 @@ def _normalize_for_match(s): _CODE_FENCE_WRAPPER_RE = re.compile(r"^\s*```(?:markdown|md)?\s*\n(.*?)\n```\s*$", re.DOTALL) -def _split_frontmatter(text): +def _split_frontmatter(text, filepath=None): """Return (frontmatter_dict, body_str) or (None, None) if not a claim file. Tolerates files wrapped in a top-level ```markdown ... ``` code fence — some agents have produced these (e.g. Montreal Protocol claim from Astra, 2024-12-09). Unwrap once before frontmatter detection. + + YAML parse failures are logged at WARNING with the file path (when + provided) so KB integrity drift surfaces in logs rather than silently + becoming 404s on the detail endpoint. """ if not text: return None, None @@ -92,7 +107,8 @@ def _split_frontmatter(text): return None, None try: fm = yaml.safe_load(text[3:end]) - except Exception: + except yaml.YAMLError as e: + logger.warning("YAML parse failed in %s: %s", filepath or "", e) return None, None if not isinstance(fm, dict): return None, None @@ -106,7 +122,7 @@ def _read_claim_file(filepath): text = filepath.read_text(encoding="utf-8") except (OSError, UnicodeDecodeError): return None, None - return _split_frontmatter(text) + return _split_frontmatter(text, filepath) # ─── Tree walk + indexing ────────────────────────────────────────────────── @@ -438,7 +454,12 @@ async def handle_claim_detail(request): One round-trip, all data resolved server-side. Wikilinks pre-resolved. """ requested_slug = request.match_info["slug"] - by_title, by_stem = _build_indexes() + # Cold-cache rebuild walks ~1,900 files (~3.3s of sync I/O). Route through + # to_thread so the aiohttp event loop stays responsive while the index + # rebuilds — concurrent requests don't all stall behind one walk. + # Warm-cache cost is a dict access (microseconds), to_thread overhead + # negligible. Ganymede review 2026-05-11. + by_title, by_stem = await asyncio.to_thread(_build_indexes) # Resolution order: exact stem → title-normalized (handles description-derived # slugs from /api/activity-feed that are longer than on-disk file stems) → @@ -454,21 +475,22 @@ async def handle_claim_detail(request): slug = resolved_stem rel_path = by_stem.get(resolved_stem) if not rel_path: - # Prefix fallback: walk stems sharing a common prefix with the request, - # pick longest match. Anchored at 32 chars to avoid spurious hits. + # Proper-prefix fallback: the requested slug should START WITH a known + # stem (covers activity-feed slugs longer than on-disk filenames). The + # earlier common-prefix variant of this was broken in two directions — + # served non-deterministic matches on same-prefix collisions, and + # missed legitimate matches under the 32-char anchor (e.g. stems + # shorter than 32 chars normalized). Ganymede review 2026-05-11. norm_req = _normalize_for_match(requested_slug) best_stem = None best_len = 0 for stem in by_stem: norm_stem = _normalize_for_match(stem) - common = 0 - for a, b in zip(norm_req, norm_stem): - if a != b: - break - common += 1 - if common >= 32 and common > best_len: + if len(norm_stem) < _PREFIX_ANCHOR_MIN: + continue # too generic in prefix space + if norm_req.startswith(norm_stem) and len(norm_stem) > best_len: best_stem = stem - best_len = common + best_len = len(norm_stem) if best_stem: slug = best_stem rel_path = by_stem.get(best_stem)