fix(activity): re-apply source classifier + add date-prefix slug fallback

Regression: aeae712's source/create distinction was lost — VPS reverted to pre-aeae712 behavior where every extract/* knowledge PR returned type=create regardless of whether a claim was written. Source archives surfaced as "New claim" chips with date-prefix slugs that 404 on click.

Root cause: aeae712 was deployed via local file copy and never pushed to origin; a subsequent rsync from origin/main overwrote it with the older classifier. This branch ships from origin so deploy.sh's repo-first gate makes recurrence impossible.

- Restore aeae712: extract/* + empty description -> source, with empty claim_slug + source_slug field, ci_earned 0.15
- Add Leo's regex fallback: candidate_slug matching ^\d{4}-\d{2}-\d{2}-.+-[a-f0-9]{4}$ -> source regardless of branch/commit_type/description state. Catches edge cases where description leaks but is just a source title (slugified into the inbox filename pattern), not a claim insight.
- Add 'challenge' to _FEED_COMMIT_TYPES (latent bug — challenge PRs would be filtered out before classification because the filter list omitted them; memory says 0 challenges exist so it never triggered, but schema support belongs in the filter)
- _build_events: compute candidate slug before classify so the regex fallback has a slug to inspect

Verified locally on Leo's example PRs (#4014, #4016) — both classify as source. VPS smoke pending deploy.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-26 13:47:00 +01:00 · 2026-04-26 13:47:00 +01:00 · 926a397839
commit 926a397839
parent 3fe524dd14
1 changed file with 93 additions and 19 deletions
--- a/diagnostics/activity_feed_api.py
+++ b/diagnostics/activity_feed_api.py
@ -9,6 +9,16 @@ DB_PATH = "/opt/teleo-eval/pipeline/pipeline.db"
 _cache = {"data": None, "ts": 0}
 CACHE_TTL = 60  # 1 minute — activity should feel fresh
 # commit_types we surface in the activity feed. `pipeline` is system
 # maintenance (reweave/fix auto-runs, zombie cleanup) and stays hidden.
 _FEED_COMMIT_TYPES = ("knowledge", "enrich", "challenge", "research", "entity", "extract", "reweave")
 # Source-archive slugs follow YYYY-MM-DD-publisher-topic-HASH4 — they're
 # inbox archive filenames, not claim slugs. Used as a fallback signal when
 # branch/description heuristics miss (e.g. populated descriptions that
 # happen to be source titles, not claim insights).
 _SOURCE_SLUG_PATTERN = re.compile(r"^\d{4}-\d{2}-\d{2}-.+-[a-f0-9]{4}$")
 def _get_conn():
    conn = sqlite3.connect(DB_PATH)
@ -17,19 +27,52 @@ def _get_conn():
    return conn
-def _classify_event(branch, description, commit_type):
+def _is_source_slug(slug):
-    if commit_type != "knowledge":
+    return bool(slug and _SOURCE_SLUG_PATTERN.match(slug))
 def _classify_event(branch, description, commit_type, candidate_slug=None):
    """Return one of: create | enrich | challenge | source | None.
    Source-archive PRs are extract/* branches that filed a source into
    inbox/archive/ but didn't produce a claim. Two signals classify them
    as 'source' (defense in depth):
      1. extract/* branch with empty description (no claim title produced)
      2. candidate_slug matches YYYY-MM-DD-...-HASH4 (inbox filename pattern)
    """
    commit_type_l = (commit_type or "").lower()
    branch = branch or ""
    description_lower = (description or "").lower()
    has_desc = bool(description and description.strip())
    if commit_type_l not in _FEED_COMMIT_TYPES:
        return None
-    if branch and branch.startswith("extract/"):
+
-        return "create"
+    # Explicit challenge signals win first.
-    if branch and branch.startswith("reweave/"):
+    if (commit_type_l == "challenge"
-        return "enrich"
+            or branch.startswith("challenge/")
-    if branch and branch.startswith("challenge/"):
+            or "challenged_by" in description_lower):
        return "challenge"
-    if description and "challenged_by" in description.lower():
+
-        return "challenge"
+    # Enrichment: reweave edge-connects, enrich/ branches, or commit_type=enrich.
-    if branch and branch.startswith("enrich/"):
+    if (commit_type_l == "enrich"
            or branch.startswith("enrich/")
            or branch.startswith("reweave/")):
        return "enrich"
    # Source-only: extract/* with no claim description means inbox archive
    # landed but no domain claim was written.
    if branch.startswith("extract/") and not has_desc:
        return "source"
    # Belt-and-suspenders: if the slug we'd surface to the frontend looks
    # like an inbox archive filename (date-prefix-hash), treat as source
    # regardless of branch/commit_type/description state. Catches cases
    # where description leaked but is just a source title, not a claim.
    if _is_source_slug(candidate_slug):
        return "source"
     # Everything else that reaches this point is treated as a new claim.
    return "create"
@ -81,33 +124,60 @@ def _hot_score(challenge_count, enrich_count, signal_count, hours_since):
 def _build_events():
    conn = _get_conn()
    try:
-        rows = conn.execute("""
+        placeholders = ",".join("?" * len(_FEED_COMMIT_TYPES))
        rows = conn.execute(f"""
            SELECT p.number, p.branch, p.domain, p.agent, p.submitted_by,
                   p.merged_at, p.description, p.commit_type, p.cost_usd,
-                   p.source_channel
+                   p.source_channel, p.source_path
            FROM prs p
            WHERE p.status = 'merged'
-              AND p.commit_type = 'knowledge'
+              AND p.commit_type IN ({placeholders})
              AND p.merged_at IS NOT NULL
            ORDER BY p.merged_at DESC
            LIMIT 2000
-        """).fetchall()
+        """, _FEED_COMMIT_TYPES).fetchall()
        events = []
        claim_activity = {}  # slug -> {challenges, enriches, signals, first_seen}
        for row in rows:
-            event_type = _classify_event(row["branch"], row["description"], row["commit_type"])
+            slugs = _extract_claim_slugs(row["description"], row["branch"])
            candidate_slug = slugs[0] if slugs else ""
            event_type = _classify_event(
                row["branch"], row["description"], row["commit_type"],
                candidate_slug=candidate_slug,
            )
            if not event_type:
                continue
            contributor = _normalize_contributor(row["submitted_by"], row["agent"])
            slugs = _extract_claim_slugs(row["description"], row["branch"])
            merged_at = row["merged_at"] or ""
-            ci_map = {"create": 0.35, "enrich": 0.25, "challenge": 0.40}
+            ci_map = {"create": 0.35, "enrich": 0.25, "challenge": 0.40, "source": 0.15}
            ci_earned = ci_map.get(event_type, 0)
            # Source events never carry a claim_slug — no claim was written —
            # so the frontend can't produce a 404-ing claim link.
            if event_type == "source":
                summary_text = _summary_from_branch(row["branch"])
                source_slug = (
                    _summary_from_branch(row["branch"]).lower().replace(" ", "-")
                    or row["branch"]
                )
                events.append({
                    "type": "source",
                    "claim_slug": "",
                    "source_slug": source_slug,
                    "domain": row["domain"] or "unknown",
                    "contributor": contributor,
                    "timestamp": merged_at,
                    "ci_earned": round(ci_earned, 2),
                    "summary": summary_text,
                    "pr_number": row["number"],
                    "source_channel": row["source_channel"] or "unknown",
                })
                continue
            for slug in slugs:
                if slug not in claim_activity:
                    claim_activity[slug] = {
@ -164,8 +234,8 @@ def _sort_events(events, claim_activity, sort_mode, now_ts):
            return _hot_score(ca["challenges"], ca["enriches"], ca["signals"], hours)
        events.sort(key=hot_key, reverse=True)
    elif sort_mode == "important":
-        type_rank = {"challenge": 0, "enrich": 1, "create": 2}
+        type_rank = {"challenge": 0, "enrich": 1, "create": 2, "source": 3}
-        events.sort(key=lambda e: (type_rank.get(e["type"], 3), -len(e["summary"])))
+        events.sort(key=lambda e: (type_rank.get(e["type"], 4), -len(e["summary"])))
    return events
@ -175,6 +245,8 @@ async def handle_activity_feed(request):
        sort_mode = "recent"
    domain = request.query.get("domain", "")
    contributor = request.query.get("contributor", "")
    type_param = request.query.get("type", "")
    type_filter = {t.strip() for t in type_param.split(",") if t.strip()} if type_param else None
    try:
        limit = min(int(request.query.get("limit", "20")), 100)
    except ValueError:
@ -196,6 +268,8 @@ async def handle_activity_feed(request):
        filtered = [e for e in filtered if e["domain"] == domain]
    if contributor:
        filtered = [e for e in filtered if e["contributor"] == contributor]
    if type_filter:
        filtered = [e for e in filtered if e["type"] in type_filter]
    sorted_events = _sort_events(list(filtered), claim_activity, sort_mode, now)
    total = len(sorted_events)