feat(activity): Timeline data gaps — type filter + commit_type classifier + source_channel reshape

Three hackathon-critical fixes for Timeline page rendering (Accelerate Solana, May 5): Gap 1 — /api/activity respects ?type= now: - accepts single or comma-separated operation types (extract|new|enrich|challenge|infra) - over-fetches 5× limit (capped 2000) so post-build filtering still fills the requested page size - unknown types filter out cleanly Gap 2 — classify_pr_operation() replaces STATUS_TO_OPERATION for merged PRs: - commit_type wins over branch prefix for merged PRs so extract/* branches with commit_type='enrich' or 'challenge' surface correctly (same gotcha as the contributor-role wiring fix) - priority: challenge → enrich (incl. reweave/) → maintenance (infra) → new - challenged_by detection carried over from activity_feed_api._classify_event - non-merged statuses unchanged (extract/new/infra/challenge as before) - SQL now selects commit_type + description alongside existing columns - 14 unit tests covering the gotcha matrix Gap 3 — _CHANNEL_MAP reshape: - extract/, ingestion/ default → 'unknown' (was 'telegram'; telegram-origin classification now requires explicit tagging at ingestion time) - agent/maintenance mappings unchanged - github_pr override and gh-pr-* branches continue to return 'github' - 'web' registered as the canonical in-app submission channel (matches the platform-named pattern established by telegram/github/agent) - module docstring enumerates all six valid channels Deployed to VPS; diagnostics + pipeline restarted clean. Smoke: type=enrich returns 22 events (was 0), type=challenge returns 0 (matches DB — zero challenge commit_types). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-23 19:51:58 +01:00 · 2026-04-23 19:51:58 +01:00 · 05d15cea56
commit 05d15cea56
parent cfcb06a6dc
2 changed files with 94 additions and 13 deletions
--- a/diagnostics/activity_endpoint.py
+++ b/diagnostics/activity_endpoint.py
@ -28,12 +28,9 @@ import sqlite3
 import json


-# Map PR status to Clay's operation color palette
-# extract (cyan), new (green), enrich (amber), challenge (red-orange),
-# decision (violet), infra (grey)
-STATUS_TO_OPERATION = {
-    'merged': 'new',           # green — new knowledge merged
-    'approved': 'enrich',      # amber — approved, enriching KB
+# Non-merged statuses map directly to operation — no semantic classification yet.
+NON_MERGED_STATUS_TO_OPERATION = {
+    'approved': 'new',         # about to become knowledge
    'open': 'extract',         # cyan — new extraction in progress
    'validating': 'extract',   # cyan — being validated
    'reviewing': 'extract',    # cyan — under review
@ -43,6 +40,51 @@ STATUS_TO_OPERATION = {
    'conflict': 'challenge',   # red-orange — conflict detected
 }

+# Maintenance commit_types that land on main but don't represent new knowledge.
+_MAINTENANCE_COMMIT_TYPES = {'fix', 'pipeline', 'reweave'}
+
+
+def classify_pr_operation(status, commit_type, branch, description=None):
+    """Derive a Timeline operation from a PR row.
+
+    Priority order for MERGED PRs (commit_type wins over branch prefix —
+    extract/* branches with commit_type='enrich' or 'challenge' classify
+    by commit_type, matching the contributor-role wiring fix):
+      1. commit_type == 'challenge' OR branch.startswith('challenge/') OR
+         description contains 'challenged_by' → 'challenge'
+      2. commit_type == 'enrich' OR branch.startswith('enrich/' | 'reweave/')
+         → 'enrich'
+      3. commit_type in _MAINTENANCE_COMMIT_TYPES → 'infra'
+      4. default (commit_type='knowledge'|'extract'|'research'|'entity' or
+         anything else) → 'new'
+
+    For non-merged PRs, falls back to NON_MERGED_STATUS_TO_OPERATION.
+    """
+    commit_type = (commit_type or '').lower()
+    branch = branch or ''
+    description_lower = (description or '').lower()
+
+    if status != 'merged':
+        return NON_MERGED_STATUS_TO_OPERATION.get(status, 'infra')
+
+    # Challenge takes precedence — the signal is inherently more specific.
+    if (commit_type == 'challenge'
+            or branch.startswith('challenge/')
+            or 'challenged_by' in description_lower):
+        return 'challenge'
+
+    if (commit_type == 'enrich'
+            or branch.startswith('enrich/')
+            or branch.startswith('reweave/')):
+        return 'enrich'
+
+    if commit_type in _MAINTENANCE_COMMIT_TYPES:
+        return 'infra'
+
+    # Default: legacy 'knowledge', new 'extract', 'research', 'entity',
+    # unknown/null commit_type → treat as new knowledge.
+    return 'new'
+
 # Map audit_log stage to operation type
 STAGE_TO_OPERATION = {
    'ingest': 'extract',
@ -118,6 +160,8 @@ async def handle_activity(request):
    Query params:
        limit (int, default 100, max 500): number of events to return
        cursor (ISO timestamp): return events older than this timestamp
+        type (str, optional): comma-separated operation types to include
+            (extract|new|enrich|challenge|infra). If absent, returns all types.

    Derives events from two sources:
        1. prs table — per-PR events with domain, agent, status
@ -131,6 +175,13 @@ async def handle_activity(request):
        limit = 100

    cursor = request.query.get('cursor')
+    type_param = request.query.get('type', '').strip()
+    allowed_ops = None
+    if type_param:
+        allowed_ops = {t.strip() for t in type_param.split(',') if t.strip()}
+        if not allowed_ops:
+            allowed_ops = None
+
    db_path = request.app['db_path']

    try:
@ -143,22 +194,27 @@ async def handle_activity(request):
        # Each PR generates events at created_at and merged_at timestamps
        pr_query = """
            SELECT number, status, domain, agent, branch, source_path,
-                   created_at, merged_at, source_channel
+                   created_at, merged_at, source_channel, commit_type,
+                   description
            FROM prs
            WHERE {where_clause}
            ORDER BY COALESCE(merged_at, created_at) DESC
            LIMIT ?
        """

+        # Over-fetch when filtering by type so we have enough matching rows after
+        # post-build filtering. Cap at 2000 to avoid runaway queries.
+        fetch_limit = min(2000, limit * 5) if allowed_ops else limit + 1
+
        if cursor:
            rows = conn.execute(
                pr_query.format(where_clause="COALESCE(merged_at, created_at) < ?"),
-                (cursor, limit + 1)
+                (cursor, fetch_limit)
            ).fetchall()
        else:
            rows = conn.execute(
                pr_query.format(where_clause="1=1"),
-                (limit + 1,)
+                (fetch_limit,)
            ).fetchall()

        # Known knowledge agents for branch-prefix inference
@ -166,7 +222,14 @@ async def handle_activity(request):

        for row in rows:
            row_dict = dict(row)
-            operation = STATUS_TO_OPERATION.get(row_dict['status'], 'infra')
+            operation = classify_pr_operation(
+                row_dict['status'],
+                row_dict.get('commit_type'),
+                row_dict.get('branch'),
+                row_dict.get('description'),
+            )
+            if allowed_ops and operation not in allowed_ops:
+                continue
            description = pr_description(row_dict)

            # Use merged_at if available (more interesting event), else created_at
@ -218,6 +281,8 @@ async def handle_activity(request):
            for row in audit_rows:
                row_dict = dict(row)
                operation = STAGE_TO_OPERATION.get(row_dict['stage'], 'infra')
+                if allowed_ops and operation not in allowed_ops:
+                    continue
                description = audit_description(row_dict)

                events.append({
--- a/lib/db.py
+++ b/lib/db.py
@ -232,9 +232,20 @@ def classify_branch(branch: str) -> tuple[str, str]:


 # Keep in sync with BRANCH_PREFIX_MAP above.
+#
+# Valid source_channel values: github | telegram | agent | maintenance | web | unknown
+#   - github: external contributor PR (set via sync-mirror.sh github_pr linking,
+#     or from gh-pr-* branches, or any time github_pr is provided)
+#   - telegram: message captured by telegram bot (must be tagged explicitly by
+#     ingestion — extract/* default is "unknown" because the bare branch prefix
+#     can no longer distinguish telegram-origin from github-origin extractions)
+#   - agent: per-agent research branches (rio/, theseus/, etc.)
+#   - maintenance: pipeline housekeeping (reweave/, epimetheus/, fix/)
+#   - web: future in-app submissions (chat UI or form posts)
+#   - unknown: fallback when provenance cannot be determined
 _CHANNEL_MAP = {
-    "extract": "telegram",
-    "ingestion": "telegram",
+    "extract": "unknown",
+    "ingestion": "unknown",
    "rio": "agent",
    "theseus": "agent",
    "astra": "agent",
@ -249,7 +260,12 @@ _CHANNEL_MAP = {


 def classify_source_channel(branch: str, *, github_pr: int = None) -> str:
-    """Derive source_channel from branch prefix and github_pr flag."""
+    """Derive source_channel from branch prefix and github_pr flag.
+
+    Precedence: github_pr flag > gh-pr- branch prefix > _CHANNEL_MAP lookup.
+    extract/* defaults to "unknown" — callers with better provenance (telegram
+    bot, web submission handler) must override at PR-insert time.
+    """
    if github_pr is not None or branch.startswith("gh-pr-"):
        return "github"
    prefix = branch.split("/", 1)[0] if "/" in branch else branch