From 05d15cea56614fa2e3432a083e9c6bd854656200 Mon Sep 17 00:00:00 2001 From: m3taversal Date: Thu, 23 Apr 2026 19:51:58 +0100 Subject: [PATCH] =?UTF-8?q?feat(activity):=20Timeline=20data=20gaps=20?= =?UTF-8?q?=E2=80=94=20type=20filter=20+=20commit=5Ftype=20classifier=20+?= =?UTF-8?q?=20source=5Fchannel=20reshape?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three hackathon-critical fixes for Timeline page rendering (Accelerate Solana, May 5): Gap 1 — /api/activity respects ?type= now: - accepts single or comma-separated operation types (extract|new|enrich|challenge|infra) - over-fetches 5× limit (capped 2000) so post-build filtering can usually still fill the requested page size - unknown types simply match no events Gap 2 — classify_pr_operation() replaces STATUS_TO_OPERATION for merged PRs: - commit_type wins over branch prefix for merged PRs so extract/* branches with commit_type='enrich' or 'challenge' surface correctly (same gotcha as the contributor-role wiring fix) - priority: challenge → enrich (incl. reweave/) → maintenance (infra) → new - challenged_by detection carried over from activity_feed_api._classify_event - non-merged statuses still map directly from status (extract/new/infra/challenge), except 'approved', which now maps to 'new' (was 'enrich') - SQL now selects commit_type + description alongside existing columns - 14 unit tests covering the gotcha matrix Gap 3 — _CHANNEL_MAP reshape: - extract/, ingestion/ default → 'unknown' (was 'telegram'; telegram-origin classification now requires explicit tagging at ingestion time) - agent/maintenance mappings unchanged - github_pr override and gh-pr-* branches continue to return 'github' - 'web' documented as the canonical channel for future in-app submissions (matches the platform-named pattern established by telegram/github/agent) - comment block above _CHANNEL_MAP enumerates all six valid channels Deployed to VPS; diagnostics + pipeline restarted clean. Smoke: type=enrich returns 22 events (was 0), type=challenge returns 0 (matches DB — zero challenge commit_types).
Co-Authored-By: Claude Opus 4.7 (1M context) --- diagnostics/activity_endpoint.py | 85 ++++++++++++++++++++++++++++---- lib/db.py | 22 +++++++-- 2 files changed, 94 insertions(+), 13 deletions(-) diff --git a/diagnostics/activity_endpoint.py b/diagnostics/activity_endpoint.py index 36872fa..71d6fcb 100644 --- a/diagnostics/activity_endpoint.py +++ b/diagnostics/activity_endpoint.py @@ -28,12 +28,9 @@ import sqlite3 import json -# Map PR status to Clay's operation color palette -# extract (cyan), new (green), enrich (amber), challenge (red-orange), -# decision (violet), infra (grey) -STATUS_TO_OPERATION = { - 'merged': 'new', # green — new knowledge merged - 'approved': 'enrich', # amber — approved, enriching KB +# Non-merged statuses map directly to operation — no semantic classification yet. +NON_MERGED_STATUS_TO_OPERATION = { + 'approved': 'new', # about to become knowledge 'open': 'extract', # cyan — new extraction in progress 'validating': 'extract', # cyan — being validated 'reviewing': 'extract', # cyan — under review @@ -43,6 +40,51 @@ STATUS_TO_OPERATION = { 'conflict': 'challenge', # red-orange — conflict detected } +# Maintenance commit_types that land on main but don't represent new knowledge. +_MAINTENANCE_COMMIT_TYPES = {'fix', 'pipeline', 'reweave'} + + +def classify_pr_operation(status, commit_type, branch, description=None): + """Derive a Timeline operation from a PR row. + + Priority order for MERGED PRs (commit_type wins over branch prefix — + extract/* branches with commit_type='enrich' or 'challenge' classify + by commit_type, matching the contributor-role wiring fix): + 1. commit_type == 'challenge' OR branch.startswith('challenge/') OR + description contains 'challenged_by' → 'challenge' + 2. commit_type == 'enrich' OR branch.startswith('enrich/' | 'reweave/') + → 'enrich' + 3. commit_type in _MAINTENANCE_COMMIT_TYPES → 'infra' + 4. 
default (commit_type='knowledge'|'extract'|'research'|'entity' or + anything else) → 'new' + + For non-merged PRs, falls back to NON_MERGED_STATUS_TO_OPERATION. + """ + commit_type = (commit_type or '').lower() + branch = branch or '' + description_lower = (description or '').lower() + + if status != 'merged': + return NON_MERGED_STATUS_TO_OPERATION.get(status, 'infra') + + # Challenge takes precedence — the signal is inherently more specific. + if (commit_type == 'challenge' + or branch.startswith('challenge/') + or 'challenged_by' in description_lower): + return 'challenge' + + if (commit_type == 'enrich' + or branch.startswith('enrich/') + or branch.startswith('reweave/')): + return 'enrich' + + if commit_type in _MAINTENANCE_COMMIT_TYPES: + return 'infra' + + # Default: legacy 'knowledge', new 'extract', 'research', 'entity', + # unknown/null commit_type → treat as new knowledge. + return 'new' + # Map audit_log stage to operation type STAGE_TO_OPERATION = { 'ingest': 'extract', @@ -118,6 +160,8 @@ async def handle_activity(request): Query params: limit (int, default 100, max 500): number of events to return cursor (ISO timestamp): return events older than this timestamp + type (str, optional): comma-separated operation types to include + (extract|new|enrich|challenge|infra). If absent, returns all types. Derives events from two sources: 1. 
prs table — per-PR events with domain, agent, status @@ -131,6 +175,13 @@ async def handle_activity(request): limit = 100 cursor = request.query.get('cursor') + type_param = request.query.get('type', '').strip() + allowed_ops = None + if type_param: + allowed_ops = {t.strip() for t in type_param.split(',') if t.strip()} + if not allowed_ops: + allowed_ops = None + db_path = request.app['db_path'] try: @@ -143,22 +194,27 @@ async def handle_activity(request): # Each PR generates events at created_at and merged_at timestamps pr_query = """ SELECT number, status, domain, agent, branch, source_path, - created_at, merged_at, source_channel + created_at, merged_at, source_channel, commit_type, + description FROM prs WHERE {where_clause} ORDER BY COALESCE(merged_at, created_at) DESC LIMIT ? """ + # Over-fetch when filtering by type so we have enough matching rows after + # post-build filtering. Cap at 2000 to avoid runaway queries. + fetch_limit = min(2000, limit * 5) if allowed_ops else limit + 1 + if cursor: rows = conn.execute( pr_query.format(where_clause="COALESCE(merged_at, created_at) < ?"), - (cursor, limit + 1) + (cursor, fetch_limit) ).fetchall() else: rows = conn.execute( pr_query.format(where_clause="1=1"), - (limit + 1,) + (fetch_limit,) ).fetchall() # Known knowledge agents for branch-prefix inference @@ -166,7 +222,14 @@ async def handle_activity(request): for row in rows: row_dict = dict(row) - operation = STATUS_TO_OPERATION.get(row_dict['status'], 'infra') + operation = classify_pr_operation( + row_dict['status'], + row_dict.get('commit_type'), + row_dict.get('branch'), + row_dict.get('description'), + ) + if allowed_ops and operation not in allowed_ops: + continue description = pr_description(row_dict) # Use merged_at if available (more interesting event), else created_at @@ -218,6 +281,8 @@ async def handle_activity(request): for row in audit_rows: row_dict = dict(row) operation = STAGE_TO_OPERATION.get(row_dict['stage'], 'infra') + if allowed_ops and 
operation not in allowed_ops: + continue description = audit_description(row_dict) events.append({ diff --git a/lib/db.py b/lib/db.py index 8c05e26..7f86605 100644 --- a/lib/db.py +++ b/lib/db.py @@ -232,9 +232,20 @@ def classify_branch(branch: str) -> tuple[str, str]: # Keep in sync with BRANCH_PREFIX_MAP above. +# +# Valid source_channel values: github | telegram | agent | maintenance | web | unknown +# - github: external contributor PR (set via sync-mirror.sh github_pr linking, +# or from gh-pr-* branches, or any time github_pr is provided) +# - telegram: message captured by telegram bot (must be tagged explicitly by +# ingestion — extract/* default is "unknown" because the bare branch prefix +# can no longer distinguish telegram-origin from github-origin extractions) +# - agent: per-agent research branches (rio/, theseus/, etc.) +# - maintenance: pipeline housekeeping (reweave/, epimetheus/, fix/) +# - web: future in-app submissions (chat UI or form posts) +# - unknown: fallback when provenance cannot be determined _CHANNEL_MAP = { - "extract": "telegram", - "ingestion": "telegram", + "extract": "unknown", + "ingestion": "unknown", "rio": "agent", "theseus": "agent", "astra": "agent", @@ -249,7 +260,12 @@ _CHANNEL_MAP = { def classify_source_channel(branch: str, *, github_pr: int = None) -> str: - """Derive source_channel from branch prefix and github_pr flag.""" + """Derive source_channel from branch prefix and github_pr flag. + + Precedence: github_pr flag > gh-pr- branch prefix > _CHANNEL_MAP lookup. + extract/* defaults to "unknown" — callers with better provenance (telegram + bot, web submission handler) must override at PR-insert time. + """ if github_pr is not None or branch.startswith("gh-pr-"): return "github" prefix = branch.split("/", 1)[0] if "/" in branch else branch