From ca96f5f8e3332ca0a86f550a8931421954bcceb7 Mon Sep 17 00:00:00 2001 From: twentyOne2x Date: Fri, 29 May 2026 14:16:12 +0200 Subject: [PATCH] Harden local phase 1b review path --- docs/phase1b-agent-routing-spec.md | 13 ++++---- .../phase1b/eval-pipeline-integration-spec.md | 8 ++--- lib/domains.py | 10 ++---- lib/evaluate.py | 32 +++++++++++++++---- tests/test_evaluate_agent_routing.py | 26 ++++++++++++++- 5 files changed, 63 insertions(+), 26 deletions(-) diff --git a/docs/phase1b-agent-routing-spec.md b/docs/phase1b-agent-routing-spec.md index 23760aa..e71d119 100644 --- a/docs/phase1b-agent-routing-spec.md +++ b/docs/phase1b-agent-routing-spec.md @@ -150,18 +150,17 @@ Pre-implementation repo truth: Estimated implementation progress on this branch: - B1 classifier foundation: 100 percent locally, pending staging calibration. -- B2 routing layer: 70 percent locally behind a default-off feature flag. -- Cross-domain top-2 review: 70 percent locally through mocked eval proof. -- Local proof suite: 80 percent for router/eval/parser scope. +- B2 routing layer: 75 percent locally behind a default-off feature flag. +- Cross-domain top-2 review: 75 percent locally through mocked eval proof. +- Local proof suite: 85 percent for router/eval/parser scope. - Staging or VPS proof: 0 percent. Remaining delta: 1. Decide whether the production Phase 1b transport stays Forgejo-first for cutover or switches direct to GitHub `decision-engine` before staging. -2. Add stronger idempotency for duplicate comments on retry after partial multi-agent success. -3. Update reporting/health compatibility beyond `review_records` if staging shows false readiness. -4. Prove against staging before production. -5. Deploy only an exact reviewed/tested SHA after Leo signoff. +2. Update reporting/health compatibility beyond `review_records` if staging shows false readiness. +3. Prove against staging before production. +4. Deploy only an exact reviewed/tested SHA after Leo signoff. ## Closure, Endpoint, And Deployment Truth diff --git a/docs/phase1b/eval-pipeline-integration-spec.md b/docs/phase1b/eval-pipeline-integration-spec.md index 616d6d2..28e8f3d 100644 --- a/docs/phase1b/eval-pipeline-integration-spec.md +++ b/docs/phase1b/eval-pipeline-integration-spec.md @@ -71,14 +71,13 @@ Branch truth: ## Completion Percent And Remaining Delta -Current completion on this branch: 70 percent local implementation behind a default-off feature flag. +Current completion on this branch: 75 percent local implementation behind a default-off feature flag. Remaining delta: 1. Decide direct GitHub `decision-engine` comment transport versus Forgejo-first cutover compatibility. -2. Add duplicate-comment idempotency for retry after partial multi-agent success. -3. Prove with staging PRs and real daemon logs. -4. Update contributor/dashboard assumptions only where staging or tests prove breakage. +2. Prove with staging PRs and real daemon logs. +3. Update contributor/dashboard assumptions only where staging or tests prove breakage. ## Closure, Endpoint, And Deployment Truth @@ -138,6 +137,7 @@ Minimal DB field use: - `domain_verdict`: keep aggregate non-Leo review verdict or aggregate verdict. - `leo_verdict`: set `skipped` unless Leo is a required agent; if Leo is required, store Leo verdict. - `review_records`: write one row per required reviewer attempt with reviewer agent, model, outcome, and notes. +- review comments include a `PHASE1B_REVIEW` marker and the current local helper suppresses duplicate posts for the same PR and agent. - audit log: route and all per-agent verdicts. This is a compatibility posture, not the ideal long-term schema. diff --git a/lib/domains.py b/lib/domains.py index 825a84a..bb1979a 100644 --- a/lib/domains.py +++ b/lib/domains.py @@ -12,20 +12,14 @@ DOMAIN_AGENT_MAP: dict[str, str] = { "entertainment": "Clay", "health": "Vida", "ai-alignment": "Theseus", - "ai-systems": "Theseus", "space-development": "Astra", - "space": "Astra", - "robotics": "Astra", - "energy": "Astra", - "manufacturing": "Astra", - "advanced-manufacturing": "Astra", "mechanisms": "Rio", "living-capital": "Rio", "living-agents": "Theseus", "teleohumanity": "Leo", "grand-strategy": "Leo", "critical-systems": "Theseus", - "collective-intelligence": "Leo", + "collective-intelligence": "Theseus", "teleological-economics": "Rio", "cultural-dynamics": "Clay", } @@ -37,7 +31,7 @@ VALID_DOMAINS: frozenset[str] = frozenset(DOMAIN_AGENT_MAP.keys()) _AGENT_PRIMARY_DOMAIN: dict[str, str] = { "rio": "internet-finance", "clay": "entertainment", - "theseus": "ai-systems", + "theseus": "ai-alignment", "vida": "health", "astra": "space-development", "leo": "grand-strategy", diff --git a/lib/evaluate.py b/lib/evaluate.py index 530dcf4..22fd8d1 100644 --- a/lib/evaluate.py +++ b/lib/evaluate.py @@ -80,6 +80,30 @@ def _phase1b_compat_verdicts(agent_verdicts: dict[str, str]) -> tuple[str, str]: return leo_verdict, domain_verdict +def _phase1b_review_marker(pr_number: int, agent: str) -> str: + return f"" + + +async def _post_phase1b_review_comment(pr_number: int, agent: str, review_text: str) -> bool: + """Post a routed review comment once per PR/agent marker.""" + marker = _phase1b_review_marker(pr_number, agent) + comments = await forgejo_api("GET", repo_path(f"issues/{pr_number}/comments")) + if isinstance(comments, list): + for comment in comments: + body = comment.get("body", "") if isinstance(comment, dict) else "" + if marker in body: + logger.info("PR #%d: Phase 1b %s review comment already posted", pr_number, agent) + return False + + body = review_text if marker in review_text else f"{marker}\n{review_text}" + result = await forgejo_api( + "POST", + repo_path(f"issues/{pr_number}/comments"), + {"body": body}, + ) + return result is not None + + async def _evaluate_pr_phase1b( conn, pr_number: int, @@ -133,18 +157,14 @@ async def _evaluate_pr_phase1b( agent_verdicts[agent] = verdict usage_by_agent[agent] = usage - await forgejo_api( - "POST", - repo_path(f"issues/{pr_number}/comments"), - {"body": review_text}, - ) + await _post_phase1b_review_comment(pr_number, agent, review_text) db.record_review( conn, pr_number, "approved" if verdict == "approve" else "rejected", domain=domain, - agent=route.primary_agent, + agent=agent, reviewer=agent, reviewer_model=_phase1b_review_model(agent, tier), rejection_reason=",".join(parse_issues(review_text)) if verdict == "request_changes" else None, diff --git a/tests/test_evaluate_agent_routing.py b/tests/test_evaluate_agent_routing.py index af61a38..acfcd76 100644 --- a/tests/test_evaluate_agent_routing.py +++ b/tests/test_evaluate_agent_routing.py @@ -6,7 +6,7 @@ from unittest.mock import AsyncMock import pytest from lib import config -from lib.evaluate import _evaluate_pr_phase1b, evaluate_pr +from lib.evaluate import _evaluate_pr_phase1b, _post_phase1b_review_comment, evaluate_pr @pytest.fixture @@ -116,6 +116,8 @@ async def _fake_agent_review_reject_vida(_diff, _files, agent, _route_context, t async def _fake_forgejo_api(method, path, body=None, token=None): + if method == "GET" and "comments" in path: + return [] if method == "GET" and "pulls/" in path: return {"user": {"login": "contributor"}} return {"id": 1} @@ -154,6 +156,10 @@ async def test_phase1b_cross_domain_approves_after_all_required_agents(phase1b_c assert row["domain_agent"] in {"Theseus", "Rio"} review_count = conn.execute("SELECT COUNT(*) AS n FROM review_records WHERE pr_number = 1").fetchone()["n"] assert review_count == 2 + reviewers = { + row["agent"] for row in conn.execute("SELECT agent FROM review_records WHERE pr_number = 1").fetchall() + } + assert reviewers == {"Theseus", "Rio"} post_formal.assert_awaited_once() @@ -212,3 +218,21 @@ async def test_evaluate_pr_flag_uses_phase1b_and_not_legacy_reviewers(phase1b_co assert result["agent_verdicts"] == {"Rio": "approve"} legacy_domain.assert_not_awaited() legacy_leo.assert_not_awaited() + + +@pytest.mark.asyncio +async def test_phase1b_review_comment_is_idempotent(monkeypatch): + calls = [] + + async def fake_api(method, path, body=None, token=None): + calls.append((method, path, body)) + if method == "GET": + return [{"body": "\nold review"}] + return {"id": 1} + + monkeypatch.setattr("lib.evaluate.forgejo_api", fake_api) + + posted = await _post_phase1b_review_comment(7, "Rio", "new review\n") + + assert posted is False + assert [call[0] for call in calls] == ["GET"]