leo: fix evaluate-trigger.sh — 4 bugs + auto-merge support

- Add foundations/ to always-allowed territory paths so domain agents can propose foundation claims
- Add Astra/space-development to domain routing map
- Fix double check_merge_eligible call by capturing exit code
- Update Leo prompt from 8 to 11 quality criteria (scope, universals, counter-evidence)
- Add auto-merge capability with territory violation checks
- Add --no-merge flag for review-only mode
- Widen domain agent verdict parsing to catch various comment formats

Pentagon-Agent: Leo <B9E87C91-8D2A-42C0-AA43-4874B1A67642>
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
m3taversal 2026-03-08 19:01:42 +00:00
parent 2bf0a68917
commit 876a01a4da

View file

@ -1,15 +1,21 @@
#!/usr/bin/env bash #!/usr/bin/env bash
# evaluate-trigger.sh — Find unreviewed PRs and run 2-agent review on each. # evaluate-trigger.sh — Find unreviewed PRs, run 2-agent review, auto-merge if approved.
# #
# Reviews each PR with TWO agents: # Reviews each PR with TWO agents:
# 1. Leo (evaluator) — quality gates, cross-domain connections, coherence # 1. Leo (evaluator) — quality gates, cross-domain connections, coherence
# 2. Domain agent — domain expertise, duplicate check, technical accuracy # 2. Domain agent — domain expertise, duplicate check, technical accuracy
# #
# After both reviews, auto-merges if:
# - Leo approved (gh pr review --approve)
# - Domain agent verdict is "Approve" (parsed from comment)
# - No territory violations (files outside proposer's domain)
#
# Usage: # Usage:
# ./ops/evaluate-trigger.sh # review all unreviewed open PRs # ./ops/evaluate-trigger.sh # review + auto-merge approved PRs
# ./ops/evaluate-trigger.sh 47 # review a specific PR by number # ./ops/evaluate-trigger.sh 47 # review a specific PR by number
# ./ops/evaluate-trigger.sh --dry-run # show what would be reviewed, don't run # ./ops/evaluate-trigger.sh --dry-run # show what would be reviewed, don't run
# ./ops/evaluate-trigger.sh --leo-only # skip domain agent, just run Leo # ./ops/evaluate-trigger.sh --leo-only # skip domain agent, just run Leo
# ./ops/evaluate-trigger.sh --no-merge # review only, don't auto-merge (old behavior)
# #
# Requirements: # Requirements:
# - claude CLI (claude -p for headless mode) # - claude CLI (claude -p for headless mode)
@ -18,7 +24,7 @@
# #
# Safety: # Safety:
# - Lockfile prevents concurrent runs # - Lockfile prevents concurrent runs
# - Neither agent auto-merges — reviews only # - Auto-merge requires ALL reviewers to approve + no territory violations
# - Each PR runs sequentially to avoid branch conflicts # - Each PR runs sequentially to avoid branch conflicts
# - Timeout: 10 minutes per agent per PR # - Timeout: 10 minutes per agent per PR
# - Pre-flight checks: clean working tree, gh auth # - Pre-flight checks: clean working tree, gh auth
@ -36,6 +42,7 @@ LOG_DIR="$REPO_ROOT/ops/sessions"
TIMEOUT_SECONDS=600 TIMEOUT_SECONDS=600
DRY_RUN=false DRY_RUN=false
LEO_ONLY=false LEO_ONLY=false
NO_MERGE=false
SPECIFIC_PR="" SPECIFIC_PR=""
# --- Domain routing map --- # --- Domain routing map ---
@ -53,6 +60,7 @@ detect_domain_agent() {
clay/*|*/entertainment*) agent="clay"; domain="entertainment" ;; clay/*|*/entertainment*) agent="clay"; domain="entertainment" ;;
theseus/*|logos/*|*/ai-alignment*) agent="theseus"; domain="ai-alignment" ;; theseus/*|logos/*|*/ai-alignment*) agent="theseus"; domain="ai-alignment" ;;
vida/*|*/health*) agent="vida"; domain="health" ;; vida/*|*/health*) agent="vida"; domain="health" ;;
astra/*|*/space-development*) agent="astra"; domain="space-development" ;;
leo/*|*/grand-strategy*) agent="leo"; domain="grand-strategy" ;; leo/*|*/grand-strategy*) agent="leo"; domain="grand-strategy" ;;
*) *)
# Fall back to checking which domain directory has changed files # Fall back to checking which domain directory has changed files
@ -64,6 +72,8 @@ detect_domain_agent() {
agent="theseus"; domain="ai-alignment" agent="theseus"; domain="ai-alignment"
elif echo "$files" | grep -q "domains/health/"; then elif echo "$files" | grep -q "domains/health/"; then
agent="vida"; domain="health" agent="vida"; domain="health"
elif echo "$files" | grep -q "domains/space-development/"; then
agent="astra"; domain="space-development"
else else
agent=""; domain="" agent=""; domain=""
fi fi
@ -78,6 +88,7 @@ for arg in "$@"; do
case "$arg" in case "$arg" in
--dry-run) DRY_RUN=true ;; --dry-run) DRY_RUN=true ;;
--leo-only) LEO_ONLY=true ;; --leo-only) LEO_ONLY=true ;;
--no-merge) NO_MERGE=true ;;
[0-9]*) SPECIFIC_PR="$arg" ;; [0-9]*) SPECIFIC_PR="$arg" ;;
--help|-h) --help|-h)
head -23 "$0" | tail -21 head -23 "$0" | tail -21
@ -208,8 +219,145 @@ run_agent_review() {
fi fi
} }
# --- Territory violation check ---
# Verifies all changed files are within the proposer's expected territory.
#
# The proposer is the branch-name prefix before the first "/". Every known
# proposer may touch inbox/archive/, its own agents/<proposer>/ directory,
# maps/ and foundations/, plus the directories mapped to it below.
#
# Arguments: $1 - PR number
# Outputs:   one " - <path>" line per out-of-territory file on stdout; an
#            empty line when the PR is clean or the proposer is unknown
# Returns:   always 0 (callers test whether the captured output is empty)
check_territory_violations() {
  local pr_number="$1"
  local branch files proposer violations file

  branch=$(gh pr view "$pr_number" --json headRefName --jq '.headRefName' 2>/dev/null || echo "")

  # Determine proposer from branch prefix
  proposer=${branch%%/*}

  # Map proposer to allowed directories ("|"-separated ERE alternatives)
  local allowed_domains=""
  case "$proposer" in
    rio) allowed_domains="domains/internet-finance/" ;;
    clay) allowed_domains="domains/entertainment/" ;;
    theseus) allowed_domains="domains/ai-alignment/" ;;
    vida) allowed_domains="domains/health/" ;;
    astra) allowed_domains="domains/space-development/" ;;
    leo) allowed_domains="core/|foundations/" ;;
    *)
      # Unknown proposer — skip check
      echo ""
      return 0
      ;;
  esac

  # Fail closed: if the file list cannot be fetched, report an entry so the
  # caller does not mistake "could not check" for "no violations".
  if ! files=$(gh pr view "$pr_number" --json files --jq '.files[].path' 2>/dev/null); then
    echo " - (could not list changed files for PR #${pr_number})"
    return 0
  fi

  # Always allowed: inbox/archive, own agent dir, maps/, foundations/
  # (any agent can propose foundation claims)
  local shared_re="^(inbox/archive/|agents/${proposer}/|maps/|foundations/)"
  # The alternation is grouped so "^" anchors every alternative; ungrouped,
  # "^core/|foundations/" would match "foundations/" anywhere in a path.
  local domain_re="^(${allowed_domains})"

  violations=""
  while IFS= read -r file; do
    [ -z "$file" ] && continue
    if [[ "$file" =~ $shared_re || "$file" =~ $domain_re ]]; then
      continue
    fi
    # Real newlines (not a literal "\n") so backslashes in paths stay literal
    violations+=" - ${file}"$'\n'
  done <<< "$files"

  if [ -n "$violations" ]; then
    printf '%s\n' "$violations"
  else
    echo ""
  fi
}
# --- Auto-merge check ---
# Runs the merge gates in order and stops at the first one that blocks.
#
# Arguments:
#   $1 - PR number
#   $2 - domain agent name (empty or "leo" skips the domain-agent gate)
#   $3 - "true" when Leo's review run succeeded
# Outputs:  one status line per gate on stdout; blocking reasons start "BLOCK:"
# Returns:  0 if PR should be merged, 1 if not
check_merge_eligible() {
  local pr_number="$1"
  local domain_agent="$2"
  local leo_passed="$3"

  # Gate 1: Leo must have passed
  if [ "$leo_passed" != "true" ]; then
    echo "BLOCK: Leo review failed or timed out"
    return 1
  fi

  # Gate 2: Check Leo's review state via GitHub API
  # NOTE(review): this takes the most recent non-dismissed, non-pending review
  # from any reviewer; with a shared account a later comment-only review would
  # hide an earlier approval and force the comment fallback — confirm intended.
  local leo_review_state
  leo_review_state=$(gh api "repos/{owner}/{repo}/pulls/${pr_number}/reviews" \
    --jq '[.[] | select(.state != "DISMISSED" and .state != "PENDING")] | last | .state' 2>/dev/null || echo "")

  if [ "$leo_review_state" = "APPROVED" ]; then
    echo "Leo: APPROVED (via review API)"
  elif [ "$leo_review_state" = "CHANGES_REQUESTED" ]; then
    echo "BLOCK: Leo requested changes (review API state: CHANGES_REQUESTED)"
    return 1
  else
    # Fallback: check PR comments for Leo's verdict
    # NOTE(review): assumes Leo's review text is reachable via
    # `gh pr view --json comments` — verify against how reviews are posted.
    local leo_verdict
    leo_verdict=$(gh pr view "$pr_number" --json comments \
      --jq '.comments[] | select(.body | test("## Leo Review")) | .body' 2>/dev/null \
      | grep -oiE '\*\*Verdict:[^*]+\*\*' | tail -1 || echo "")

    # Negative markers are tested first: a verdict such as "Request changes
    # before we can approve" contains "approve" but must not pass the gate.
    if grep -qiE "request changes|reject" <<< "$leo_verdict"; then
      echo "BLOCK: Leo verdict: $leo_verdict"
      return 1
    elif grep -qi "approve" <<< "$leo_verdict"; then
      echo "Leo: APPROVED (via comment verdict)"
    else
      echo "BLOCK: Could not determine Leo's verdict"
      return 1
    fi
  fi

  # Gate 3: Check domain agent verdict (if applicable)
  if [ -n "$domain_agent" ] && [ "$domain_agent" != "leo" ]; then
    local domain_verdict
    # Search for verdict in domain agent's review — match agent name,
    # "domain review", or "peer review" (case-insensitive)
    domain_verdict=$(gh pr view "$pr_number" --json comments \
      --jq ".comments[] | select(.body | test(\"domain review|${domain_agent}|peer review\"; \"i\")) | .body" 2>/dev/null \
      | grep -oiE '\*\*Verdict:[^*]+\*\*' | tail -1 || echo "")

    if [ -z "$domain_verdict" ]; then
      # Also check review API for domain agent approval
      # Since all agents use the same GitHub account, we check for multiple approvals
      local approval_count
      approval_count=$(gh api "repos/{owner}/{repo}/pulls/${pr_number}/reviews" \
        --jq '[.[] | select(.state == "APPROVED")] | length' 2>/dev/null || echo "0")

      # Validate before comparing: empty or non-numeric output must block
      # quietly instead of raising an "integer expression expected" error.
      if [[ "$approval_count" =~ ^[0-9]+$ ]] && [ "$approval_count" -ge 2 ]; then
        echo "Domain agent: APPROVED (multiple approvals via review API)"
      else
        echo "BLOCK: No domain agent verdict found"
        return 1
      fi
    elif grep -qiE "request changes|reject" <<< "$domain_verdict"; then
      # Same ordering as for Leo: rejection markers win over "approve".
      echo "BLOCK: Domain agent verdict: $domain_verdict"
      return 1
    elif grep -qi "approve" <<< "$domain_verdict"; then
      echo "Domain agent ($domain_agent): APPROVED (via comment verdict)"
    else
      echo "BLOCK: Unclear domain agent verdict: $domain_verdict"
      return 1
    fi
  else
    echo "Domain agent: N/A (leo-only or grand-strategy)"
  fi

  # Gate 4: Territory violations
  local violations
  violations=$(check_territory_violations "$pr_number")

  if [ -n "$violations" ]; then
    echo "BLOCK: Territory violations detected:"
    # printf keeps backslashes in file names literal; the helper's output has
    # already been expanded once and needs no second escape pass.
    printf '%s\n' "$violations"
    return 1
  else
    echo "Territory: clean"
  fi

  return 0
}
REVIEWED=0 REVIEWED=0
FAILED=0 FAILED=0
MERGED=0
for pr in $PRS_TO_REVIEW; do for pr in $PRS_TO_REVIEW; do
echo "" echo ""
@ -235,7 +383,7 @@ Before evaluating, scan the existing knowledge base for duplicate and contradict
- Read titles to check for semantic duplicates - Read titles to check for semantic duplicates
- Check for contradictions with existing claims in that domain and in foundations/ - Check for contradictions with existing claims in that domain and in foundations/
For each proposed claim, evaluate against these 8 quality criteria from CLAUDE.md: For each proposed claim, evaluate against these 11 quality criteria from CLAUDE.md:
1. Specificity — Is this specific enough to disagree with? 1. Specificity — Is this specific enough to disagree with?
2. Evidence — Is there traceable evidence in the body? 2. Evidence — Is there traceable evidence in the body?
3. Description quality — Does the description add info beyond the title? 3. Description quality — Does the description add info beyond the title?
@ -244,6 +392,9 @@ For each proposed claim, evaluate against these 8 quality criteria from CLAUDE.m
6. Contradiction check — Does this contradict an existing claim? If so, is the contradiction explicit? 6. Contradiction check — Does this contradict an existing claim? If so, is the contradiction explicit?
7. Value add — Does this genuinely expand what the knowledge base knows? 7. Value add — Does this genuinely expand what the knowledge base knows?
8. Wiki links — Do all [[links]] point to real files? 8. Wiki links — Do all [[links]] point to real files?
9. Scope qualification — Does the claim specify structural vs functional, micro vs macro, causal vs correlational?
10. Universal quantifier check — Does the title use unwarranted universals (all, always, never, the only)?
11. Counter-evidence acknowledgment — For likely or higher: is opposing evidence acknowledged?
Also check: Also check:
- Source archive updated correctly (status field) - Source archive updated correctly (status field)
@ -257,7 +408,7 @@ Then post it with: gh pr review ${pr} --comment --body-file ${LEO_REVIEW_FILE}
If ALL claims pass quality gates: gh pr review ${pr} --approve --body-file ${LEO_REVIEW_FILE} If ALL claims pass quality gates: gh pr review ${pr} --approve --body-file ${LEO_REVIEW_FILE}
If ANY claim needs changes: gh pr review ${pr} --request-changes --body-file ${LEO_REVIEW_FILE} If ANY claim needs changes: gh pr review ${pr} --request-changes --body-file ${LEO_REVIEW_FILE}
DO NOT merge. Leave the merge decision to Cory. DO NOT merge — the orchestrator handles merge decisions after all reviews are posted.
Work autonomously. Do not ask for confirmation." Work autonomously. Do not ask for confirmation."
if run_agent_review "$pr" "leo" "$LEO_PROMPT" "opus"; then if run_agent_review "$pr" "leo" "$LEO_PROMPT" "opus"; then
@ -305,7 +456,7 @@ Post it with: gh pr review ${pr} --comment --body-file ${DOMAIN_REVIEW_FILE}
Sign your review as ${AGENT_NAME_UPPER} (domain reviewer for ${DOMAIN}). Sign your review as ${AGENT_NAME_UPPER} (domain reviewer for ${DOMAIN}).
DO NOT duplicate Leo's quality gate checks — he covers those. DO NOT duplicate Leo's quality gate checks — he covers those.
DO NOT merge. DO NOT merge — the orchestrator handles merge decisions after all reviews are posted.
Work autonomously. Do not ask for confirmation." Work autonomously. Do not ask for confirmation."
run_agent_review "$pr" "$DOMAIN_AGENT" "$DOMAIN_PROMPT" "sonnet" run_agent_review "$pr" "$DOMAIN_AGENT" "$DOMAIN_PROMPT" "sonnet"
@ -321,6 +472,31 @@ Work autonomously. Do not ask for confirmation."
FAILED=$((FAILED + 1)) FAILED=$((FAILED + 1))
fi fi
# --- Auto-merge decision ---
# Skips when --no-merge was given or Leo's review did not pass; otherwise runs
# the eligibility gates once and squash-merges only when they all pass.
if [ "$NO_MERGE" = true ]; then
  echo " Auto-merge: skipped (--no-merge)"
elif [ "$LEO_PASSED" != "true" ]; then
  echo " Auto-merge: skipped (Leo review failed)"
else
  echo ""
  echo " --- Merge eligibility check ---"
  # Record the status in the same command list as the call. A failing
  # VAR=$(cmd) on its own line would end the script under errexit before a
  # following $? could be read, turning every BLOCK verdict into an abort.
  # NOTE(review): whether errexit is enabled is not visible in this section;
  # this form behaves the same either way.
  MERGE_RESULT=0
  MERGE_LOG=$(check_merge_eligible "$pr" "$DOMAIN_AGENT" "$LEO_PASSED") || MERGE_RESULT=$?
  echo "$MERGE_LOG" | sed 's/^/ /'

  if [ "$MERGE_RESULT" -eq 0 ]; then
    echo " Auto-merge: ALL GATES PASSED — merging PR #$pr"
    if gh pr merge "$pr" --squash --delete-branch 2>&1; then
      echo " PR #$pr: MERGED successfully."
      MERGED=$((MERGED + 1))
    else
      echo " PR #$pr: Merge FAILED. May need manual intervention."
    fi
  else
    echo " Auto-merge: BLOCKED — see reasons above"
  fi
fi
echo "Finished: $(date)" echo "Finished: $(date)"
done done
@ -328,4 +504,5 @@ echo ""
echo "=== Summary ===" echo "=== Summary ==="
echo "Reviewed: $REVIEWED" echo "Reviewed: $REVIEWED"
echo "Failed: $FAILED" echo "Failed: $FAILED"
echo "Merged: $MERGED"
echo "Logs: $LOG_DIR" echo "Logs: $LOG_DIR"