From 13f21f77323506eb957bb44361eb69ee6ebeaf27 Mon Sep 17 00:00:00 2001 From: m3taversal Date: Thu, 16 Apr 2026 18:14:01 +0100 Subject: [PATCH] =?UTF-8?q?feat:=20external=20contributor=20pipeline=20?= =?UTF-8?q?=E2=80=94=20fork=20PR=20handling,=20attribution,=20prefix=20rec?= =?UTF-8?q?ognition?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Mirror: fetch GitHub fork PR refs (refs/pull/*/head), push to Forgejo as gh-pr-N/branch - Mirror: fork PRs auto-create Forgejo PR with GitHub PR title, link github_pr in DB - db.py: add contrib + gh-pr-* to classify_branch for external contributor branches - contributor.py: git commit author as attribution fallback (before branch agent) - contributor.py: skip bot/generic authors (m3taversal, teleo, pipeline) - Tests: fix fallback test for new git author path, add external contributor test Co-Authored-By: Claude Opus 4.6 (1M context) --- deploy/sync-mirror.sh | 82 ++++++++++++++++++++++++++++++++------- lib/contributor.py | 21 ++++++++-- lib/db.py | 4 ++ tests/test_contributor.py | 26 ++++++++++++- 4 files changed, 114 insertions(+), 19 deletions(-) diff --git a/deploy/sync-mirror.sh b/deploy/sync-mirror.sh index 436ad1e..4f18694 100755 --- a/deploy/sync-mirror.sh +++ b/deploy/sync-mirror.sh @@ -47,6 +47,37 @@ fi log "Fetching from GitHub..." 
git fetch origin --prune >> "$LOG" 2>&1 || log "WARN: GitHub fetch failed"
 
+# Step 2.1: Fetch GitHub fork PR refs
+# Fork-based PRs don't create branches on origin — they only exist as refs/pull/N/head.
+# Fetch each open fork PR into a local gh-pr-N/<branch> ref so it can be pushed to Forgejo for evaluation.
+GITHUB_PAT_STEP2=$(cat "$GITHUB_PAT_FILE" 2>/dev/null | tr -d '[:space:]')
+if [ -n "$GITHUB_PAT_STEP2" ]; then
+    OPEN_PRS=$(curl -sf "https://api.github.com/repos/$GITHUB_REPO/pulls?state=open&per_page=100" \
+        -H "Authorization: token $GITHUB_PAT_STEP2" 2>/dev/null || echo "[]")
+    echo "$OPEN_PRS" | python3 -c "
+import sys, json
+prs = json.load(sys.stdin)
+for pr in prs:
+    head = pr.get('head', {})
+    # Only process fork PRs (repo differs from base repo)
+    base_repo = pr.get('base', {}).get('repo', {}).get('full_name', '')
+    head_repo = head.get('repo', {}) or {}
+    head_full = head_repo.get('full_name', '')
+    if head_full and head_full != base_repo:
+        print(f\"{pr['number']} {head.get('ref', '')} {head.get('sha', '')}\")
+" 2>/dev/null | while read -r pr_num branch_name head_sha; do
+        if [ -z "$pr_num" ] || [ -z "$branch_name" ]; then continue; fi
+        PR_BRANCH="gh-pr-${pr_num}/${branch_name}"
+        # Skip when the local ref already sits at the PR head SHA (--verify --quiet: empty output if the ref is missing)
+        EXISTING=$(git rev-parse --verify --quiet "refs/heads/$PR_BRANCH" 2>/dev/null || true)
+        if [ "$EXISTING" = "$head_sha" ]; then continue; fi
+        # Leading '+' forces the ref update: after a force-push the new PR head is not a fast-forward of the old one
+        git fetch origin "+refs/pull/${pr_num}/head:refs/heads/$PR_BRANCH" >> "$LOG" 2>&1 && \
+            log "Fetched fork PR #$pr_num -> $PR_BRANCH" || \
+            log "WARN: Failed to fetch fork PR #$pr_num"
+    done
+fi
+
 # Step 2.5: GitHub main -> Forgejo main (ff-only)
 # If a PR was merged on GitHub, GitHub main is ahead of Forgejo main.
 # Fast-forward Forgejo main to match — safe because ff-only guarantees no divergence.
@@ -108,10 +139,18 @@ if [ -n "$GITHUB_ONLY" ]; then FORGEJO_TOKEN=$(cat /opt/teleo-eval/secrets/forgejo-admin-token 2>/dev/null) for branch in $GITHUB_ONLY; do log "New from GitHub: $branch -> Forgejo" - git push forgejo "refs/remotes/origin/$branch:refs/heads/$branch" >> "$LOG" 2>&1 || { - log "WARN: Failed to push $branch to Forgejo" - continue - } + # Fork PR branches live as local refs (from Step 2.1), not on origin remote + if [[ "$branch" == gh-pr-* ]]; then + git push forgejo "refs/heads/$branch:refs/heads/$branch" >> "$LOG" 2>&1 || { + log "WARN: Failed to push fork PR branch $branch to Forgejo" + continue + } + else + git push forgejo "refs/remotes/origin/$branch:refs/heads/$branch" >> "$LOG" 2>&1 || { + log "WARN: Failed to push $branch to Forgejo" + continue + } + fi # Auto-create PR on Forgejo for mirrored branches (external contributor path) # Skip pipeline-internal branches case "$branch" in @@ -141,7 +180,17 @@ for line in sys.stdin: print('no') " "$branch" 2>/dev/null || echo "no") if [ "$HAS_PR" = "no" ]; then - PR_TITLE=$(echo "$branch" | sed 's|/|: |;s/-/ /g') + # Build PR title — for fork PRs, use the GitHub PR title + if [[ "$branch" == gh-pr-* ]]; then + FORK_GH_NUM=$(echo "$branch" | sed 's|gh-pr-\([0-9]*\)/.*|\1|') + GITHUB_PAT_T=$(cat "$GITHUB_PAT_FILE" 2>/dev/null | tr -d '[:space:]') + PR_TITLE=$(curl -sf "https://api.github.com/repos/$GITHUB_REPO/pulls/$FORK_GH_NUM" \ + -H "Authorization: token $GITHUB_PAT_T" 2>/dev/null | \ + python3 -c "import sys,json; print(json.load(sys.stdin).get('title',''))" 2>/dev/null || true) + [ -z "$PR_TITLE" ] && PR_TITLE=$(echo "$branch" | sed 's|/|: |;s/-/ /g') + else + PR_TITLE=$(echo "$branch" | sed 's|/|: |;s/-/ /g') + fi RESULT=$(curl -sf -X POST "http://localhost:3000/api/v1/repos/teleo/teleo-codex/pulls" \ -H "Authorization: token $FORGEJO_TOKEN" \ -H "Content-Type: application/json" \ @@ -150,17 +199,22 @@ print('no') if [ -n "$PR_NUM" ]; then log "Auto-created PR #$PR_NUM on Forgejo for $branch" 
# Step 4.5: Link GitHub PR to Forgejo PR in pipeline DB - GITHUB_PAT=$(cat "$GITHUB_PAT_FILE" 2>/dev/null | tr -d '[:space:]') - if [ -n "$GITHUB_PAT" ]; then - GH_PR_NUM=$(curl -sf "https://api.github.com/repos/$GITHUB_REPO/pulls?head=living-ip:$branch&state=all" \ - -H "Authorization: token $GITHUB_PAT" 2>/dev/null | \ - python3 -c "import sys,json; prs=json.load(sys.stdin); print(prs[0]['number'] if prs else '')" 2>/dev/null || true) - if [ -n "$GH_PR_NUM" ]; then - sqlite3 "$PIPELINE_DB" "UPDATE prs SET github_pr = $GH_PR_NUM WHERE number = $PR_NUM;" 2>/dev/null && \ - log "Linked GitHub PR #$GH_PR_NUM -> Forgejo PR #$PR_NUM" || \ - log "WARN: Failed to link GitHub PR #$GH_PR_NUM to Forgejo PR #$PR_NUM in DB" + if [[ "$branch" == gh-pr-* ]]; then + GH_PR_NUM=$(echo "$branch" | sed 's|gh-pr-\([0-9]*\)/.*|\1|') + else + GITHUB_PAT=$(cat "$GITHUB_PAT_FILE" 2>/dev/null | tr -d '[:space:]') + GH_PR_NUM="" + if [ -n "$GITHUB_PAT" ]; then + GH_PR_NUM=$(curl -sf "https://api.github.com/repos/$GITHUB_REPO/pulls?head=living-ip:$branch&state=all" \ + -H "Authorization: token $GITHUB_PAT" 2>/dev/null | \ + python3 -c "import sys,json; prs=json.load(sys.stdin); print(prs[0]['number'] if prs else '')" 2>/dev/null || true) fi fi + if [ -n "$GH_PR_NUM" ]; then + sqlite3 "$PIPELINE_DB" "UPDATE prs SET github_pr = $GH_PR_NUM WHERE number = $PR_NUM;" 2>/dev/null && \ + log "Linked GitHub PR #$GH_PR_NUM -> Forgejo PR #$PR_NUM" || \ + log "WARN: Failed to link GitHub PR #$GH_PR_NUM to Forgejo PR #$PR_NUM in DB" + fi else log "WARN: Failed to auto-create PR for $branch" fi diff --git a/lib/contributor.py b/lib/contributor.py index 5080af2..e825384 100644 --- a/lib/contributor.py +++ b/lib/contributor.py @@ -153,11 +153,24 @@ async def record_contributor_attribution(conn, pr_number: int, branch: str, git_ agent_id = agent_id_match.group(1).strip() if agent_id_match else None upsert_contributor(conn, handle, agent_id, current_role, today) - # Fallback: if no attribution block found, 
credit the branch agent as extractor + # Fallback: if no attribution block found, try git commit author then branch agent if not agents_found: - row = conn.execute("SELECT agent FROM prs WHERE number = ?", (pr_number,)).fetchone() - if row and row["agent"]: - upsert_contributor(conn, row["agent"].lower(), None, "extractor", today) + # For external contributors: parse git commit author as attribution source + rc_author, author_output = await git_fn( + "log", f"origin/main..origin/{branch}", "--format=%an", "-1", + timeout=10, + ) + if rc_author == 0 and author_output.strip(): + author_name = author_output.strip().lower() + # Skip generic/bot authors — fall through to branch agent + if author_name not in ("m3taversal", "teleo", "pipeline", ""): + upsert_contributor(conn, author_name, None, "extractor", today) + agents_found.add(author_name) + + if not agents_found: + row = conn.execute("SELECT agent FROM prs WHERE number = ?", (pr_number,)).fetchone() + if row and row["agent"] and row["agent"] != "external": + upsert_contributor(conn, row["agent"].lower(), None, "extractor", today) def upsert_contributor( diff --git a/lib/db.py b/lib/db.py index a7c1f3d..1e60075 100644 --- a/lib/db.py +++ b/lib/db.py @@ -208,6 +208,7 @@ BRANCH_PREFIX_MAP = { "leo": ("leo", "entity"), "reweave": ("pipeline", "reweave"), "fix": ("pipeline", "fix"), + "contrib": ("external", "contrib"), } @@ -217,6 +218,9 @@ def classify_branch(branch: str) -> tuple[str, str]: Returns ('unknown', 'unknown') and logs a warning for unrecognized prefixes. 
""" prefix = branch.split("/", 1)[0] if "/" in branch else branch + # Fork PR branches: gh-pr-N/original-branch + if prefix.startswith("gh-pr-"): + return ("external", "contrib") result = BRANCH_PREFIX_MAP.get(prefix) if result is None: logger.warning("Unknown branch prefix %r in branch %r — defaulting to ('unknown', 'unknown')", prefix, branch) diff --git a/tests/test_contributor.py b/tests/test_contributor.py index 7de0463..993133a 100644 --- a/tests/test_contributor.py +++ b/tests/test_contributor.py @@ -200,7 +200,8 @@ def test_record_fallback_to_pr_agent(): async def run(): with patch("lib.contributor.get_pr_diff", new_callable=AsyncMock, return_value=mock_diff): - git_fn = AsyncMock(return_value=(0, "no trailers here")) + # First call: trailer log (no trailers), Second call: author log (bot name → skipped) + git_fn = AsyncMock(side_effect=[(0, "no trailers here"), (0, "m3taversal")]) with patch("lib.contributor.config") as mock_config: mock_config.CONTRIBUTOR_TIER_RULES = { "veteran": {"claims_merged": 50, "min_days_since_first": 90, "challenges_survived": 5}, @@ -213,6 +214,29 @@ def test_record_fallback_to_pr_agent(): assert row is not None assert row["extractor_count"] == 1 + +def test_record_fallback_to_git_author(): + """External contributors get credited via git commit author.""" + conn = _make_attribution_db() + conn.execute("INSERT INTO prs VALUES (200, 'contrib', 'external')") + mock_diff = "+++ b/domains/ai-alignment/claim.md\n+new content\n" + + async def run(): + with patch("lib.contributor.get_pr_diff", new_callable=AsyncMock, return_value=mock_diff): + # First call: trailer log (no trailers), Second call: author log (external name) + git_fn = AsyncMock(side_effect=[(0, "no trailers"), (0, "Cameron-S1")]) + with patch("lib.contributor.config") as mock_config: + mock_config.CONTRIBUTOR_TIER_RULES = { + "veteran": {"claims_merged": 50, "min_days_since_first": 90, "challenges_survived": 5}, + "contributor": {"claims_merged": 10}, + } + await 
record_contributor_attribution(conn, 200, "contrib/cameron/challenge", git_fn) + + asyncio.run(run()) + row = conn.execute("SELECT * FROM contributors WHERE handle = 'cameron-s1'").fetchone() + assert row is not None + assert row["extractor_count"] == 1 + def test_record_parses_pentagon_trailer(): conn = _make_attribution_db() mock_diff = "+++ b/domains/crypto/claim.md\n+new file content\n"