feat: external contributor pipeline — fork PR handling, attribution, prefix recognition

- Mirror: fetch GitHub fork PR refs (refs/pull/*/head), push to Forgejo as gh-pr-N/branch
- Mirror: fork PRs auto-create Forgejo PR with GitHub PR title, link github_pr in DB
- db.py: classify_branch now maps the `contrib` prefix and `gh-pr-*` fork PR branches to ("external", "contrib")
- contributor.py: when no attribution block is found, fall back to the git commit author first, then to the branch agent
- contributor.py: skip bot/generic commit authors (m3taversal, teleo, pipeline) in that fallback
- Tests: fix fallback test for new git author path, add external contributor test

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
m3taversal 2026-04-16 18:14:01 +01:00
parent 0b28c71e11
commit 13f21f7732
4 changed files with 114 additions and 19 deletions

View file

@ -47,6 +47,37 @@ fi
log "Fetching from GitHub..."
git fetch origin --prune >> "$LOG" 2>&1 || log "WARN: GitHub fetch failed"
# Step 2.1: Fetch GitHub fork PR refs
# Fork-based PRs don't create branches on origin — GitHub exposes them as
# refs/pull/N/head instead. Fetch each open fork PR into a local branch named
# gh-pr-N/<head-branch> so it can be pushed to Forgejo for evaluation.
GITHUB_PAT_STEP2=$(cat "$GITHUB_PAT_FILE" 2>/dev/null | tr -d '[:space:]')
if [ -n "$GITHUB_PAT_STEP2" ]; then
    # List open PRs (single page, up to 100). If the request fails, fall back
    # to an empty JSON list so the loop below simply does nothing.
    OPEN_PRS=$(curl -sf "https://api.github.com/repos/$GITHUB_REPO/pulls?state=open&per_page=100" \
        -H "Authorization: token $GITHUB_PAT_STEP2" 2>/dev/null || echo "[]")
    # The inline Python prints one "<number> <head ref> <head sha>" line per
    # fork PR; the while loop consumes those lines.
    echo "$OPEN_PRS" | python3 -c "
import sys, json
prs = json.load(sys.stdin)
for pr in prs:
    head = pr.get('head', {})
    # Only process fork PRs (repo differs from base repo)
    base_repo = pr.get('base', {}).get('repo', {}).get('full_name', '')
    head_repo = head.get('repo', {}) or {}
    head_full = head_repo.get('full_name', '')
    if head_full and head_full != base_repo:
        print(f\"{pr['number']} {head.get('ref', '')} {head.get('sha', '')}\")
" 2>/dev/null | while read -r pr_num branch_name head_sha; do
        # Skip malformed lines (missing PR number or branch name)
        if [ -z "$pr_num" ] || [ -z "$branch_name" ]; then continue; fi
        PR_BRANCH="gh-pr-${pr_num}/${branch_name}"
        # Check if we already have this ref at the right SHA
        EXISTING=$(git rev-parse "refs/heads/$PR_BRANCH" 2>/dev/null || true)
        if [ "$EXISTING" = "$head_sha" ]; then continue; fi
        # Fetch the PR ref and create/update the local branch.
        # The leading '+' forces the update: without it, git rejects the
        # non-fast-forward update that results when a contributor force-pushes
        # their PR, and the local branch would stay stuck on the old SHA.
        if git fetch origin "+refs/pull/${pr_num}/head:refs/heads/$PR_BRANCH" >> "$LOG" 2>&1; then
            log "Fetched fork PR #$pr_num -> $PR_BRANCH"
        else
            log "WARN: Failed to fetch fork PR #$pr_num"
        fi
    done
fi
# Step 2.5: GitHub main -> Forgejo main (ff-only)
# If a PR was merged on GitHub, GitHub main is ahead of Forgejo main.
# Fast-forward Forgejo main to match — safe because ff-only guarantees no divergence.
@ -108,10 +139,18 @@ if [ -n "$GITHUB_ONLY" ]; then
FORGEJO_TOKEN=$(cat /opt/teleo-eval/secrets/forgejo-admin-token 2>/dev/null)
for branch in $GITHUB_ONLY; do
log "New from GitHub: $branch -> Forgejo"
git push forgejo "refs/remotes/origin/$branch:refs/heads/$branch" >> "$LOG" 2>&1 || {
log "WARN: Failed to push $branch to Forgejo"
continue
}
# Fork PR branches live as local refs (from Step 2.1), not on origin remote
if [[ "$branch" == gh-pr-* ]]; then
git push forgejo "refs/heads/$branch:refs/heads/$branch" >> "$LOG" 2>&1 || {
log "WARN: Failed to push fork PR branch $branch to Forgejo"
continue
}
else
git push forgejo "refs/remotes/origin/$branch:refs/heads/$branch" >> "$LOG" 2>&1 || {
log "WARN: Failed to push $branch to Forgejo"
continue
}
fi
# Auto-create PR on Forgejo for mirrored branches (external contributor path)
# Skip pipeline-internal branches
case "$branch" in
@ -141,7 +180,17 @@ for line in sys.stdin:
print('no')
" "$branch" 2>/dev/null || echo "no")
if [ "$HAS_PR" = "no" ]; then
PR_TITLE=$(echo "$branch" | sed 's|/|: |;s/-/ /g')
# Build PR title — for fork PRs, use the GitHub PR title
if [[ "$branch" == gh-pr-* ]]; then
FORK_GH_NUM=$(echo "$branch" | sed 's|gh-pr-\([0-9]*\)/.*|\1|')
GITHUB_PAT_T=$(cat "$GITHUB_PAT_FILE" 2>/dev/null | tr -d '[:space:]')
PR_TITLE=$(curl -sf "https://api.github.com/repos/$GITHUB_REPO/pulls/$FORK_GH_NUM" \
-H "Authorization: token $GITHUB_PAT_T" 2>/dev/null | \
python3 -c "import sys,json; print(json.load(sys.stdin).get('title',''))" 2>/dev/null || true)
[ -z "$PR_TITLE" ] && PR_TITLE=$(echo "$branch" | sed 's|/|: |;s/-/ /g')
else
PR_TITLE=$(echo "$branch" | sed 's|/|: |;s/-/ /g')
fi
RESULT=$(curl -sf -X POST "http://localhost:3000/api/v1/repos/teleo/teleo-codex/pulls" \
-H "Authorization: token $FORGEJO_TOKEN" \
-H "Content-Type: application/json" \
@ -150,17 +199,22 @@ print('no')
if [ -n "$PR_NUM" ]; then
log "Auto-created PR #$PR_NUM on Forgejo for $branch"
# Step 4.5: Link GitHub PR to Forgejo PR in pipeline DB
GITHUB_PAT=$(cat "$GITHUB_PAT_FILE" 2>/dev/null | tr -d '[:space:]')
if [ -n "$GITHUB_PAT" ]; then
GH_PR_NUM=$(curl -sf "https://api.github.com/repos/$GITHUB_REPO/pulls?head=living-ip:$branch&state=all" \
-H "Authorization: token $GITHUB_PAT" 2>/dev/null | \
python3 -c "import sys,json; prs=json.load(sys.stdin); print(prs[0]['number'] if prs else '')" 2>/dev/null || true)
if [ -n "$GH_PR_NUM" ]; then
sqlite3 "$PIPELINE_DB" "UPDATE prs SET github_pr = $GH_PR_NUM WHERE number = $PR_NUM;" 2>/dev/null && \
log "Linked GitHub PR #$GH_PR_NUM -> Forgejo PR #$PR_NUM" || \
log "WARN: Failed to link GitHub PR #$GH_PR_NUM to Forgejo PR #$PR_NUM in DB"
if [[ "$branch" == gh-pr-* ]]; then
GH_PR_NUM=$(echo "$branch" | sed 's|gh-pr-\([0-9]*\)/.*|\1|')
else
GITHUB_PAT=$(cat "$GITHUB_PAT_FILE" 2>/dev/null | tr -d '[:space:]')
GH_PR_NUM=""
if [ -n "$GITHUB_PAT" ]; then
GH_PR_NUM=$(curl -sf "https://api.github.com/repos/$GITHUB_REPO/pulls?head=living-ip:$branch&state=all" \
-H "Authorization: token $GITHUB_PAT" 2>/dev/null | \
python3 -c "import sys,json; prs=json.load(sys.stdin); print(prs[0]['number'] if prs else '')" 2>/dev/null || true)
fi
fi
if [ -n "$GH_PR_NUM" ]; then
sqlite3 "$PIPELINE_DB" "UPDATE prs SET github_pr = $GH_PR_NUM WHERE number = $PR_NUM;" 2>/dev/null && \
log "Linked GitHub PR #$GH_PR_NUM -> Forgejo PR #$PR_NUM" || \
log "WARN: Failed to link GitHub PR #$GH_PR_NUM to Forgejo PR #$PR_NUM in DB"
fi
else
log "WARN: Failed to auto-create PR for $branch"
fi

View file

@ -153,11 +153,24 @@ async def record_contributor_attribution(conn, pr_number: int, branch: str, git_
agent_id = agent_id_match.group(1).strip() if agent_id_match else None
upsert_contributor(conn, handle, agent_id, current_role, today)
# Fallback: if no attribution block found, credit the branch agent as extractor
# Fallback: if no attribution block found, try git commit author then branch agent
if not agents_found:
row = conn.execute("SELECT agent FROM prs WHERE number = ?", (pr_number,)).fetchone()
if row and row["agent"]:
upsert_contributor(conn, row["agent"].lower(), None, "extractor", today)
# For external contributors: parse git commit author as attribution source
rc_author, author_output = await git_fn(
"log", f"origin/main..origin/{branch}", "--format=%an", "-1",
timeout=10,
)
if rc_author == 0 and author_output.strip():
author_name = author_output.strip().lower()
# Skip generic/bot authors — fall through to branch agent
if author_name not in ("m3taversal", "teleo", "pipeline", ""):
upsert_contributor(conn, author_name, None, "extractor", today)
agents_found.add(author_name)
if not agents_found:
row = conn.execute("SELECT agent FROM prs WHERE number = ?", (pr_number,)).fetchone()
if row and row["agent"] and row["agent"] != "external":
upsert_contributor(conn, row["agent"].lower(), None, "extractor", today)
def upsert_contributor(

View file

@ -208,6 +208,7 @@ BRANCH_PREFIX_MAP = {
"leo": ("leo", "entity"),
"reweave": ("pipeline", "reweave"),
"fix": ("pipeline", "fix"),
"contrib": ("external", "contrib"),
}
@ -217,6 +218,9 @@ def classify_branch(branch: str) -> tuple[str, str]:
Returns ('unknown', 'unknown') and logs a warning for unrecognized prefixes.
"""
prefix = branch.split("/", 1)[0] if "/" in branch else branch
# Fork PR branches: gh-pr-N/original-branch
if prefix.startswith("gh-pr-"):
return ("external", "contrib")
result = BRANCH_PREFIX_MAP.get(prefix)
if result is None:
logger.warning("Unknown branch prefix %r in branch %r — defaulting to ('unknown', 'unknown')", prefix, branch)

View file

@ -200,7 +200,8 @@ def test_record_fallback_to_pr_agent():
async def run():
with patch("lib.contributor.get_pr_diff", new_callable=AsyncMock, return_value=mock_diff):
git_fn = AsyncMock(return_value=(0, "no trailers here"))
# First call: trailer log (no trailers), Second call: author log (bot name → skipped)
git_fn = AsyncMock(side_effect=[(0, "no trailers here"), (0, "m3taversal")])
with patch("lib.contributor.config") as mock_config:
mock_config.CONTRIBUTOR_TIER_RULES = {
"veteran": {"claims_merged": 50, "min_days_since_first": 90, "challenges_survived": 5},
@ -213,6 +214,29 @@ def test_record_fallback_to_pr_agent():
assert row is not None
assert row["extractor_count"] == 1
def test_record_fallback_to_git_author():
    """Without attribution trailers, the git commit author is credited as extractor.

    The PR row is tagged with the 'external' agent, and the mocked author log
    returns "Cameron-S1"; the contributor is expected under the lowercased
    handle 'cameron-s1' with a single extractor credit.
    """
    db = _make_attribution_db()
    db.execute("INSERT INTO prs VALUES (200, 'contrib', 'external')")
    diff_text = "+++ b/domains/ai-alignment/claim.md\n+new content\n"
    tier_rules = {
        "veteran": {"claims_merged": 50, "min_days_since_first": 90, "challenges_survived": 5},
        "contributor": {"claims_merged": 10},
    }

    async def _exercise():
        diff_patch = patch("lib.contributor.get_pr_diff", new_callable=AsyncMock, return_value=diff_text)
        config_patch = patch("lib.contributor.config")
        with diff_patch, config_patch as cfg:
            # Replies are consumed in call order:
            #   1st call — trailer log (no trailers), 2nd call — author log (external name)
            fake_git = AsyncMock(side_effect=[(0, "no trailers"), (0, "Cameron-S1")])
            cfg.CONTRIBUTOR_TIER_RULES = tier_rules
            await record_contributor_attribution(db, 200, "contrib/cameron/challenge", fake_git)

    asyncio.run(_exercise())

    credited = db.execute("SELECT * FROM contributors WHERE handle = 'cameron-s1'").fetchone()
    assert credited is not None
    assert credited["extractor_count"] == 1
def test_record_parses_pentagon_trailer():
conn = _make_attribution_db()
mock_diff = "+++ b/domains/crypto/claim.md\n+new file content\n"