From 1decf0959817fd2886cccc79007429da9c1a8737 Mon Sep 17 00:00:00 2001 From: m3taversal Date: Tue, 28 Apr 2026 18:15:13 +0100 Subject: [PATCH] fix(mirror): Step 4.5 writes submitted_by from GitHub user.login MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes the systematic external-PR attribution gap diagnosed on FwazB PR #4066. The Forgejo PR was being created via admin-token (m3taversal), so prs.submitted_by ended up as the bot identity. record_contributor_attribution treats m3taversal as a bot and finds no other signal for fork PRs (commit authors are bot-rewritten by sync-mirror), so zero author events emit. Mechanism — for gh-pr-* branches in the auto-create path: 1. After deriving GH_PR_NUM from branch name, GET /pulls/{N} from GitHub API 2. Extract user.login (e.g. "FwazB") from the PR's head author 3. Validate against GitHub username regex: ^[a-zA-Z0-9]([a-zA-Z0-9-]{0,37}[a-zA-Z0-9])?$ 4. Lowercase to match contributor.py canonical handle storage 5. Include submitted_by in the same UPDATE that sets github_pr + source_channel The regex doubles as the SQL-injection safety boundary (no parametric binding in bash sqlite3). 14 cases tested locally including SQL injection probes — all rejected, all real handles pass. Failure modes: - API call fails or returns no user.login → fall back to link-only UPDATE (existing behavior). Better than failing the whole step. - Regex rejects malformed login → same fall-back. Preserves audit trail. Smoke-tested against real FwazB PR #90 on VPS: extracts "FwazB", lowercases to "fwazb", regex passes, would write submitted_by='fwazb'. Once deployed, record_contributor_attribution will trust submitted_by as fallback when no agent trailer found, emit author event with weight 0.30 → FwazB-class contributors auto-attributed end-to-end with zero manual backfill. Architectural decisions per Ship's Apr 28 sign-off: - (a) Sweep stays zero-API: no per-row API calls in Step 0 self-heal. This Step 4.5 fix is create-time only. Existing rows (FwazB) already manually backfilled — no further population to backfill. - (b) Skip _BOT_AUTHORS exception (#2 in original ticket): once submitted_by is correct, the bot filter doesn't fire on external PRs anymore. - (c) Defer frontmatter rewriting: convenience, not load-bearing. --- deploy/sync-mirror.sh | 40 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 36 insertions(+), 4 deletions(-) diff --git a/deploy/sync-mirror.sh b/deploy/sync-mirror.sh index 7b3a4e2..9963a67 100755 --- a/deploy/sync-mirror.sh +++ b/deploy/sync-mirror.sh @@ -275,9 +275,26 @@ print('no') fi log "Auto-created PR #$PR_NUM on Forgejo for $branch" - # Step 4.5: Link GitHub PR to Forgejo PR in pipeline DB + # Step 4.5: Link GitHub PR to Forgejo PR + populate submitted_by from GitHub user.login. + # + # Why submitted_by here: sync-mirror creates the Forgejo PR using the admin + # token (user=m3taversal), so the Forgejo PR's submitter is bot-identity, not + # the GitHub author (FwazB, etc). Without this fix, contribution_events emits + # zero author events for external PRs because record_contributor_attribution + # treats m3taversal as a bot and finds no other signal. Architectural fix per + # Ship + Cory after the FwazB PR #4066 attribution gap (Apr 28). + local GH_USER + GH_USER="" if [[ "$branch" == gh-pr-* ]]; then GH_PR_NUM=$(echo "$branch" | sed 's|gh-pr-\([0-9]*\)/.*|\1|') + # Look up GitHub PR author for submitted_by attribution. + local PAT_GHU + PAT_GHU=$(cat "$GITHUB_PAT_FILE" 2>/dev/null | tr -d '[:space:]') + if [ -n "$PAT_GHU" ] && [[ "$GH_PR_NUM" =~ ^[0-9]+$ ]]; then + GH_USER=$(curl -sf "https://api.github.com/repos/$GITHUB_REPO/pulls/$GH_PR_NUM" \ + -H "Authorization: token $PAT_GHU" 2>/dev/null | \ + python3 -c "import sys,json; print((json.load(sys.stdin).get('user') or {}).get('login',''))" 2>/dev/null || true) + fi else local PAT PAT=$(cat "$GITHUB_PAT_FILE" 2>/dev/null | tr -d '[:space:]') @@ -289,9 +306,24 @@ print('no') fi fi if [[ "$GH_PR_NUM" =~ ^[0-9]+$ ]] && [[ "$PR_NUM" =~ ^[0-9]+$ ]]; then - sqlite3 "$PIPELINE_DB" "UPDATE prs SET github_pr = $GH_PR_NUM, source_channel = 'github' WHERE number = $PR_NUM;" 2>/dev/null && \ - log "Linked GitHub PR #$GH_PR_NUM -> Forgejo PR #$PR_NUM" || \ - log "WARN: Failed to link GitHub PR #$GH_PR_NUM to Forgejo PR #$PR_NUM in DB" + # GitHub usernames: 1-39 chars, alphanumeric + hyphen, can't start/end with hyphen. + # Strict regex is the SQL-injection safety boundary (no bash sqlite3 parametric binding). + if [[ "$GH_USER" =~ ^[a-zA-Z0-9]([a-zA-Z0-9-]{0,37}[a-zA-Z0-9])?$ ]]; then + # Lowercase for canonical handle storage (matches contributor.py normalize) + local GH_USER_LC + GH_USER_LC=$(echo "$GH_USER" | tr '[:upper:]' '[:lower:]') + sqlite3 "$PIPELINE_DB" "UPDATE prs SET github_pr = $GH_PR_NUM, source_channel = 'github', submitted_by = '$GH_USER_LC' WHERE number = $PR_NUM;" 2>/dev/null && \ + log "Linked GitHub PR #$GH_PR_NUM -> Forgejo PR #$PR_NUM (author: $GH_USER_LC)" || \ + log "WARN: Failed to link GitHub PR #$GH_PR_NUM to Forgejo PR #$PR_NUM in DB" + else + # Fall back to link-only when user.login lookup failed or was malformed. + # Still better than no link; record_contributor_attribution can use + # other signals (commit author trailer, etc) — author event may still + # not emit, but submitted_by stays NULL/whatever discovery set. + sqlite3 "$PIPELINE_DB" "UPDATE prs SET github_pr = $GH_PR_NUM, source_channel = 'github' WHERE number = $PR_NUM;" 2>/dev/null && \ + log "Linked GitHub PR #$GH_PR_NUM -> Forgejo PR #$PR_NUM (author lookup failed)" || \ + log "WARN: Failed to link GitHub PR #$GH_PR_NUM to Forgejo PR #$PR_NUM in DB" + fi fi done }