Two historical GitHub PRs merged before our sync-mirror.sh tracked github_pr: - GitHub PR #68: alexastrum, 6 claims, merged Mar 9 2026 via squash merge - GitHub PR #88: Cameron-S1, 1 claim, merged early April Their claim files were lost during a Forgejo→GitHub mirror overwrite and later recovered via direct-to-main commits (dba00a79, da64f805). Because the recovery commits bypassed the pipeline, our 'prs' table has no row to attach originator events to — all 4 backfill-events.py strategies returned None, leaving Alex + Cameron at 0 originator credits despite real historical work. This reconstructs synthetic 'prs' rows so the existing github_pr strategy in backfill-events.py attaches 7 originator events on re-run: - Numbers 900068 / 900088 live in a clearly-synthetic range that cannot collide with real Forgejo PRs (current max: 3941) - github_pr=68/88 wires up the existing lookup strategy - submitted_by=alexastrum / cameron-s1 establishes author attribution - merged_at from the recovery commit messages (not recovery-commit time) - last_error tags the rows as synthetic for future audits Idempotent: INSERT OR IGNORE via check on number OR github_pr. Safe to replay. Reversible: DELETE FROM prs WHERE number IN (900068, 900088). After applying this script: python3 ops/backfill-events.py will credit Alex with 6 author + 6 originator events (author=1.80, originator=0.90) and Cameron with 1 author + 1 originator (0.30 + 0.15), all dated to the historical merge dates — so 7d/30d leaderboard windows show them correctly. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
10d5c275da
commit
762fd4233e
1 changed files with 135 additions and 0 deletions
135
scripts/backfill-synthetic-recovery-prs.py
Normal file
135
scripts/backfill-synthetic-recovery-prs.py
Normal file
|
|
@ -0,0 +1,135 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Reconstruct synthetic `prs` rows for historical GitHub PRs lost pre-mirror-wiring.
|
||||||
|
|
||||||
|
Two PRs merged on GitHub before our sync-mirror.sh tracked `github_pr`:
|
||||||
|
- GitHub PR #68: alexastrum — 6 claims, merged 2026-03-09 via GitHub squash,
|
||||||
|
recovered to Forgejo via commit dba00a79 (Apr 16, after mirror erased files)
|
||||||
|
- GitHub PR #88: Cameron-S1 — 1 claim, recovered via commit da64f805
|
||||||
|
|
||||||
|
The recovery commits wrote the files directly to main, so our `prs` table has
|
||||||
|
no row to attach originator events to — the backfill-events.py strategies all
|
||||||
|
return NULL. We reconstruct one synthetic `prs` row per historical GitHub PR so
|
||||||
|
the events pipeline (and `github_pr` strategy in backfill-events) can credit
|
||||||
|
Alex and Cameron properly.
|
||||||
|
|
||||||
|
Numbers 900000+ are clearly synthetic and won't collide with real Forgejo PRs.
|
||||||
|
|
||||||
|
Idempotent: rows whose `number` or `github_pr` already exists are skipped, so the script is safe to replay.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python3 scripts/backfill-synthetic-recovery-prs.py --dry-run
|
||||||
|
python3 scripts/backfill-synthetic-recovery-prs.py
|
||||||
|
"""
|
||||||
|
import argparse
|
||||||
|
import os
|
||||||
|
import sqlite3
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# Database location; override via the PIPELINE_DB environment variable.
DB_PATH = os.environ.get("PIPELINE_DB", "/opt/teleo-eval/pipeline/pipeline.db")

# Synthetic PR numbers live at/above this floor. Abort the backfill once real
# Forgejo numbering gets within NUMBER_HEADROOM of it (current real max ~3941,
# so there is enormous slack today).
SYNTHETIC_FLOOR = 900000
NUMBER_HEADROOM = 10000

# Historical GitHub PRs recovered via direct-to-main commits.
# Original GitHub merge dates come from the recovery commit messages,
# NOT the (much later) recovery-commit timestamps.
RECOVERY_PRS = [
    {
        "number": 900068,
        "github_pr": 68,
        "branch": "gh-pr-68",
        "status": "merged",
        "domain": "ai-alignment",
        "commit_type": "knowledge",
        "tier": "STANDARD",
        "leo_verdict": "approve",
        "domain_verdict": "approve",
        "submitted_by": "alexastrum",
        "source_channel": "github",
        "description": "Multi-agent git workflows production maturity | Cryptographic agent trust ratings | Defense in depth for AI agent oversight | Deterministic policy engines below LLM layer | Knowledge validation four-layer architecture | Structurally separating proposer and reviewer agents",
        "merged_at": "2026-03-09 00:00:00",
        "created_at": "2026-03-08 00:00:00",
        "last_error": "synthetic_recovery: GitHub PR #68 pre-mirror-wiring reconstruction (commit dba00a79)",
    },
    {
        "number": 900088,
        "github_pr": 88,
        "branch": "gh-pr-88",
        "status": "merged",
        "domain": "ai-alignment",
        "commit_type": "knowledge",
        "tier": "STANDARD",
        "leo_verdict": "approve",
        "domain_verdict": "approve",
        "submitted_by": "cameron-s1",
        "source_channel": "github",
        "description": "Orthogonality is an artefact of specification architectures not a property of intelligence itself",
        "merged_at": "2026-04-01 00:00:00",
        "created_at": "2026-04-01 00:00:00",
        "last_error": "synthetic_recovery: GitHub PR #88 pre-mirror-wiring reconstruction (commit da64f805)",
    },
]


def main(argv=None, db_path=None):
    """Insert the synthetic recovery rows from RECOVERY_PRS into `prs`.

    Replay-safe: a row is skipped when one with the same `number` or
    `github_pr` already exists.

    Args:
        argv: Optional CLI argument list (defaults to ``sys.argv[1:]``).
            Supports ``--dry-run`` to report planned inserts without writing.
        db_path: Optional database path override (defaults to ``DB_PATH``);
            lets callers/tests point at a scratch database.

    Exits:
        1 if the database file does not exist.
        2 if real PR numbering has encroached on the synthetic range.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--dry-run", action="store_true")
    args = parser.parse_args(argv)

    db_path = db_path or DB_PATH
    if not Path(db_path).exists():
        print(f"ERROR: DB not found at {db_path}", file=sys.stderr)
        sys.exit(1)

    conn = sqlite3.connect(db_path, timeout=30)
    conn.row_factory = sqlite3.Row
    try:
        # Guard against colliding with real PRs. NOTE: the original compared
        # the filtered max against SYNTHETIC_FLOOR itself, which the WHERE
        # clause made unreachable; we instead require NUMBER_HEADROOM of slack.
        max_real = conn.execute(
            "SELECT MAX(number) FROM prs WHERE number < ?", (SYNTHETIC_FLOOR,)
        ).fetchone()[0] or 0
        print(f"Max real Forgejo PR number: {max_real}")
        if max_real >= SYNTHETIC_FLOOR - NUMBER_HEADROOM:
            print(
                f"ERROR: real PR numbers ({max_real}) are within "
                f"{NUMBER_HEADROOM} of the synthetic range "
                f"(>= {SYNTHETIC_FLOOR}). Aborting.",
                file=sys.stderr,
            )
            sys.exit(2)

        inserted = 0
        skipped = 0
        for row in RECOVERY_PRS:
            # Existence check covers both the synthetic number and the real
            # github_pr so a replay (or a later organic row) is never doubled.
            existing = conn.execute(
                "SELECT number FROM prs WHERE number = ? OR github_pr = ?",
                (row["number"], row["github_pr"]),
            ).fetchone()
            if existing:
                print(f"  PR #{row['number']} (github_pr={row['github_pr']}): already exists — skip")
                skipped += 1
                continue
            print(f"  {'(dry-run) ' if args.dry_run else ''}INSERT synthetic PR #{row['number']} "
                  f"(github_pr={row['github_pr']}, submitted_by={row['submitted_by']}, "
                  f"merged_at={row['merged_at']})")
            if not args.dry_run:
                conn.execute(
                    """INSERT INTO prs (
                        number, github_pr, branch, status, domain, commit_type, tier,
                        leo_verdict, domain_verdict, submitted_by, source_channel,
                        description, merged_at, created_at, last_error
                    ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""",
                    (
                        row["number"], row["github_pr"], row["branch"], row["status"],
                        row["domain"], row["commit_type"], row["tier"],
                        row["leo_verdict"], row["domain_verdict"],
                        row["submitted_by"], row["source_channel"],
                        row["description"], row["merged_at"], row["created_at"],
                        row["last_error"],
                    ),
                )
            # Counted in dry-run too: reports how many rows WOULD be inserted.
            inserted += 1

        if not args.dry_run:
            conn.commit()
    finally:
        # Original leaked the connection; always release it, even on sys.exit(2).
        conn.close()

    print(f"\nInserted {inserted}, skipped {skipped}")
    if not args.dry_run and inserted:
        print("\nNext step: re-run backfill-events.py to attach originator events")
        print("  python3 ops/backfill-events.py")
|
|
||||||
|
|
||||||
|
# Script entry point — importing this module performs no database writes.
if __name__ == "__main__":
    main()
|
||||||
Loading…
Reference in a new issue