Two historical GitHub PRs merged before our sync-mirror.sh tracked github_pr:

- GitHub PR #68: alexastrum, 6 claims, merged Mar 9 2026 via squash merge
- GitHub PR #88: Cameron-S1, 1 claim, merged early April

Their claim files were lost during a Forgejo→GitHub mirror overwrite and later recovered via direct-to-main commits (dba00a79, da64f805). Because the recovery commits bypassed the pipeline, our 'prs' table has no row to attach originator events to — all 4 backfill-events.py strategies returned None, leaving Alex + Cameron at 0 originator credits despite real historical work.

This reconstructs synthetic 'prs' rows so the existing github_pr strategy in backfill-events.py attaches 7 originator events on re-run:

- Numbers 900068 / 900088 live in a clearly-synthetic range that cannot collide with real Forgejo PRs (current max: 3941)
- github_pr=68/88 wires up the existing lookup strategy
- submitted_by=alexastrum / cameron-s1 establishes author attribution
- merged_at from the recovery commit messages (not recovery-commit time)
- last_error tags the rows as synthetic for future audits

Idempotent: each row is inserted only when a SELECT on number OR github_pr finds no existing row (a check-then-insert, not INSERT OR IGNORE). Safe to replay.
Reversible: DELETE FROM prs WHERE number IN (900068, 900088).

After applying this script, running

    python3 ops/backfill-events.py

will credit Alex with 6 author + 6 originator events (author=1.80, originator=0.90) and Cameron with 1 author + 1 originator (0.30 + 0.15), all dated to the historical merge dates — so 7d/30d leaderboard windows show them correctly.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
10d5c275da
commit
762fd4233e
1 changed files with 135 additions and 0 deletions
135
scripts/backfill-synthetic-recovery-prs.py
Normal file
135
scripts/backfill-synthetic-recovery-prs.py
Normal file
|
|
@ -0,0 +1,135 @@
|
|||
#!/usr/bin/env python3
|
||||
"""Reconstruct synthetic `prs` rows for historical GitHub PRs lost pre-mirror-wiring.
|
||||
|
||||
Two PRs merged on GitHub before our sync-mirror.sh tracked `github_pr`:
|
||||
- GitHub PR #68: alexastrum — 6 claims, merged 2026-03-09 via GitHub squash,
|
||||
recovered to Forgejo via commit dba00a79 (Apr 16, after mirror erased files)
|
||||
- GitHub PR #88: Cameron-S1 — 1 claim, recovered via commit da64f805
|
||||
|
||||
The recovery commits wrote the files directly to main, so our `prs` table has
|
||||
no row to attach originator events to — the backfill-events.py strategies all
|
||||
return NULL. We reconstruct one synthetic `prs` row per historical GitHub PR so
|
||||
the events pipeline (and `github_pr` strategy in backfill-events) can credit
|
||||
Alex and Cameron properly.
|
||||
|
||||
Numbers 900000+ are clearly synthetic and won't collide with real Forgejo PRs.
|
||||
|
||||
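Idempotent: a row is inserted only when no existing `prs` row already has the
same `number` or `github_pr`, so the script is safe to replay.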
|
||||
|
||||
Usage:
|
||||
python3 scripts/backfill-synthetic-recovery-prs.py --dry-run
|
||||
python3 scripts/backfill-synthetic-recovery-prs.py
|
||||
"""
|
||||
import argparse
|
||||
import os
|
||||
import sqlite3
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Path of the pipeline SQLite database; the PIPELINE_DB environment variable
# overrides the default location.
DB_PATH = os.environ.get("PIPELINE_DB", "/opt/teleo-eval/pipeline/pipeline.db")


def _recovery_row(*, github_pr, submitted_by, description, merged_at, created_at, commit):
    """Build the synthetic `prs` row for one recovered GitHub PR.

    The synthetic PR number is 900000 + the GitHub PR number, the branch is
    ``gh-pr-<n>``, and ``last_error`` carries a ``synthetic_recovery:`` tag
    naming the recovery commit. All other columns are the same for every
    recovered PR.
    """
    return {
        "number": 900000 + github_pr,
        "github_pr": github_pr,
        "branch": f"gh-pr-{github_pr}",
        "status": "merged",
        "domain": "ai-alignment",
        "commit_type": "knowledge",
        "tier": "STANDARD",
        "leo_verdict": "approve",
        "domain_verdict": "approve",
        "submitted_by": submitted_by,
        "source_channel": "github",
        "description": description,
        "merged_at": merged_at,
        "created_at": created_at,
        "last_error": (
            f"synthetic_recovery: GitHub PR #{github_pr} "
            f"pre-mirror-wiring reconstruction (commit {commit})"
        ),
    }


# Historical GitHub PRs recovered via direct-to-main commits.
# Original GitHub merge dates come from the recovery commit messages.
RECOVERY_PRS = [
    _recovery_row(
        github_pr=68,
        submitted_by="alexastrum",
        description=(
            "Multi-agent git workflows production maturity"
            " | Cryptographic agent trust ratings"
            " | Defense in depth for AI agent oversight"
            " | Deterministic policy engines below LLM layer"
            " | Knowledge validation four-layer architecture"
            " | Structurally separating proposer and reviewer agents"
        ),
        merged_at="2026-03-09 00:00:00",
        created_at="2026-03-08 00:00:00",
        commit="dba00a79",
    ),
    _recovery_row(
        github_pr=88,
        submitted_by="cameron-s1",
        description=(
            "Orthogonality is an artefact of specification architectures "
            "not a property of intelligence itself"
        ),
        # created_at is set to the same timestamp as merged_at for this PR.
        merged_at="2026-04-01 00:00:00",
        created_at="2026-04-01 00:00:00",
        commit="da64f805",
    ),
]
|
||||
|
||||
|
||||
def main(argv=None, db_path=None, rows=None):
    """Insert the synthetic recovery rows into the pipeline `prs` table.

    Args:
        argv: Command-line arguments without the program name. When None,
            argparse falls back to ``sys.argv[1:]``. Supports ``--dry-run``,
            which reports what would be inserted without writing.
        db_path: Path of the SQLite database. Defaults to the module-level
            ``DB_PATH``.
        rows: Iterable of row dicts to insert; each must provide every column
            named in the INSERT below. Defaults to the module-level
            ``RECOVERY_PRS``.

    Raises:
        SystemExit: status 1 when the database file does not exist; status 2
            when a row that is not tagged ``synthetic_recovery:`` in
            ``last_error`` already has a number in the synthetic range.
    """
    synthetic_floor = 900000

    parser = argparse.ArgumentParser()
    parser.add_argument("--dry-run", action="store_true")
    args = parser.parse_args(argv)

    db_path = DB_PATH if db_path is None else db_path
    rows = RECOVERY_PRS if rows is None else rows

    # sqlite3.connect() would silently create an empty database, so refuse to
    # continue when the file is missing.
    if not Path(db_path).exists():
        print(f"ERROR: DB not found at {db_path}", file=sys.stderr)
        sys.exit(1)

    conn = sqlite3.connect(db_path, timeout=30)
    try:
        conn.row_factory = sqlite3.Row

        # Guard against colliding with real PRs. Rows written by this script
        # carry a last_error starting with "synthetic_recovery:", so every
        # other row counts as real. (Filtering on `number < 900000` instead
        # would make the check below impossible to trigger.)
        max_real = conn.execute(
            "SELECT MAX(number) FROM prs "
            "WHERE COALESCE(last_error, '') NOT LIKE 'synthetic_recovery:%'"
        ).fetchone()[0] or 0
        print(f"Max real Forgejo PR number: {max_real}")
        if max_real >= synthetic_floor:
            print("ERROR: real PR numbers reached synthetic range (>= 900000). Aborting.",
                  file=sys.stderr)
            sys.exit(2)

        dry_prefix = "(dry-run) " if args.dry_run else ""
        inserted = 0
        skipped = 0
        for row in rows:
            # Idempotency check: skip when either key is already present.
            existing = conn.execute(
                "SELECT number FROM prs WHERE number = ? OR github_pr = ?",
                (row["number"], row["github_pr"]),
            ).fetchone()
            if existing:
                print(f" PR #{row['number']} (github_pr={row['github_pr']}): already exists — skip")
                skipped += 1
                continue

            print(f" {dry_prefix}INSERT synthetic PR #{row['number']} "
                  f"(github_pr={row['github_pr']}, submitted_by={row['submitted_by']}, "
                  f"merged_at={row['merged_at']})")
            if not args.dry_run:
                # Named placeholders bind straight from the row dict, so the
                # column list and the values cannot drift out of order.
                conn.execute(
                    """INSERT INTO prs (
                        number, github_pr, branch, status, domain, commit_type, tier,
                        leo_verdict, domain_verdict, submitted_by, source_channel,
                        description, merged_at, created_at, last_error
                    ) VALUES (
                        :number, :github_pr, :branch, :status, :domain, :commit_type, :tier,
                        :leo_verdict, :domain_verdict, :submitted_by, :source_channel,
                        :description, :merged_at, :created_at, :last_error
                    )""",
                    row,
                )
            # NOTE(review): counted in dry-run mode too, which matches the
            # `not args.dry_run and inserted` check below — confirm against
            # the original indentation.
            inserted += 1

        if not args.dry_run:
            conn.commit()
    finally:
        # Always release the database handle, including on sys.exit() above;
        # closing without commit discards any uncommitted inserts.
        conn.close()

    print(f"\nInserted {inserted}, skipped {skipped}")
    if not args.dry_run and inserted:
        print("\nNext step: re-run backfill-events.py to attach originator events")
        print(" python3 ops/backfill-events.py")


if __name__ == "__main__":
    main()
|
||||
Loading…
Reference in a new issue