diff --git a/scripts/backfill-synthetic-recovery-prs.py b/scripts/backfill-synthetic-recovery-prs.py
new file mode 100644
index 0000000..2299f39
--- /dev/null
+++ b/scripts/backfill-synthetic-recovery-prs.py
@@ -0,0 +1,136 @@
+#!/usr/bin/env python3
+"""Reconstruct synthetic `prs` rows for historical GitHub PRs lost pre-mirror-wiring.
+
+Two PRs merged on GitHub before our sync-mirror.sh tracked `github_pr`:
+  - GitHub PR #68: alexastrum — 6 claims, merged 2026-03-09 via GitHub squash,
+    recovered to Forgejo via commit dba00a79 (Apr 16, after mirror erased files)
+  - GitHub PR #88: Cameron-S1 — 1 claim, recovered via commit da64f805
+
+The recovery commits wrote the files directly to main, so our `prs` table has
+no row to attach originator events to — the backfill-events.py strategies all
+return NULL. We reconstruct one synthetic `prs` row per historical GitHub PR so
+the events pipeline (and the `github_pr` strategy in backfill-events) can credit
+Alex and Cameron properly.
+
+Numbers 900000+ are clearly synthetic and won't collide with real Forgejo PRs.
+
+Idempotent: rows that already exist (matched by number or github_pr) are skipped.
+
+Usage:
+    python3 scripts/backfill-synthetic-recovery-prs.py --dry-run
+    python3 scripts/backfill-synthetic-recovery-prs.py
+"""
+import argparse
+import os
+import sqlite3
+import sys
+from pathlib import Path
+
+DB_PATH = os.environ.get("PIPELINE_DB", "/opt/teleo-eval/pipeline/pipeline.db")
+
+# Historical GitHub PRs recovered via direct-to-main commits.
+# Original GitHub merge dates come from the recovery commit messages.
+RECOVERY_PRS = [
+    {
+        "number": 900068,
+        "github_pr": 68,
+        "branch": "gh-pr-68",
+        "status": "merged",
+        "domain": "ai-alignment",
+        "commit_type": "knowledge",
+        "tier": "STANDARD",
+        "leo_verdict": "approve",
+        "domain_verdict": "approve",
+        "submitted_by": "alexastrum",
+        "source_channel": "github",
+        "description": "Multi-agent git workflows production maturity | Cryptographic agent trust ratings | Defense in depth for AI agent oversight | Deterministic policy engines below LLM layer | Knowledge validation four-layer architecture | Structurally separating proposer and reviewer agents",
+        "merged_at": "2026-03-09 00:00:00",
+        "created_at": "2026-03-08 00:00:00",
+        "last_error": "synthetic_recovery: GitHub PR #68 pre-mirror-wiring reconstruction (commit dba00a79)",
+    },
+    {
+        "number": 900088,
+        "github_pr": 88,
+        "branch": "gh-pr-88",
+        "status": "merged",
+        "domain": "ai-alignment",
+        "commit_type": "knowledge",
+        "tier": "STANDARD",
+        "leo_verdict": "approve",
+        "domain_verdict": "approve",
+        "submitted_by": "cameron-s1",
+        "source_channel": "github",
+        "description": "Orthogonality is an artefact of specification architectures not a property of intelligence itself",
+        "merged_at": "2026-04-01 00:00:00",
+        "created_at": "2026-04-01 00:00:00",
+        "last_error": "synthetic_recovery: GitHub PR #88 pre-mirror-wiring reconstruction (commit da64f805)",
+    },
+]
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--dry-run", action="store_true")
+    args = parser.parse_args()
+
+    if not Path(DB_PATH).exists():
+        print(f"ERROR: DB not found at {DB_PATH}", file=sys.stderr)
+        sys.exit(1)
+
+    conn = sqlite3.connect(DB_PATH, timeout=30)
+    conn.row_factory = sqlite3.Row
+
+    # Guard against colliding with real PRs: abort if real numbers reach the synthetic range.
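+    # Note: MAX(number) returns NULL on an empty table, hence the "or 0" fallback below.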
+    max_real = conn.execute(
+        "SELECT MAX(number) FROM prs WHERE number < 900000"
+    ).fetchone()[0] or 0
+    print(f"Max real Forgejo PR number: {max_real}")
+    if max_real >= 900000:
+        print("ERROR: real PR numbers reached synthetic range (>= 900000). Aborting.",
+              file=sys.stderr)
+        sys.exit(2)
+
+    inserted = 0
+    skipped = 0
+    for row in RECOVERY_PRS:
+        existing = conn.execute(
+            "SELECT number FROM prs WHERE number = ? OR github_pr = ?",
+            (row["number"], row["github_pr"]),
+        ).fetchone()
+        if existing:
+            print(f"  PR #{row['number']} (github_pr={row['github_pr']}): already exists — skip")
+            skipped += 1
+            continue
+        print(f"  {'(dry-run) ' if args.dry_run else ''}INSERT synthetic PR #{row['number']} "
+              f"(github_pr={row['github_pr']}, submitted_by={row['submitted_by']}, "
+              f"merged_at={row['merged_at']})")
+        if not args.dry_run:
+            conn.execute(
+                """INSERT INTO prs (
+                    number, github_pr, branch, status, domain, commit_type, tier,
+                    leo_verdict, domain_verdict, submitted_by, source_channel,
+                    description, merged_at, created_at, last_error
+                ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""",
+                (
+                    row["number"], row["github_pr"], row["branch"], row["status"],
+                    row["domain"], row["commit_type"], row["tier"],
+                    row["leo_verdict"], row["domain_verdict"],
+                    row["submitted_by"], row["source_channel"],
+                    row["description"], row["merged_at"], row["created_at"],
+                    row["last_error"],
+                ),
+            )
+        inserted += 1
+
+    if not args.dry_run:
+        conn.commit()
+
+    print(f"\n{'Would insert' if args.dry_run else 'Inserted'} {inserted}, skipped {skipped}")
+    if not args.dry_run and inserted:
+        print("\nNext step: re-run backfill-events.py to attach originator events")
+        print("    python3 ops/backfill-events.py")
+
+
+if __name__ == "__main__":
+    main()
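
A quick post-run sanity check (a sketch, not part of the patch; it assumes only
the default DB path and the columns the script itself writes):

    import sqlite3
    conn = sqlite3.connect("/opt/teleo-eval/pipeline/pipeline.db")
    for row in conn.execute(
        "SELECT number, github_pr, submitted_by, merged_at FROM prs "
        "WHERE number >= 900000 ORDER BY number"
    ):
        print(row)
    conn.close()

Expected: exactly two rows, (900068, 68, 'alexastrum', '2026-03-09 00:00:00')
and (900088, 88, 'cameron-s1', '2026-04-01 00:00:00'); a second run of the
backfill script should then report both as "already exists" and insert nothing.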