teleo-infrastructure/scripts/backfill-synthetic-recovery-prs.py
m3taversal 0f2b153c92
Some checks are pending
CI / lint-and-test (push) Waiting to run
fix(backfill): Ganymede review — fix tautological guard + origin='human'
Addresses two findings in commit 762fd42 review:

1. BUG: guard query was tautological. `SELECT MAX(number) FROM prs WHERE
   number < 900000` filters out exactly what the `>= 900000` check tests.
   Replaced with a direct check for unexpected rows in the synthetic range
   (excluding our known 900068/900088).

2. WARNING: origin defaults to 'pipeline' via schema default. lib/merge.py
   convention is origin='human' for external contributors. Synthetic rows
   now set origin='human', priority='high' — matches discover_external_prs
   for real GitHub PRs. Prevents Phase B origin-based filtering from
   misclassifying Alex/Cameron as machine-authored.

Also flagged in review: credit projection was optimistic. Author events are
PR-level (not per-claim), so Alex gets 1×0.30 author credit, not 6. Same
for Cameron. Per-claim originator credit goes to the 7 frontmatter sourcers
where applicable. Not a code change — expectation reset for Cory.
2026-04-24 16:49:12 +01:00

148 lines
6 KiB
Python

#!/usr/bin/env python3
"""Reconstruct synthetic `prs` rows for historical GitHub PRs lost pre-mirror-wiring.
Two PRs merged on GitHub before our sync-mirror.sh tracked `github_pr`:
- GitHub PR #68: alexastrum — 6 claims, merged 2026-03-09 via GitHub squash,
recovered to Forgejo via commit dba00a79 (Apr 16, after mirror erased files)
- GitHub PR #88: Cameron-S1 — 1 claim, recovered via commit da64f805
The recovery commits wrote the files directly to main, so our `prs` table has
no row to attach originator events to — the backfill-events.py strategies all
return NULL. We reconstruct one synthetic `prs` row per historical GitHub PR so
the events pipeline (and `github_pr` strategy in backfill-events) can credit
Alex and Cameron properly.
Numbers 900000+ are clearly synthetic and won't collide with real Forgejo PRs.
Idempotent via INSERT OR IGNORE.
Usage:
python3 scripts/backfill-synthetic-recovery-prs.py --dry-run
python3 scripts/backfill-synthetic-recovery-prs.py
"""
import argparse
import os
import sqlite3
import sys
from pathlib import Path
DB_PATH = os.environ.get("PIPELINE_DB", "/opt/teleo-eval/pipeline/pipeline.db")
# Historical GitHub PRs recovered via direct-to-main commits.
# Original GitHub merge dates come from the recovery commit messages.
RECOVERY_PRS = [
{
"number": 900068,
"github_pr": 68,
"branch": "gh-pr-68",
"status": "merged",
"domain": "ai-alignment",
"commit_type": "knowledge",
"tier": "STANDARD",
"leo_verdict": "approve",
"domain_verdict": "approve",
"submitted_by": "alexastrum",
"source_channel": "github",
# origin='human' matches lib/merge.py convention for external contributors
# (default is 'pipeline' which misclassifies us as machine-authored).
"origin": "human",
"priority": "high",
"description": "Multi-agent git workflows production maturity | Cryptographic agent trust ratings | Defense in depth for AI agent oversight | Deterministic policy engines below LLM layer | Knowledge validation four-layer architecture | Structurally separating proposer and reviewer agents",
"merged_at": "2026-03-09 00:00:00",
"created_at": "2026-03-08 00:00:00",
"last_error": "synthetic_recovery: GitHub PR #68 pre-mirror-wiring reconstruction (commit dba00a79)",
},
{
"number": 900088,
"github_pr": 88,
"branch": "gh-pr-88",
"status": "merged",
"domain": "ai-alignment",
"commit_type": "knowledge",
"tier": "STANDARD",
"leo_verdict": "approve",
"domain_verdict": "approve",
"submitted_by": "cameron-s1",
"source_channel": "github",
"origin": "human",
"priority": "high",
"description": "Orthogonality is an artefact of specification architectures not a property of intelligence itself",
"merged_at": "2026-04-01 00:00:00",
"created_at": "2026-04-01 00:00:00",
"last_error": "synthetic_recovery: GitHub PR #88 pre-mirror-wiring reconstruction (commit da64f805)",
},
]
def main():
parser = argparse.ArgumentParser()
parser.add_argument("--dry-run", action="store_true")
args = parser.parse_args()
if not Path(DB_PATH).exists():
print(f"ERROR: DB not found at {DB_PATH}", file=sys.stderr)
sys.exit(1)
conn = sqlite3.connect(DB_PATH, timeout=30)
conn.row_factory = sqlite3.Row
# Guard against synthetic-range colonization (Ganymede review): check for
# any row in the synthetic range that isn't one of ours. INSERT OR IGNORE on
# the specific numbers is the real collision defense; this is belt-and-suspenders.
max_real = conn.execute(
"SELECT MAX(number) FROM prs WHERE number < 900000"
).fetchone()[0] or 0
print(f"Max real Forgejo PR number: {max_real}")
synth_conflict = conn.execute(
"SELECT number FROM prs WHERE number >= 900000 AND number NOT IN (900068, 900088) LIMIT 1"
).fetchone()
if synth_conflict:
print(f"ERROR: PR #{synth_conflict[0]} already exists in synthetic range. "
f"Pick a new range before running.", file=sys.stderr)
sys.exit(2)
inserted = 0
skipped = 0
for row in RECOVERY_PRS:
existing = conn.execute(
"SELECT number FROM prs WHERE number = ? OR github_pr = ?",
(row["number"], row["github_pr"]),
).fetchone()
if existing:
print(f" PR #{row['number']} (github_pr={row['github_pr']}): already exists — skip")
skipped += 1
continue
print(f" {'(dry-run) ' if args.dry_run else ''}INSERT synthetic PR #{row['number']} "
f"(github_pr={row['github_pr']}, submitted_by={row['submitted_by']}, "
f"merged_at={row['merged_at']})")
if not args.dry_run:
conn.execute(
"""INSERT INTO prs (
number, github_pr, branch, status, domain, commit_type, tier,
leo_verdict, domain_verdict, submitted_by, source_channel,
origin, priority,
description, merged_at, created_at, last_error
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""",
(
row["number"], row["github_pr"], row["branch"], row["status"],
row["domain"], row["commit_type"], row["tier"],
row["leo_verdict"], row["domain_verdict"],
row["submitted_by"], row["source_channel"],
row["origin"], row["priority"],
row["description"], row["merged_at"], row["created_at"],
row["last_error"],
),
)
inserted += 1
if not args.dry_run:
conn.commit()
print(f"\nInserted {inserted}, skipped {skipped}")
if not args.dry_run and inserted:
print("\nNext step: re-run backfill-events.py to attach originator events")
print(" python3 ops/backfill-events.py")
if __name__ == "__main__":
main()