Some checks are pending
CI / lint-and-test (push) Waiting to run
Addresses two findings in commit 762fd42 review:
1. BUG: guard query was tautological. `SELECT MAX(number) FROM prs WHERE
number < 900000` filters out exactly what the `>= 900000` check tests.
Replaced with a direct check for unexpected rows in the synthetic range
(excluding our known 900068/900088).
2. WARNING: origin defaults to 'pipeline' via schema default. lib/merge.py
convention is origin='human' for external contributors. Synthetic rows
now set origin='human', priority='high' — matches discover_external_prs
for real GitHub PRs. Prevents Phase B origin-based filtering from
misclassifying Alex/Cameron as machine-authored.
Also flagged in review: credit projection was optimistic. Author events are
PR-level (not per-claim), so Alex gets 1×0.30 author credit, not 6. Same
for Cameron. Per-claim originator credit goes to the 7 frontmatter sourcers
where applicable. Not a code change — expectation reset for Cory.
148 lines
6 KiB
Python
148 lines
6 KiB
Python
#!/usr/bin/env python3
"""Reconstruct synthetic `prs` rows for historical GitHub PRs lost pre-mirror-wiring.

Two PRs merged on GitHub before our sync-mirror.sh tracked `github_pr`:

- GitHub PR #68: alexastrum — 6 claims, merged 2026-03-09 via GitHub squash,
  recovered to Forgejo via commit dba00a79 (Apr 16, after mirror erased files)
- GitHub PR #88: Cameron-S1 — 1 claim, recovered via commit da64f805

The recovery commits wrote the files directly to main, so our `prs` table has
no row to attach originator events to — the backfill-events.py strategies all
return NULL. We reconstruct one synthetic `prs` row per historical GitHub PR so
the events pipeline (and the `github_pr` strategy in backfill-events) can credit
Alex and Cameron properly.

Numbers 900000+ are clearly synthetic and won't collide with real Forgejo PRs.

Idempotent via INSERT OR IGNORE.

Usage:
    python3 scripts/backfill-synthetic-recovery-prs.py --dry-run
    python3 scripts/backfill-synthetic-recovery-prs.py
"""
|
|
import argparse
import os
import sqlite3
import sys
from contextlib import closing
from pathlib import Path
|
|
|
|
# Pipeline SQLite database; override via the PIPELINE_DB env var (e.g. for tests).
DB_PATH = os.environ.get("PIPELINE_DB", "/opt/teleo-eval/pipeline/pipeline.db")

# Historical GitHub PRs recovered via direct-to-main commits.
# Original GitHub merge dates come from the recovery commit messages.
# Each dict maps 1:1 onto columns of the `prs` table. `number` values live in
# the 900000+ synthetic range so they can never collide with real Forgejo PRs;
# `github_pr` carries the real GitHub-side PR number for the backfill-events
# `github_pr` strategy to match on.
RECOVERY_PRS = [
    {
        "number": 900068,       # synthetic Forgejo-side PR number
        "github_pr": 68,        # the real GitHub PR this row stands in for
        "branch": "gh-pr-68",
        "status": "merged",
        "domain": "ai-alignment",
        "commit_type": "knowledge",
        "tier": "STANDARD",
        "leo_verdict": "approve",
        "domain_verdict": "approve",
        "submitted_by": "alexastrum",
        "source_channel": "github",
        # origin='human' matches lib/merge.py convention for external contributors
        # (default is 'pipeline' which misclassifies us as machine-authored).
        "origin": "human",
        "priority": "high",
        "description": "Multi-agent git workflows production maturity | Cryptographic agent trust ratings | Defense in depth for AI agent oversight | Deterministic policy engines below LLM layer | Knowledge validation four-layer architecture | Structurally separating proposer and reviewer agents",
        # Dates are day-granular — the recovery commit message only records the day.
        "merged_at": "2026-03-09 00:00:00",
        "created_at": "2026-03-08 00:00:00",
        # last_error doubles as an audit breadcrumb pointing at the recovery commit.
        "last_error": "synthetic_recovery: GitHub PR #68 pre-mirror-wiring reconstruction (commit dba00a79)",
    },
    {
        "number": 900088,
        "github_pr": 88,
        "branch": "gh-pr-88",
        "status": "merged",
        "domain": "ai-alignment",
        "commit_type": "knowledge",
        "tier": "STANDARD",
        "leo_verdict": "approve",
        "domain_verdict": "approve",
        "submitted_by": "cameron-s1",
        "source_channel": "github",
        # Same origin/priority convention as the #68 row above.
        "origin": "human",
        "priority": "high",
        "description": "Orthogonality is an artefact of specification architectures not a property of intelligence itself",
        "merged_at": "2026-04-01 00:00:00",
        "created_at": "2026-04-01 00:00:00",
        "last_error": "synthetic_recovery: GitHub PR #88 pre-mirror-wiring reconstruction (commit da64f805)",
    },
]
|
|
|
|
|
|
def main():
    """Insert the synthetic recovery rows from RECOVERY_PRS into `prs`.

    Checks each entry against the database (by synthetic `number` or real
    `github_pr`) and inserts any missing rows. With --dry-run, reports what
    would be inserted without writing.

    Exits 1 if the DB file is missing, 2 if the synthetic number range
    (900000+) already contains rows we don't recognize.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--dry-run", action="store_true")
    args = parser.parse_args()

    if not Path(DB_PATH).exists():
        print(f"ERROR: DB not found at {DB_PATH}", file=sys.stderr)
        sys.exit(1)

    # closing() guarantees the connection is released even on the early
    # sys.exit(2) path below (SystemExit propagates through the with block).
    with closing(sqlite3.connect(DB_PATH, timeout=30)) as conn:
        conn.row_factory = sqlite3.Row

        # Guard against synthetic-range colonization (Ganymede review): check for
        # any row in the synthetic range that isn't one of ours. INSERT OR IGNORE on
        # the specific numbers is the real collision defense; this is belt-and-suspenders.
        max_real = conn.execute(
            "SELECT MAX(number) FROM prs WHERE number < 900000"
        ).fetchone()[0] or 0
        print(f"Max real Forgejo PR number: {max_real}")
        synth_conflict = conn.execute(
            "SELECT number FROM prs WHERE number >= 900000 AND number NOT IN (900068, 900088) LIMIT 1"
        ).fetchone()
        if synth_conflict:
            print(f"ERROR: PR #{synth_conflict[0]} already exists in synthetic range. "
                  f"Pick a new range before running.", file=sys.stderr)
            sys.exit(2)

        inserted = 0
        skipped = 0
        for row in RECOVERY_PRS:
            # Skip when either the synthetic number or the github_pr mapping
            # is already present — re-runs are reported, not re-inserted.
            existing = conn.execute(
                "SELECT number FROM prs WHERE number = ? OR github_pr = ?",
                (row["number"], row["github_pr"]),
            ).fetchone()
            if existing:
                print(f" PR #{row['number']} (github_pr={row['github_pr']}): already exists — skip")
                skipped += 1
                continue
            print(f" {'(dry-run) ' if args.dry_run else ''}INSERT synthetic PR #{row['number']} "
                  f"(github_pr={row['github_pr']}, submitted_by={row['submitted_by']}, "
                  f"merged_at={row['merged_at']})")
            if not args.dry_run:
                # OR IGNORE makes the insert itself idempotent, as the module
                # docstring promises. The SELECT above is only a pre-check; a
                # plain INSERT would raise IntegrityError if a concurrent run
                # slipped a row in between the check and the write.
                conn.execute(
                    """INSERT OR IGNORE INTO prs (
                        number, github_pr, branch, status, domain, commit_type, tier,
                        leo_verdict, domain_verdict, submitted_by, source_channel,
                        origin, priority,
                        description, merged_at, created_at, last_error
                    ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""",
                    (
                        row["number"], row["github_pr"], row["branch"], row["status"],
                        row["domain"], row["commit_type"], row["tier"],
                        row["leo_verdict"], row["domain_verdict"],
                        row["submitted_by"], row["source_channel"],
                        row["origin"], row["priority"],
                        row["description"], row["merged_at"], row["created_at"],
                        row["last_error"],
                    ),
                )
            inserted += 1

        if not args.dry_run:
            conn.commit()

        print(f"\nInserted {inserted}, skipped {skipped}")
        if not args.dry_run and inserted:
            print("\nNext step: re-run backfill-events.py to attach originator events")
            print(" python3 ops/backfill-events.py")


if __name__ == "__main__":
    main()
|