From 1d6b51527a439c892459e8cbf53e2c437218437c Mon Sep 17 00:00:00 2001 From: m3taversal Date: Fri, 24 Apr 2026 16:06:52 +0100 Subject: [PATCH] feat(backfill): 4-strategy PR recovery for originator events MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rewrite claim-level pass in backfill-events.py to recover the Forgejo PR that introduced each claim via a cascade of 4 strategies (reliability order), replacing the single title→description match that missed PRs with NULL description (Cameron #3377) and bare-subject extracts (Shaga's Leo research PR). ## Strategies 1. sourced_from frontmatter → prs.source_path stem match 2. git log first-add commit → subject pattern → prs.branch - ": extract claims from " → extract/ - ": research session YYYY-MM-DD" → /research- - ": (challenge|contrib|entity|synthesize)" → /* - "Recover X from GitHub PR #N" → prs.github_pr=N - "Extract N claims from X" (no prefix) → time-proximity on agent-owned branches within 24h 3. Current title_desc fallback for anything the above miss ## Dry-run projection (1,662 merged PRs) Before: Claims processed: 33 Originator events: 6 Breakdown: {no_pr_match: 1608, no_sourcer: 26, invalid_handle: 21, skip_self: 6} After: Claims processed: 505 (+472) Originator events: 126 (+120) Strategy hits: git_subject=412, sourced_from=88, git_time_proximity=5 Breakdown: {no_pr_match: 1095, no_sourcer: 67, invalid_handle: 359, skip_self: 20} ## Verified on real VPS data - @thesensatore claims: 3/5 resolve via git_time_proximity to leo/ PRs - Cameron-S1, alexastrum: remain None — their recovery commits (dba00a79, da64f805) bypassed the pipeline entirely, no Forgejo PR record exists. Requires synthetic prs rows — deferred to separate commit with its own Ganymede review (write operation, larger blast radius than this pure-read backfill change). 
def find_pr_for_claim(
    conn: sqlite3.Connection,
    repo: Path,
    md: Path,
) -> tuple[int | None, str]:
    """Recover the Forgejo PR number that introduced a claim file.

    Returns (pr_number, strategy) — strategy is one of:
        'sourced_from'       — frontmatter sourced_from matched prs.source_path
        'git_subject'        — first-add commit subject matched a branch pattern
        'github_pr'          — recovery commit referenced a GitHub PR number
                               present in prs.github_pr
        'git_time_proximity' — bare "Extract N claims" subject matched the
                               earliest agent-branch PR merged within 24h of
                               the commit
        'none'               — no strategy found a match

    Checks run in reliability order:
      1. sourced_from (explicit provenance, most reliable when present)
      2. git log first-add commit subject → branch patterns, then the
         github_pr recovery-commit check, then time-proximity (covers Leo
         research, Cameron challenges, Theseus contrib, erased-GitHub
         recoveries, and early bare-subject extracts)

    The caller's title→description fallback (labelled 'title_desc') handles
    anything this helper misses; that label is assigned by the caller and is
    never returned from here.
    """
    rel = str(md.relative_to(repo))

    # ── Strategy 1: sourced_from frontmatter → prs.source_path ──
    try:
        content = md.read_text(encoding="utf-8")
    except (FileNotFoundError, PermissionError, UnicodeDecodeError):
        # Missing/unreadable file: fall through to the git-log strategies,
        # which only need the path, not the contents.
        content = ""
    fm = parse_frontmatter(content) if content else None
    if fm:
        sourced = fm.get("sourced_from")
        # sourced_from may be a single path or a list of paths.
        candidate_paths: list[str] = []
        if isinstance(sourced, str) and sourced:
            candidate_paths.append(sourced)
        elif isinstance(sourced, list):
            candidate_paths.extend(s for s in sourced if isinstance(s, str))
        for sp in candidate_paths:
            stem = Path(sp).stem
            if not stem:
                continue
            row = conn.execute(
                """SELECT number FROM prs
                   WHERE source_path LIKE ? AND status='merged'
                   ORDER BY merged_at ASC LIMIT 1""",
                (f"%{stem}.md",),
            ).fetchone()
            if row:
                return row["number"], "sourced_from"

    # ── Strategy 2: git log first-add commit → subject pattern → prs.branch ──
    # Default log order is reverse-chronological; take the last block (oldest)
    # to get the original addition, not later rewrites.
    log_out = git(
        "log", "--diff-filter=A", "--follow",
        "--format=%H|||%s|||%b", "--", rel,
    )
    if log_out.strip():
        # Split on the ||| delimiter. Each commit produces 3 fields but %b can
        # contain blank lines — group lines into commit records by detecting
        # lines that start with a full 40-char SHA followed by the delimiter.
        blocks: list[tuple[str, str, str]] = []
        current: list[str] = []
        for line in log_out.splitlines():
            if re.match(r"^[a-f0-9]{40}\|\|\|", line):
                if current:
                    parts = "\n".join(current).split("|||", 2)
                    if len(parts) == 3:
                        blocks.append((parts[0], parts[1], parts[2]))
                current = [line]
            else:
                current.append(line)
        if current:
            parts = "\n".join(current).split("|||", 2)
            if len(parts) == 3:
                blocks.append((parts[0], parts[1], parts[2]))
        if blocks:
            # Oldest addition — git log defaults to reverse-chronological
            _oldest_sha, subject, body = blocks[-1]

            # Pattern: "<agent>: extract claims from <file>"
            m = re.match(r"^(\w+):\s*extract\s+claims\s+from\s+(\S+)", subject)
            if m:
                # BUGFIX: rstrip(".md") strips the *character set* {., m, d},
                # eating trailing m/d from slugs; removesuffix drops only the
                # literal ".md" extension. Trailing-dot strip (sentence
                # punctuation in subjects) is kept.
                slug = m.group(2).removesuffix(".md").rstrip(".")
                row = conn.execute(
                    """SELECT number FROM prs
                       WHERE branch LIKE ? AND status='merged'
                       ORDER BY merged_at ASC LIMIT 1""",
                    (f"extract/{slug}%",),
                ).fetchone()
                if row:
                    return row["number"], "git_subject"

            # Pattern: "<agent>: research session YYYY-MM-DD"
            m = re.match(r"^(\w+):\s*research\s+session\s+(\d{4}-\d{2}-\d{2})", subject)
            if m:
                agent = m.group(1).lower()
                date = m.group(2)
                row = conn.execute(
                    """SELECT number FROM prs
                       WHERE branch LIKE ? AND status='merged'
                       ORDER BY merged_at ASC LIMIT 1""",
                    (f"{agent}/research-{date}%",),
                ).fetchone()
                if row:
                    return row["number"], "git_subject"

            # Pattern: "<agent>: challenge" / contrib challenges / entity batches
            m = re.match(r"^(\w+):\s*(?:challenge|contrib|entity|synthesize)", subject)
            if m:
                agent = m.group(1).lower()
                row = conn.execute(
                    """SELECT number FROM prs
                       WHERE branch LIKE ? AND status='merged'
                       ORDER BY merged_at ASC LIMIT 1""",
                    (f"{agent}/%",),
                ).fetchone()
                if row:
                    return row["number"], "git_subject"

            # Recovery commits referencing erased GitHub PRs (Alex/Cameron).
            # Subject: "Recover contribution from GitHub PR #NN (...)".
            # Match only when a corresponding prs row exists with github_pr=NN —
            # otherwise the claims were direct-to-main without a Forgejo PR
            # record, which requires a synthetic PR row (follow-up, not in
            # this script's scope).
            gh_match = re.search(r"GitHub\s+PR\s+#(\d+)", subject + "\n" + body)
            if gh_match:
                gh_pr = int(gh_match.group(1))
                row = conn.execute(
                    "SELECT number FROM prs WHERE github_pr = ? AND status='merged' LIMIT 1",
                    (gh_pr,),
                ).fetchone()
                if row:
                    return row["number"], "github_pr"

            # Pattern: bare "Extract N claims from <title>" (no agent prefix).
            # Used in early research PRs like Shaga's claims at PR #2025.
            # Fall back to time-proximity: find the earliest agent-branch PR
            # merged within 24h AFTER this commit's date.
            m = re.match(r"^Extract\s+\d+\s+claims\s+from\b", subject)
            if m:
                # Get commit author date (strict ISO 8601 via %aI).
                date_out = git(
                    "log", "-1", "--format=%aI", _oldest_sha, timeout=10,
                )
                commit_date = date_out.strip() if date_out.strip() else None
                if commit_date:
                    # NOTE(review): assumes prs.merged_at is stored in an
                    # ISO-8601 form lexically comparable with %aI output —
                    # confirm against the schema before trusting these hits.
                    row = conn.execute(
                        """SELECT number FROM prs
                           WHERE status='merged'
                             AND merged_at >= ?
                             AND merged_at <= datetime(?, '+24 hours')
                             AND (branch LIKE 'leo/%' OR branch LIKE 'theseus/%'
                                  OR branch LIKE 'rio/%' OR branch LIKE 'astra/%'
                                  OR branch LIKE 'vida/%' OR branch LIKE 'clay/%')
                           ORDER BY merged_at ASC LIMIT 1""",
                        (commit_date, commit_date),
                    ).fetchone()
                    if row:
                        return row["number"], "git_time_proximity"

    return None, "none"
print("\n=== Claim-level originator pass ===") - # Build title → pr_number map from prs.description + # Build title → pr_number map from prs.description (strategy 3 fallback) title_to_pr: dict[str, int] = {} for r in conn.execute( "SELECT number, description FROM prs WHERE status='merged' AND description IS NOT NULL AND description != ''" @@ -368,6 +532,7 @@ def main(): title_to_pr[title.lower()] = r["number"] claim_counts = Counter() + strategy_counts = Counter() claim_count = 0 originator_count = 0 for md in sorted(repo.glob("domains/**/*.md")) + \ @@ -375,13 +540,19 @@ def main(): sorted(repo.glob("foundations/**/*.md")) + \ sorted(repo.glob("decisions/**/*.md")): rel = str(md.relative_to(repo)) - # Match via filename stem (with spaces and hyphens) against description titles stem = md.stem - # Multiple matching strategies - pr_number = title_to_pr.get(stem.lower()) + + # Strategies 1, 2, 4 via the helper (sourced_from, git_subject, github_pr). + pr_number, strategy = find_pr_for_claim(conn, repo, md) + + # Strategy 3 (fallback): title-match against prs.description. 
if not pr_number: - # Hyphenated slug → space variant - pr_number = title_to_pr.get(stem.replace("-", " ").lower()) + pr_number = title_to_pr.get(stem.lower()) + if not pr_number: + pr_number = title_to_pr.get(stem.replace("-", " ").lower()) + if pr_number: + strategy = "title_desc" + if not pr_number: claim_counts["no_pr_match"] += 1 continue @@ -392,6 +563,7 @@ def main(): continue claim_count += 1 + strategy_counts[strategy] += 1 # Look up author for this PR to skip self-credit pr_row = conn.execute( "SELECT submitted_by, branch, domain, source_channel, merged_at FROM prs WHERE number = ?", @@ -420,6 +592,7 @@ def main(): print(f" Claims processed: {claim_count}") print(f" Originator events emitted: {originator_count}") print(f" Breakdown: {dict(claim_counts)}") + print(f" Strategy hits: {dict(strategy_counts)}") att = counts[("originator", "attempt")] if args.dry_run: wi = counts[("originator", "would_insert")]