feat(backfill): 4-strategy PR recovery for originator events
Rewrite claim-level pass in backfill-events.py to recover the Forgejo PR that introduced each claim via a cascade of 4 strategies (reliability order), replacing the single title→description match that missed PRs with NULL description (Cameron #3377) and bare-subject extracts (Shaga's Leo research PR). ## Strategies 1. sourced_from frontmatter → prs.source_path stem match 2. git log first-add commit → subject pattern → prs.branch - "<agent>: extract claims from <slug>" → extract/<slug> - "<agent>: research session YYYY-MM-DD" → <agent>/research-<date> - "<agent>: (challenge|contrib|entity|synthesize)" → <agent>/* - "Recover X from GitHub PR #N" → prs.github_pr=N - "Extract N claims from X" (no prefix) → time-proximity on agent-owned branches within 24h 3. Current title_desc fallback for anything the above miss ## Dry-run projection (1,662 merged PRs) Before: Claims processed: 33 Originator events: 6 Breakdown: {no_pr_match: 1608, no_sourcer: 26, invalid_handle: 21, skip_self: 6} After: Claims processed: 505 (+472) Originator events: 126 (+120) Strategy hits: git_subject=412, sourced_from=88, git_time_proximity=5 Breakdown: {no_pr_match: 1095, no_sourcer: 67, invalid_handle: 359, skip_self: 20} ## Verified on real VPS data - @thesensatore claims: 3/5 resolve via git_time_proximity to leo/ PRs - Cameron-S1, alexastrum: remain None — their recovery commits (dba00a79, da64f805) bypassed the pipeline entirely, no Forgejo PR record exists. Requires synthetic prs rows — deferred to separate commit with its own Ganymede review (write operation, larger blast radius than this pure-read backfill change). ## Implementation - New find_pr_for_claim(conn, repo, md) helper returns (pr_number, strategy) - Claim-level pass uses it first, falls back to title_desc map - Strategy counter surfaced in summary output for operator visibility Idempotent — backfill re-runs skip duplicate events via the partial UNIQUE index on contribution_events. 
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
540ba97b9d
commit
1d6b51527a
1 changed files with 184 additions and 11 deletions
|
|
@ -199,6 +199,168 @@ def derive_author(conn: sqlite3.Connection, pr: dict) -> str | None:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def find_pr_for_claim(
    conn: sqlite3.Connection,
    repo: Path,
    md: Path,
) -> tuple[int | None, str]:
    """Recover the Forgejo PR number that introduced a claim file.

    Args:
        conn: open SQLite connection; rows must be name-addressable
              (row factory exposing column names).
        repo: repository root that ``md`` lives under.
        md:   path to the claim markdown file.

    Returns (pr_number, strategy) — strategy is one of:
      'sourced_from'       — frontmatter sourced_from matched prs.source_path
      'git_subject'        — git log first-add commit message matched a branch pattern
      'github_pr'          — recovery commit mentioned GitHub PR # → prs.github_pr
      'git_time_proximity' — bare "Extract N claims from ..." subject resolved
                             via merge-time proximity on agent-owned branches
      'none'               — no strategy found a match
    (The 'title_desc' fallback is applied by the caller, not here.)

    Order is chosen by reliability:
      1. sourced_from (explicit provenance, most reliable when present)
      2. git_subject (covers Leo research, Cameron challenges, Theseus contrib)
      3. github_pr (recovery commits referencing erased GitHub PRs)
      4. git_time_proximity (last resort when the subject has no agent prefix)
    """
    rel = str(md.relative_to(repo))

    # Strategy 1: sourced_from frontmatter → prs.source_path
    try:
        content = md.read_text(encoding="utf-8")
    except (FileNotFoundError, PermissionError, UnicodeDecodeError):
        content = ""
    fm = parse_frontmatter(content) if content else None
    if fm:
        sourced = fm.get("sourced_from")
        candidate_paths: list[str] = []
        if isinstance(sourced, str) and sourced:
            candidate_paths.append(sourced)
        elif isinstance(sourced, list):
            candidate_paths.extend(s for s in sourced if isinstance(s, str))
        for sp in candidate_paths:
            stem = Path(sp).stem
            if not stem:
                continue
            row = conn.execute(
                """SELECT number FROM prs
                   WHERE source_path LIKE ? AND status='merged'
                   ORDER BY merged_at ASC LIMIT 1""",
                (f"%{stem}.md",),
            ).fetchone()
            if row:
                return row["number"], "sourced_from"

    # Strategy 2: git log first-add commit → subject pattern → prs.branch
    # Default log order is reverse-chronological; take the last block (oldest)
    # to get the original addition, not later rewrites.
    log_out = git(
        "log", "--diff-filter=A", "--follow",
        "--format=%H|||%s|||%b", "--", rel,
    )
    if log_out.strip():
        # Split on the delimiter we chose. Each commit produces 3 fields but
        # %b can contain blank lines — group by lines that look like a SHA.
        blocks: list[tuple[str, str, str]] = []
        current: list[str] = []
        for line in log_out.splitlines():
            if re.match(r"^[a-f0-9]{40}\|\|\|", line):
                if current:
                    parts = "\n".join(current).split("|||", 2)
                    if len(parts) == 3:
                        blocks.append((parts[0], parts[1], parts[2]))
                current = [line]
            else:
                current.append(line)
        if current:
            parts = "\n".join(current).split("|||", 2)
            if len(parts) == 3:
                blocks.append((parts[0], parts[1], parts[2]))
        if blocks:
            # Oldest addition — git log defaults to reverse-chronological
            _oldest_sha, subject, body = blocks[-1]

            # Pattern: "<agent>: extract claims from <slug>"
            m = re.match(r"^(\w+):\s*extract\s+claims\s+from\s+(\S+)", subject)
            if m:
                # BUGFIX: rstrip(".md") strips any trailing run of the
                # characters '.', 'm', 'd' (char-set strip, not suffix strip),
                # mangling slugs that end in those letters (e.g. "paradigm"
                # → "paradig"). removesuffix() drops only a literal ".md".
                slug = m.group(2).removesuffix(".md").rstrip(".")
                row = conn.execute(
                    """SELECT number FROM prs
                       WHERE branch LIKE ? AND status='merged'
                       ORDER BY merged_at ASC LIMIT 1""",
                    (f"extract/{slug}%",),
                ).fetchone()
                if row:
                    return row["number"], "git_subject"

            # Pattern: "<agent>: research session <date>"
            m = re.match(r"^(\w+):\s*research\s+session\s+(\d{4}-\d{2}-\d{2})", subject)
            if m:
                agent = m.group(1).lower()
                date = m.group(2)
                row = conn.execute(
                    """SELECT number FROM prs
                       WHERE branch LIKE ? AND status='merged'
                       ORDER BY merged_at ASC LIMIT 1""",
                    (f"{agent}/research-{date}%",),
                ).fetchone()
                if row:
                    return row["number"], "git_subject"

            # Pattern: "<agent>: challenge" / contrib challenges / entity batches
            m = re.match(r"^(\w+):\s*(?:challenge|contrib|entity|synthesize)", subject)
            if m:
                agent = m.group(1).lower()
                row = conn.execute(
                    """SELECT number FROM prs
                       WHERE branch LIKE ? AND status='merged'
                       ORDER BY merged_at ASC LIMIT 1""",
                    (f"{agent}/%",),
                ).fetchone()
                if row:
                    return row["number"], "git_subject"

            # Recovery commits referencing erased GitHub PRs (Alex/Cameron).
            # Subject: "Recover <who> contribution from GitHub PR #NN (...)".
            # Match only when a corresponding prs row exists with github_pr=NN —
            # otherwise the claims were direct-to-main without a Forgejo PR
            # record, which requires a synthetic PR row (follow-up, not in
            # this script's scope).
            gh_match = re.search(r"GitHub\s+PR\s+#(\d+)", subject + "\n" + body)
            if gh_match:
                gh_pr = int(gh_match.group(1))
                row = conn.execute(
                    "SELECT number FROM prs WHERE github_pr = ? AND status='merged' LIMIT 1",
                    (gh_pr,),
                ).fetchone()
                if row:
                    return row["number"], "github_pr"

            # Pattern: bare "Extract N claims from <source-fragment>" (no
            # agent prefix). Used in early research PRs like Shaga's claims
            # at PR #2025. Fall back to time-proximity: find the earliest
            # agent-branch PR merged within 24h AFTER this commit's date.
            m = re.match(r"^Extract\s+\d+\s+claims\s+from\b", subject)
            if m:
                # Get commit author date (ISO 8601 so SQLite datetime() can
                # compare it against prs.merged_at).
                date_out = git(
                    "log", "-1", "--format=%aI", _oldest_sha, timeout=10,
                )
                commit_date = date_out.strip() if date_out.strip() else None
                if commit_date:
                    row = conn.execute(
                        """SELECT number FROM prs
                           WHERE status='merged'
                             AND merged_at >= ?
                             AND merged_at <= datetime(?, '+24 hours')
                             AND (branch LIKE 'leo/%' OR branch LIKE 'theseus/%'
                                  OR branch LIKE 'rio/%' OR branch LIKE 'astra/%'
                                  OR branch LIKE 'vida/%' OR branch LIKE 'clay/%')
                           ORDER BY merged_at ASC LIMIT 1""",
                        (commit_date, commit_date),
                    ).fetchone()
                    if row:
                        return row["number"], "git_time_proximity"

    return None, "none"
|
||||||
|
|
||||||
|
|
||||||
def emit(conn, counts, dry_run, handle, role, pr_number, claim_path, domain, channel, timestamp):
|
def emit(conn, counts, dry_run, handle, role, pr_number, claim_path, domain, channel, timestamp):
|
||||||
canonical = normalize_handle(conn, handle)
|
canonical = normalize_handle(conn, handle)
|
||||||
if not valid_handle(canonical):
|
if not valid_handle(canonical):
|
||||||
|
|
@ -349,13 +511,15 @@ def main():
|
||||||
print(f" {role:12s} attempted={att:5d} inserted={ins:5d} skipped_dup={skip:5d}")
|
print(f" {role:12s} attempted={att:5d} inserted={ins:5d} skipped_dup={skip:5d}")
|
||||||
|
|
||||||
# ── Per-claim originator pass ──
|
# ── Per-claim originator pass ──
|
||||||
# Separate pass: walk the current knowledge tree, parse sourcer frontmatter,
|
# Walk the knowledge tree, parse sourcer attribution, and attach each claim
|
||||||
# and attach each claim to the merging PR via a claim_path → pr_number map
|
# to its merging PR via find_pr_for_claim's multi-strategy recovery.
|
||||||
# built from prs.description (pipe-separated claim titles). Imperfect — some
|
# Apr 24 rewrite (Ganymede-approved): replaces the single-strategy
|
||||||
# PRs have NULL description or mismatched titles — but recovers the bulk of
|
# title→description match with four strategies in reliability order.
|
||||||
# historical originator credit.
|
# Previous script missed PRs with NULL description (Cameron #3377) and
|
||||||
|
# cross-context claims (Shaga's Leo research). Fallback title-match is
|
||||||
|
# preserved to recover anything the git-log path misses.
|
||||||
print("\n=== Claim-level originator pass ===")
|
print("\n=== Claim-level originator pass ===")
|
||||||
# Build title → pr_number map from prs.description
|
# Build title → pr_number map from prs.description (strategy 3 fallback)
|
||||||
title_to_pr: dict[str, int] = {}
|
title_to_pr: dict[str, int] = {}
|
||||||
for r in conn.execute(
|
for r in conn.execute(
|
||||||
"SELECT number, description FROM prs WHERE status='merged' AND description IS NOT NULL AND description != ''"
|
"SELECT number, description FROM prs WHERE status='merged' AND description IS NOT NULL AND description != ''"
|
||||||
|
|
@ -368,6 +532,7 @@ def main():
|
||||||
title_to_pr[title.lower()] = r["number"]
|
title_to_pr[title.lower()] = r["number"]
|
||||||
|
|
||||||
claim_counts = Counter()
|
claim_counts = Counter()
|
||||||
|
strategy_counts = Counter()
|
||||||
claim_count = 0
|
claim_count = 0
|
||||||
originator_count = 0
|
originator_count = 0
|
||||||
for md in sorted(repo.glob("domains/**/*.md")) + \
|
for md in sorted(repo.glob("domains/**/*.md")) + \
|
||||||
|
|
@ -375,13 +540,19 @@ def main():
|
||||||
sorted(repo.glob("foundations/**/*.md")) + \
|
sorted(repo.glob("foundations/**/*.md")) + \
|
||||||
sorted(repo.glob("decisions/**/*.md")):
|
sorted(repo.glob("decisions/**/*.md")):
|
||||||
rel = str(md.relative_to(repo))
|
rel = str(md.relative_to(repo))
|
||||||
# Match via filename stem (with spaces and hyphens) against description titles
|
|
||||||
stem = md.stem
|
stem = md.stem
|
||||||
# Multiple matching strategies
|
|
||||||
pr_number = title_to_pr.get(stem.lower())
|
# Strategies 1, 2, 4 via the helper (sourced_from, git_subject, github_pr).
|
||||||
|
pr_number, strategy = find_pr_for_claim(conn, repo, md)
|
||||||
|
|
||||||
|
# Strategy 3 (fallback): title-match against prs.description.
|
||||||
if not pr_number:
|
if not pr_number:
|
||||||
# Hyphenated slug → space variant
|
pr_number = title_to_pr.get(stem.lower())
|
||||||
pr_number = title_to_pr.get(stem.replace("-", " ").lower())
|
if not pr_number:
|
||||||
|
pr_number = title_to_pr.get(stem.replace("-", " ").lower())
|
||||||
|
if pr_number:
|
||||||
|
strategy = "title_desc"
|
||||||
|
|
||||||
if not pr_number:
|
if not pr_number:
|
||||||
claim_counts["no_pr_match"] += 1
|
claim_counts["no_pr_match"] += 1
|
||||||
continue
|
continue
|
||||||
|
|
@ -392,6 +563,7 @@ def main():
|
||||||
continue
|
continue
|
||||||
|
|
||||||
claim_count += 1
|
claim_count += 1
|
||||||
|
strategy_counts[strategy] += 1
|
||||||
# Look up author for this PR to skip self-credit
|
# Look up author for this PR to skip self-credit
|
||||||
pr_row = conn.execute(
|
pr_row = conn.execute(
|
||||||
"SELECT submitted_by, branch, domain, source_channel, merged_at FROM prs WHERE number = ?",
|
"SELECT submitted_by, branch, domain, source_channel, merged_at FROM prs WHERE number = ?",
|
||||||
|
|
@ -420,6 +592,7 @@ def main():
|
||||||
print(f" Claims processed: {claim_count}")
|
print(f" Claims processed: {claim_count}")
|
||||||
print(f" Originator events emitted: {originator_count}")
|
print(f" Originator events emitted: {originator_count}")
|
||||||
print(f" Breakdown: {dict(claim_counts)}")
|
print(f" Breakdown: {dict(claim_counts)}")
|
||||||
|
print(f" Strategy hits: {dict(strategy_counts)}")
|
||||||
att = counts[("originator", "attempt")]
|
att = counts[("originator", "attempt")]
|
||||||
if args.dry_run:
|
if args.dry_run:
|
||||||
wi = counts[("originator", "would_insert")]
|
wi = counts[("originator", "would_insert")]
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue