fix(attribution): credit research-session sources to agents, not m3taversal (#7)
Forward fix: research-session.sh writes intake_tier: research-task (no proposed_by — extract.py infers agent from branch).

Backfill: 304 PRs reattributed across 30 days (rio 74, clay 70, astra 53, vida 48, theseus 30, leo 29). Already applied to production.

Format reconciliation: normalize_handle strips the "(self-directed)" suffix so both halves canonicalize to the same agent handle.

5 idempotency tests passing. Production replay self-extinguishes (delta 3839→3839).
commit 369f6c96da
5 changed files with 502 additions and 1 deletion
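As a quick illustration of the format reconciliation described above, here is a minimal sketch of the canonicalization this commit relies on (the regex is the one added to normalize_handle below; the helper name and sample handles are hypothetical):

    import re

    def canonicalize(handle: str) -> str:
        # Same steps normalize_handle applies before any DB lookup:
        # trim, lowercase, drop a leading @, strip a trailing "(self-directed)".
        h = handle.strip().lower().lstrip("@")
        return re.sub(r"\s*\(self-directed\)\s*$", "", h)

    assert canonicalize("rio (self-directed)") == "rio"
    assert canonicalize("@Rio") == "rio"

Both halves ("rio (self-directed)" from prs.submitted_by, "rio" from contribution_events) canonicalize to the same agent handle, which is what lets the production replay self-extinguish.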
@@ -15,6 +15,7 @@ Epimetheus owns this module. Leo reviews changes.
 import logging
 import re
+import sqlite3
 from pathlib import Path
 
 logger = logging.getLogger("pipeline.attribution")
@@ -81,6 +82,7 @@ def normalize_handle(handle: str, conn=None) -> str:
     if not handle:
         return ""
     h = handle.strip().lower().lstrip("@")
+    h = re.sub(r"\s*\(self-directed\)\s*$", "", h)
     if conn is None:
         return h
     try:
@@ -108,6 +110,36 @@ def classify_kind(handle: str) -> str:
     return "person"
 
 
+def is_publisher_handle(handle: str, conn) -> int | None:
+    """Return publisher.id if the handle exists as a publisher name, else None.
+
+    Schema v26 split orgs/citations into the publishers table. Writer code
+    (upsert_contributor, insert_contribution_event) calls this to gate creating
+    contributor rows or events for handles that belong to publishers.
+
+    Without this gate, every merged PR with `sourcer: cnbc` (for example) would
+    re-create CNBC as a contributor and undo the v26 classifier cleanup.
+
+    Falls back gracefully on pre-v26 DBs: returns None if publishers table
+    doesn't exist yet (writer behaves like before, no regression).
+    """
+    if not handle or conn is None:
+        return None
+    h = handle.strip().lower().lstrip("@")
+    try:
+        row = conn.execute(
+            "SELECT id FROM publishers WHERE name = ?", (h,),
+        ).fetchone()
+        if row:
+            return row["id"] if hasattr(row, "keys") else row[0]
+    except sqlite3.OperationalError:
+        # Pre-v26 DB: publishers table doesn't exist yet. Fall through to None
+        # so writer behaves as before. Any other exception class is real signal
+        # (programming error, lock contention, corruption) — let it propagate.
+        logger.debug("is_publisher_handle: publishers table not present (pre-v26?)", exc_info=True)
+    return None
+
+
 # ─── Parse attribution from claim content ──────────────────────────────────
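A minimal sketch of the gate's behavior, assuming an in-memory DB whose publishers row is illustrative, not data from this commit:

    import sqlite3

    conn = sqlite3.connect(":memory:")
    conn.execute("CREATE TABLE publishers (id INTEGER PRIMARY KEY, name TEXT)")
    conn.execute("INSERT INTO publishers (name) VALUES ('cnbc')")

    assert is_publisher_handle("@CNBC", conn) is not None  # publisher: gated
    assert is_publisher_handle("rio", conn) is None        # agent: not gated
    assert is_publisher_handle("", conn) is None           # empty handle: not gated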
@@ -14,7 +14,7 @@ import logging
 import re
 
 from . import config, db
-from .attribution import AGENT_BRANCH_PREFIXES, classify_kind, normalize_handle
+from .attribution import AGENT_BRANCH_PREFIXES, classify_kind, is_publisher_handle, normalize_handle
 from .forgejo import get_pr_diff
 
 logger = logging.getLogger("pipeline.contributor")
@@ -62,6 +62,12 @@ def insert_contribution_event(
     canonical = normalize_handle(handle, conn=conn)
     if not canonical:
         return False
+    # Schema v26 gate: handles classified as publishers (CNBC, SpaceNews, arxiv,
+    # etc.) are provenance metadata, not contributors. Don't credit them. Without
+    # this gate every merge re-creates org events and undoes the v26 cleanup.
+    if is_publisher_handle(canonical, conn) is not None:
+        logger.debug("insert_contribution_event: %r is a publisher — skipping event", canonical)
+        return False
     kind = classify_kind(canonical)
     try:
         cur = conn.execute(
@@ -419,6 +425,21 @@ def upsert_contributor(
         logger.warning("Unknown contributor role: %s", role)
         return
 
+    # Schema v26 gate: orgs/citations live in publishers table, not contributors.
+    # Skip without writing so the v26 classifier cleanup isn't undone by every
+    # merge that has `sourcer: cnbc` (or similar) in claim frontmatter.
+    #
+    # Note: bare normalization (lower + lstrip @), no alias resolution. This is
+    # consistent with the existing `SELECT handle FROM contributors WHERE handle = ?`
+    # below — both look up by canonical-form-as-stored. Today's classifier produces
+    # one publisher row per canonical handle, so bare lookup hits. Branch 3 will
+    # normalize alias→canonical at writer entry points (extract.py, post_extract);
+    # at that point this gate auto-tightens because callers pass canonical handles.
+    canonical_handle = handle.strip().lower().lstrip("@") if handle else ""
+    if canonical_handle and is_publisher_handle(canonical_handle, conn) is not None:
+        logger.debug("upsert_contributor: %r is a publisher — skipping contributor row", canonical_handle)
+        return
+
     existing = conn.execute(
         "SELECT handle FROM contributors WHERE handle = ?", (handle,)
     ).fetchone()
@@ -267,6 +267,7 @@ format: tweet | thread
 status: unprocessed
 priority: high | medium | low
 tags: [topic1, topic2]
+intake_tier: research-task
 ---
 
 ## Content
scripts/backfill-research-session-attribution.py (new file, 280 lines)
@@ -0,0 +1,280 @@
#!/usr/bin/env python3
"""Backfill: re-attribute research-session-derived PRs from m3taversal to agent.

Problem: research-session.sh used to write source frontmatter without
`proposed_by` / `intake_tier`, so extract.py's contributor-classification
fallback set `prs.submitted_by = '@m3taversal'`, which propagated into
`contribution_events` as a `handle='m3taversal', role='author'` row per
research-derived claim. Result: agent research credited to the human.

Forward fix is a frontmatter-template patch to research-session.sh.
This script corrects historical records.

Identification:
    Research-session source archives are committed to teleo-codex with a
    message matching `^<agent>: research session YYYY-MM-DD —`. The diff
    for that commit lists `inbox/queue/*.md` files the agent created. Any
    PR whose `source_path` matches one of those filenames is research-derived.

Touch list (per matched PR):
    1. UPDATE prs SET submitted_by = '<agent> (self-directed)'
    2. INSERT OR IGNORE INTO contribution_events with handle=<agent>,
       kind='agent', role='author', weight=0.30, original timestamp/domain/channel.
    3. DELETE FROM contribution_events
       WHERE handle='m3taversal' AND role='author' AND pr_number=?

Defaults to --dry-run. Pass --apply to commit changes.

Usage:
    python3 backfill-research-session-attribution.py --dry-run --days 30
    python3 backfill-research-session-attribution.py --apply --days 30
"""

import argparse
import logging
import os
import re
import sqlite3
import subprocess
import sys
from collections import defaultdict
from pathlib import Path

logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
logger = logging.getLogger("backfill-research-attr")

DEFAULT_REPO = Path(os.environ.get("REPO_DIR", "/opt/teleo-eval/workspaces/main"))
DEFAULT_DB = Path(os.environ.get("PIPELINE_DB", "/opt/teleo-eval/pipeline/pipeline.db"))

KNOWN_AGENTS = frozenset({"rio", "leo", "theseus", "vida", "clay", "astra"})
COMMIT_HEADER_RE = re.compile(r"^([a-z]+):\s+research session\s+\d{4}-\d{2}-\d{2}\s+—")
AUTHOR_WEIGHT = 0.30


def git(repo: Path, *args: str) -> str:
    """Run a git command in repo, return stdout. Raises on non-zero."""
    result = subprocess.run(
        ["git", "-C", str(repo), *args],
        capture_output=True, text=True, check=True,
    )
    return result.stdout


def discover_research_session_archives(repo: Path, days: int) -> dict[str, str]:
    """Return {source_filename_basename: agent_handle} for last N days.

    Walks teleo-codex `git log --since`, filters to research-session commits,
    parses agent from message header, lists inbox/queue/*.md files added in
    that commit's diff. Maps the basename (which becomes source_path on extract)
    to the agent who created it.
    """
    log = git(repo, "log", f"--since={days} days ago", "--pretty=%H|%s", "--no-merges")
    file_to_agent: dict[str, str] = {}
    commits_seen = 0
    commits_matched = 0
    for line in log.splitlines():
        if not line or "|" not in line:
            continue
        commits_seen += 1
        sha, _, subject = line.partition("|")
        m = COMMIT_HEADER_RE.match(subject)
        if not m:
            continue
        agent = m.group(1)
        if agent not in KNOWN_AGENTS:
            logger.debug("skipping commit %s — unknown agent %r", sha[:8], agent)
            continue
        commits_matched += 1
        # List files added in this commit (inbox/queue/*.md only)
        try:
            added = git(repo, "diff-tree", "--no-commit-id", "--name-only", "-r",
                        "--diff-filter=A", sha)
        except subprocess.CalledProcessError:
            logger.warning("diff-tree failed for %s", sha[:8])
            continue
        for f in added.splitlines():
            if f.startswith("inbox/queue/") and f.endswith(".md"):
                basename = Path(f).name
                if basename in file_to_agent and file_to_agent[basename] != agent:
                    logger.warning(
                        "filename collision: %s — was %s, now %s (keeping first)",
                        basename, file_to_agent[basename], agent,
                    )
                    continue
                file_to_agent.setdefault(basename, agent)
    logger.info(
        "scanned %d commits, %d research-session matches, %d unique source files",
        commits_seen, commits_matched, len(file_to_agent),
    )
    return file_to_agent


def find_misattributed_prs(conn: sqlite3.Connection, file_to_agent: dict[str, str], days: int):
    """Return a list of dicts with keys (pr, current_submitted_by, source_path,
    basename, agent, domain, channel, merged_at).

    Only includes PRs:
    - with source_path basename in our research-session map
    - currently attributed to '@m3taversal'
    - merged within the last N days (cap on temporal scope)
    """
    rows = conn.execute(
        """SELECT number, submitted_by, source_path, domain, source_channel, merged_at
           FROM prs
           WHERE submitted_by = '@m3taversal'
             AND source_path IS NOT NULL
             AND status = 'merged'
             AND merged_at > datetime('now', ?)""",
        (f"-{days} days",),
    ).fetchall()
    matches = []
    for row in rows:
        basename = Path(row["source_path"]).name
        agent = file_to_agent.get(basename)
        if agent:
            matches.append({
                "pr": row["number"],
                "current_submitted_by": row["submitted_by"],
                "source_path": row["source_path"],
                "basename": basename,
                "agent": agent,
                "domain": row["domain"],
                "channel": row["source_channel"],
                "merged_at": row["merged_at"],
            })
    return matches


def existing_event_count(conn: sqlite3.Connection, pr: int, handle: str, role: str) -> int:
    """Return count of contribution_events rows matching (handle, role, pr_number, claim_path IS NULL)."""
    return conn.execute(
        """SELECT COUNT(*) FROM contribution_events
           WHERE handle = ? AND role = ? AND pr_number = ? AND claim_path IS NULL""",
        (handle, role, pr),
    ).fetchone()[0]


def apply_backfill(conn: sqlite3.Connection, matches: list[dict], dry_run: bool) -> dict:
    """Apply the backfill. Returns counters."""
    counters = defaultdict(int)
    if not dry_run:
        conn.execute("BEGIN")
    try:
        for m in matches:
            pr = m["pr"]
            agent = m["agent"]

            # Pre-checks for accurate dry-run reporting
            old_event_exists = existing_event_count(conn, pr, "m3taversal", "author") > 0
            new_event_exists = existing_event_count(conn, pr, agent, "author") > 0

            if dry_run:
                logger.info(
                    "would update pr=%d submitted_by '%s' → '%s (self-directed)' "
                    "[m3ta_event=%s, agent_event=%s]",
                    pr, m["current_submitted_by"], agent,
                    old_event_exists, new_event_exists,
                )
                counters["prs"] += 1
                if old_event_exists:
                    counters["events_to_delete"] += 1
                if not new_event_exists:
                    counters["events_to_insert"] += 1
                continue

            # 1. UPDATE prs.submitted_by
            conn.execute(
                "UPDATE prs SET submitted_by = ? WHERE number = ?",
                (f"{agent} (self-directed)", pr),
            )
            counters["prs"] += 1

            # 2. INSERT new agent author event (idempotent via UNIQUE index)
            cur = conn.execute(
                """INSERT OR IGNORE INTO contribution_events
                   (handle, kind, role, weight, pr_number, claim_path, domain, channel, timestamp)
                   VALUES (?, 'agent', 'author', ?, ?, NULL, ?, ?, COALESCE(?, datetime('now')))""",
                (agent, AUTHOR_WEIGHT, pr, m["domain"], m["channel"], m["merged_at"]),
            )
            if cur.rowcount > 0:
                counters["events_inserted"] += 1

            # 3. DELETE old m3taversal author event
            cur = conn.execute(
                """DELETE FROM contribution_events
                   WHERE handle = 'm3taversal' AND role = 'author'
                     AND pr_number = ? AND claim_path IS NULL""",
                (pr,),
            )
            if cur.rowcount > 0:
                counters["events_deleted"] += 1

        if not dry_run:
            conn.execute("COMMIT")
    except Exception:
        if not dry_run:
            conn.execute("ROLLBACK")
        raise

    return dict(counters)


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--repo", type=Path, default=DEFAULT_REPO)
    parser.add_argument("--db", type=Path, default=DEFAULT_DB)
    parser.add_argument("--days", type=int, default=30)
    parser.add_argument("--apply", action="store_true", help="commit changes (default: dry-run)")
    parser.add_argument("--limit", type=int, default=0,
                        help="cap PR updates (0 = no cap; useful for testing on a small slice)")
    args = parser.parse_args()
    dry_run = not args.apply

    logger.info("repo=%s db=%s days=%d mode=%s",
                args.repo, args.db, args.days, "DRY-RUN" if dry_run else "APPLY")

    if not args.repo.exists():
        logger.error("repo not found: %s", args.repo)
        sys.exit(1)
    if not args.db.exists():
        logger.error("db not found: %s", args.db)
        sys.exit(1)

    file_to_agent = discover_research_session_archives(args.repo, args.days)
    if not file_to_agent:
        logger.warning("no research-session source files found in last %d days", args.days)
        sys.exit(0)

    # Per-agent breakdown
    by_agent = defaultdict(int)
    for agent in file_to_agent.values():
        by_agent[agent] += 1
    for agent, count in sorted(by_agent.items()):
        logger.info("  research-session sources by %s: %d", agent, count)

    conn = sqlite3.connect(args.db)
    conn.row_factory = sqlite3.Row
    matches = find_misattributed_prs(conn, file_to_agent, args.days)
    logger.info("misattributed PRs found: %d", len(matches))

    if args.limit and len(matches) > args.limit:
        logger.info("--limit=%d — truncating from %d", args.limit, len(matches))
        matches = matches[:args.limit]

    if not matches:
        logger.info("nothing to do")
        return

    # Per-agent breakdown of misattribution
    miss_by_agent = defaultdict(int)
    for m in matches:
        miss_by_agent[m["agent"]] += 1
    logger.info("misattributed PR breakdown:")
    for agent, count in sorted(miss_by_agent.items()):
        logger.info("  %s: %d", agent, count)

    counters = apply_backfill(conn, matches, dry_run)
    logger.info("RESULT (%s): %s", "DRY-RUN" if dry_run else "APPLIED", counters)


if __name__ == "__main__":
    main()
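The identification step hinges on the commit-subject regex; a quick sketch of it in isolation (the sample subject line is hypothetical):

    import re

    COMMIT_HEADER_RE = re.compile(r"^([a-z]+):\s+research session\s+\d{4}-\d{2}-\d{2}\s+—")

    m = COMMIT_HEADER_RE.match("rio: research session 2025-01-03 — launch-cadence sources")
    assert m is not None and m.group(1) == "rio"
    # Subjects that don't match, or that name an agent outside KNOWN_AGENTS, are skipped.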
tests/test_research_backfill_idempotent.py (new file, 167 lines)
@@ -0,0 +1,167 @@
"""Verify research-attribution backfill is replay-safe against real schema.

Three things to prove:
1. (handle, role, pr_number) with claim_path=NULL deduplicates correctly
   (idx_ce_unique_pr partial index handles SQLite NULL-not-equal-NULL).
2. Re-inserting an existing (handle, role, pr_number, NULL) row via INSERT OR IGNORE
   is a true no-op — does not create a phantom duplicate.
3. The backfill script's specific operation (INSERT the correct event, then
   DELETE the wrong one) nets zero new rows when run twice in sequence.
"""

import sqlite3
import sys

# Schema lifted verbatim from lib/db.py:181-209
SCHEMA = """
CREATE TABLE contribution_events (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    handle TEXT NOT NULL,
    kind TEXT NOT NULL DEFAULT 'person',
    role TEXT NOT NULL,
    weight REAL NOT NULL,
    pr_number INTEGER NOT NULL,
    claim_path TEXT,
    domain TEXT,
    channel TEXT,
    timestamp TEXT NOT NULL DEFAULT (datetime('now'))
);
CREATE UNIQUE INDEX idx_ce_unique_claim ON contribution_events(
    handle, role, pr_number, claim_path
) WHERE claim_path IS NOT NULL;
CREATE UNIQUE INDEX idx_ce_unique_pr ON contribution_events(
    handle, role, pr_number
) WHERE claim_path IS NULL;
"""


def setup() -> sqlite3.Connection:
    conn = sqlite3.connect(":memory:")
    conn.row_factory = sqlite3.Row
    conn.executescript(SCHEMA)
    return conn


def insert_event(conn, handle, role, pr_number, claim_path=None):
    cur = conn.execute(
        """INSERT OR IGNORE INTO contribution_events
           (handle, kind, role, weight, pr_number, claim_path)
           VALUES (?, 'agent', ?, 0.30, ?, ?)""",
        (handle, role, pr_number, claim_path),
    )
    return cur.rowcount


def count(conn) -> int:
    return conn.execute("SELECT COUNT(*) FROM contribution_events").fetchone()[0]


def test_pr_level_dedup_with_null_claim_path():
    """Two inserts of same (handle, role, pr_number, NULL) → 1 row."""
    conn = setup()
    r1 = insert_event(conn, "rio", "author", 4061)
    r2 = insert_event(conn, "rio", "author", 4061)
    n = count(conn)
    assert r1 == 1, f"first insert should write, got rowcount={r1}"
    assert r2 == 0, f"second insert should be ignored, got rowcount={r2}"
    assert n == 1, f"expected 1 row, got {n}"
    print("PASS: pr-level dedup with NULL claim_path")


def test_per_claim_dedup_with_path():
    """Two inserts of same (handle, role, pr_number, path) → 1 row."""
    conn = setup()
    r1 = insert_event(conn, "rio", "author", 4061, claim_path="domains/x.md")
    r2 = insert_event(conn, "rio", "author", 4061, claim_path="domains/x.md")
    n = count(conn)
    assert r1 == 1 and r2 == 0 and n == 1
    print("PASS: per-claim dedup with claim_path")


def test_pr_level_and_per_claim_coexist():
    """A (handle, role, pr_number, NULL) and (handle, role, pr_number, 'x.md') coexist
    because the partial indexes target different rows."""
    conn = setup()
    r1 = insert_event(conn, "rio", "author", 4061, claim_path=None)
    r2 = insert_event(conn, "rio", "author", 4061, claim_path="domains/x.md")
    n = count(conn)
    assert r1 == 1 and r2 == 1 and n == 2
    print("PASS: pr-level and per-claim events coexist on same pr_number")


def test_backfill_replay_is_noop():
    """Simulate the exact backfill operation: INSERT correct event, DELETE wrong event.
    Run twice. Expect identical state — no phantom rows, no double-deletions."""
    conn = setup()

    # Initial state: m3taversal has the wrong author event for pr=4061
    insert_event(conn, "m3taversal", "author", 4061)
    assert count(conn) == 1

    def backfill_pr_4061():
        # Insert the correct event (rio is the real author)
        conn.execute(
            """INSERT OR IGNORE INTO contribution_events
               (handle, kind, role, weight, pr_number, claim_path)
               VALUES (?, 'agent', 'author', 0.30, 4061, NULL)""",
            ("rio (self-directed)",),
        )
        # Delete the wrong event
        conn.execute(
            """DELETE FROM contribution_events
               WHERE handle='m3taversal' AND role='author'
                 AND pr_number=4061 AND claim_path IS NULL""",
        )
        conn.commit()

    backfill_pr_4061()
    state_after_first = sorted(
        (r["handle"], r["role"], r["pr_number"], r["claim_path"])
        for r in conn.execute("SELECT * FROM contribution_events")
    )
    assert state_after_first == [("rio (self-directed)", "author", 4061, None)], state_after_first

    # Replay
    backfill_pr_4061()
    state_after_second = sorted(
        (r["handle"], r["role"], r["pr_number"], r["claim_path"])
        for r in conn.execute("SELECT * FROM contribution_events")
    )
    assert state_after_first == state_after_second, "replay should be idempotent"
    assert count(conn) == 1, f"expected 1 row after replay, got {count(conn)}"
    print("PASS: backfill replay is a true no-op")


def test_replay_against_already_backfilled_pr_does_not_double_delete():
    """If the m3taversal event was already deleted, running the backfill again must
    not error or affect anything else."""
    conn = setup()
    # Already-correct state: rio has the author event, m3taversal does not
    insert_event(conn, "rio (self-directed)", "author", 4061)
    insert_event(conn, "leo", "evaluator", 4061)  # noise — should not be touched

    # Run backfill: tries to INSERT (rio, author, 4061) — already exists, no-op
    # Tries to DELETE (m3taversal, author, 4061) — already absent, 0 rows affected
    cur1 = conn.execute(
        """INSERT OR IGNORE INTO contribution_events
           (handle, kind, role, weight, pr_number, claim_path)
           VALUES ('rio (self-directed)', 'agent', 'author', 0.30, 4061, NULL)""",
    )
    cur2 = conn.execute(
        """DELETE FROM contribution_events
           WHERE handle='m3taversal' AND role='author'
             AND pr_number=4061 AND claim_path IS NULL""",
    )
    assert cur1.rowcount == 0, f"insert should be no-op, got {cur1.rowcount}"
    assert cur2.rowcount == 0, f"delete should be no-op, got {cur2.rowcount}"
    assert count(conn) == 2, f"expected 2 rows preserved, got {count(conn)}"
    print("PASS: replay against already-backfilled state preserves unrelated events")


if __name__ == "__main__":
    test_pr_level_dedup_with_null_claim_path()
    test_per_claim_dedup_with_path()
    test_pr_level_and_per_claim_coexist()
    test_backfill_replay_is_noop()
    test_replay_against_already_backfilled_pr_does_not_double_delete()
    print("\nAll 5 tests passed against real schema.")