fix(attribution): canonicalize submitted_by at write time + historical normalizer
Some checks are pending
CI / lint-and-test (pull_request) Waiting to run
Some checks are pending
CI / lint-and-test (pull_request) Waiting to run
Companion / write-side fix to fix/activity-feed-canonical-handle. The activity-feed canonicalization was a read-side guard. The bug at the source is that extract.py and two backfill scripts write decorated strings (Vida (self-directed), pipeline (reweave), @m3taversal) into prs.submitted_by and sources.submitted_by. Downstream readers (lib.contributor.insert_contribution_event, scripts/scoring_digest, diagnostics/activity_feed_api) all strip the decorator on read — but anything that reads the column verbatim (like /api/activity-feed before the read-side fix) 404s on /contributors/{decorated-handle}. Stop writing the decorator. The self-directed signal is already carried by intake_tier == research-task plus the prs.agent column; the suffix is redundant string noise that costs us correctness at every consumer that forgets to strip. Changes: - lib/extract.py:690 — write canonical handle via attribution.normalize_handle. Direct elif for intake_tier == research-task now stores just agent_name. @m3taversal -> m3taversal. - diagnostics/backfill_submitted_by.py — same fix in two branches plus the reweave branch (pipeline (reweave) -> pipeline). - scripts/backfill-research-session-attribution.py — UPDATE prs sets agent handle alone, no suffix. Docstring + log line updated. - scripts/normalize-submitted-by.py (new) — one-time backfill that canonicalizes existing prs.submitted_by and sources.submitted_by rows. Strips trailing parenthetical decorators, lowercases, drops @. Defaults to dry-run; --apply to commit. Skips rows that would normalize to invalid handles (no garbage falls through silently). Dry-run against live pipeline.db: prs: 3008 rows need normalization (clean mappings, 0 invalid) sources: 730 rows need normalization (clean mappings, 0 invalid) Total: 3738 rows. All map to existing handle column values. After this lands + auto-deploys, the operator should run python3 scripts/normalize-submitted-by.py --apply once to clean historical rows. 
The read-side canonicalization in diagnostics/activity_feed_api.py (fix/activity-feed-canonical-handle) becomes redundant defense-in-depth instead of load-bearing. No KB writes.
This commit is contained in:
parent
6c66da33e4
commit
74bf0461e8
4 changed files with 149 additions and 20 deletions
|
|
@ -79,12 +79,16 @@ def main():
|
||||||
fm = sfm
|
fm = sfm
|
||||||
break
|
break
|
||||||
|
|
||||||
|
# `submitted_by` is stored as a canonical handle (lowercase, no @, no
|
||||||
|
# "(self-directed)" / "(reweave)" suffix). Read consumers normalize via
|
||||||
|
# attribution.normalize_handle, so writing decorated strings produces
|
||||||
|
# downstream 404s on /contributors/{handle} (livingip-web timeline).
|
||||||
if fm:
|
if fm:
|
||||||
proposed_by = fm.get("proposed_by")
|
proposed_by = fm.get("proposed_by")
|
||||||
intake_tier = fm.get("intake_tier")
|
intake_tier = fm.get("intake_tier")
|
||||||
|
|
||||||
if proposed_by:
|
if proposed_by:
|
||||||
contributor = proposed_by.strip().strip('"').strip("'")
|
contributor = proposed_by.strip().strip('"').strip("'").lower().lstrip("@")
|
||||||
elif intake_tier == "research-task":
|
elif intake_tier == "research-task":
|
||||||
# Derive agent from branch prefix
|
# Derive agent from branch prefix
|
||||||
prefix = branch.split("/", 1)[0] if "/" in branch else "unknown"
|
prefix = branch.split("/", 1)[0] if "/" in branch else "unknown"
|
||||||
|
|
@ -94,13 +98,12 @@ def main():
|
||||||
"clay": "clay", "astra": "astra", "leo": "leo",
|
"clay": "clay", "astra": "astra", "leo": "leo",
|
||||||
"reweave": "pipeline",
|
"reweave": "pipeline",
|
||||||
}
|
}
|
||||||
agent = agent_map.get(prefix, prefix)
|
contributor = agent_map.get(prefix, prefix)
|
||||||
contributor = f"{agent} (self-directed)"
|
|
||||||
elif intake_tier == "directed":
|
elif intake_tier == "directed":
|
||||||
contributor = "@m3taversal"
|
contributor = "m3taversal"
|
||||||
else:
|
else:
|
||||||
# Default: if source exists but no proposed_by, it was Cory's submission
|
# Default: if source exists but no proposed_by, operator submitted it.
|
||||||
contributor = "@m3taversal"
|
contributor = "m3taversal"
|
||||||
|
|
||||||
if contributor:
|
if contributor:
|
||||||
conn.execute(
|
conn.execute(
|
||||||
|
|
@ -114,19 +117,19 @@ def main():
|
||||||
agent = branch.split("/", 1)[0]
|
agent = branch.split("/", 1)[0]
|
||||||
conn.execute(
|
conn.execute(
|
||||||
"UPDATE prs SET submitted_by = ? WHERE number = ?",
|
"UPDATE prs SET submitted_by = ? WHERE number = ?",
|
||||||
(f"{agent} (self-directed)", pr["number"]),
|
(agent, pr["number"]),
|
||||||
)
|
)
|
||||||
updated += 1
|
updated += 1
|
||||||
elif branch.startswith("reweave/"):
|
elif branch.startswith("reweave/"):
|
||||||
conn.execute(
|
conn.execute(
|
||||||
"UPDATE prs SET submitted_by = 'pipeline (reweave)' WHERE number = ?",
|
"UPDATE prs SET submitted_by = 'pipeline' WHERE number = ?",
|
||||||
(pr["number"],),
|
(pr["number"],),
|
||||||
)
|
)
|
||||||
updated += 1
|
updated += 1
|
||||||
else:
|
else:
|
||||||
# Everything else (extract/, ingestion/, unknown) → Cory directed it
|
# Everything else (extract/, ingestion/, unknown) → operator directed it
|
||||||
conn.execute(
|
conn.execute(
|
||||||
"UPDATE prs SET submitted_by = '@m3taversal' WHERE number = ?",
|
"UPDATE prs SET submitted_by = 'm3taversal' WHERE number = ?",
|
||||||
(pr["number"],),
|
(pr["number"],),
|
||||||
)
|
)
|
||||||
updated += 1
|
updated += 1
|
||||||
|
|
|
||||||
|
|
@ -32,6 +32,7 @@ from datetime import date
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from . import config
|
from . import config
|
||||||
|
from .attribution import normalize_handle
|
||||||
from .costs import record_usage
|
from .costs import record_usage
|
||||||
from .db import classify_source_channel
|
from .db import classify_source_channel
|
||||||
from .domains import agent_for_domain
|
from .domains import agent_for_domain
|
||||||
|
|
@ -683,16 +684,25 @@ async def _extract_one_source(
|
||||||
logger.info("PR #%d created for %s (%d claims, %d entities)", pr_num, source_file, len(claim_files), len(entity_files))
|
logger.info("PR #%d created for %s (%d claims, %d entities)", pr_num, source_file, len(claim_files), len(entity_files))
|
||||||
|
|
||||||
# Store contributor attribution: who submitted this source?
|
# Store contributor attribution: who submitted this source?
|
||||||
# Priority: proposed_by field → intake_tier inference → "unknown"
|
# Priority: proposed_by field → intake_tier inference → operator default.
|
||||||
|
# NB: `submitted_by` is a CANONICAL HANDLE — lowercase, no @, no
|
||||||
|
# trailing "(self-directed)" decorator. The "self-directed" signal is
|
||||||
|
# already carried by intake_tier == "research-task" + the prs.agent
|
||||||
|
# column; persisting it here as a string suffix produced decorated
|
||||||
|
# values like "Vida (self-directed)" that broke /contributors/{handle}
|
||||||
|
# lookups downstream (livingip-web timeline → 404). Read consumers
|
||||||
|
# (lib/contributor.insert_contribution_event, scripts/scoring_digest,
|
||||||
|
# diagnostics/activity_feed_api) all normalize via attribution.normalize_handle
|
||||||
|
# anyway, so writing the canonical form is the source-of-truth fix.
|
||||||
if proposed_by:
|
if proposed_by:
|
||||||
contributor = proposed_by.strip().strip('"').strip("'")
|
contributor = normalize_handle(proposed_by, conn=conn)
|
||||||
elif intake_tier == "research-task":
|
elif intake_tier == "research-task":
|
||||||
contributor = f"{agent_name} (self-directed)"
|
contributor = normalize_handle(agent_name, conn=conn)
|
||||||
elif intake_tier == "directed":
|
elif intake_tier == "directed":
|
||||||
contributor = "@m3taversal"
|
contributor = "m3taversal"
|
||||||
else:
|
else:
|
||||||
# Default: if no proposed_by and not a research task, Cory submitted it
|
# Default: if no proposed_by and not a research task, operator submitted it.
|
||||||
contributor = "@m3taversal"
|
contributor = "m3taversal"
|
||||||
|
|
||||||
# Build pipe-separated claim titles for the description field
|
# Build pipe-separated claim titles for the description field
|
||||||
claim_titles = " | ".join(
|
claim_titles = " | ".join(
|
||||||
|
|
|
||||||
|
|
@ -17,7 +17,9 @@ Identification:
|
||||||
PR whose `source_path` matches one of those filenames is research-derived.
|
PR whose `source_path` matches one of those filenames is research-derived.
|
||||||
|
|
||||||
Touch list (per matched PR):
|
Touch list (per matched PR):
|
||||||
1. UPDATE prs SET submitted_by = '<agent> (self-directed)'
|
1. UPDATE prs SET submitted_by = '<agent>' (canonical handle, lowercase, no
|
||||||
|
trailing "(self-directed)" suffix — see lib/extract.py and
|
||||||
|
diagnostics/activity_feed_api.py for why decorators leak into 404s)
|
||||||
2. DELETE FROM contribution_events
|
2. DELETE FROM contribution_events
|
||||||
WHERE handle='m3taversal' AND role='author' AND pr_number=?
|
WHERE handle='m3taversal' AND role='author' AND pr_number=?
|
||||||
3. INSERT OR IGNORE INTO contribution_events with handle=<agent>,
|
3. INSERT OR IGNORE INTO contribution_events with handle=<agent>,
|
||||||
|
|
@ -169,7 +171,7 @@ def apply_backfill(conn: sqlite3.Connection, matches: list[dict], dry_run: bool)
|
||||||
|
|
||||||
if dry_run:
|
if dry_run:
|
||||||
logger.info(
|
logger.info(
|
||||||
"would update pr=%d submitted_by '%s' → '%s (self-directed)' "
|
"would update pr=%d submitted_by '%s' → '%s' "
|
||||||
"[m3ta_event=%s, agent_event=%s]",
|
"[m3ta_event=%s, agent_event=%s]",
|
||||||
pr, m["current_submitted_by"], agent,
|
pr, m["current_submitted_by"], agent,
|
||||||
old_event_exists, new_event_exists,
|
old_event_exists, new_event_exists,
|
||||||
|
|
@ -181,10 +183,10 @@ def apply_backfill(conn: sqlite3.Connection, matches: list[dict], dry_run: bool)
|
||||||
counters["events_to_insert"] += 1
|
counters["events_to_insert"] += 1
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# 1. UPDATE prs.submitted_by
|
# 1. UPDATE prs.submitted_by — canonical handle only.
|
||||||
conn.execute(
|
conn.execute(
|
||||||
"UPDATE prs SET submitted_by = ? WHERE number = ?",
|
"UPDATE prs SET submitted_by = ? WHERE number = ?",
|
||||||
(f"{agent} (self-directed)", pr),
|
(agent, pr),
|
||||||
)
|
)
|
||||||
counters["prs"] += 1
|
counters["prs"] += 1
|
||||||
|
|
||||||
|
|
|
||||||
114
scripts/normalize-submitted-by.py
Executable file
114
scripts/normalize-submitted-by.py
Executable file
|
|
@ -0,0 +1,114 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
"""One-time backfill: canonicalize prs.submitted_by and sources.submitted_by.
|
||||||
|
|
||||||
|
Strips legacy decorators ("(self-directed)", "(reweave)"), lowercases, drops
|
||||||
|
the @ prefix. After this runs, every value matches the contract documented
|
||||||
|
on diagnostics/activity_feed_api.py::_normalize_contributor — and the
|
||||||
|
companion read-side fix becomes redundant defense-in-depth instead of
|
||||||
|
load-bearing.
|
||||||
|
|
||||||
|
Defaults to --dry-run. Pass --apply to commit.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python3 normalize-submitted-by.py --dry-run
|
||||||
|
python3 normalize-submitted-by.py --apply
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import sqlite3
|
||||||
|
import sys
|
||||||
|
from collections import Counter
|
||||||
|
|
||||||
|
DEFAULT_DB = os.environ.get("PIPELINE_DB", "/opt/teleo-eval/pipeline/pipeline.db")
|
||||||
|
|
||||||
|
# Valid handle: lowercase alphanum + _-, 1-39 chars (matches GitHub rules,
|
||||||
|
# same as pipeline/lib/attribution._HANDLE_RE). Anything with parens, spaces,
|
||||||
|
# or uppercase needs canonicalization.
|
||||||
|
_TRAILING_PAREN_RE = re.compile(r"\s*\([^)]*\)\s*$")
|
||||||
|
_HANDLE_RE = re.compile(r"^[a-z0-9][a-z0-9_-]{0,38}$")
|
||||||
|
|
||||||
|
|
||||||
|
def canonicalize(raw):
|
||||||
|
if raw is None:
|
||||||
|
return None
|
||||||
|
h = raw.strip().lower().lstrip("@")
|
||||||
|
h = _TRAILING_PAREN_RE.sub("", h).strip()
|
||||||
|
return h or None
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_table(conn, table, dry_run):
    """Canonicalize every non-NULL submitted_by value in *table*.

    Prints a per-table report (row count, mapping distribution, warnings for
    values that would not canonicalize to a valid handle) and returns the
    number of rows updated (or that would be updated under dry-run). Rows
    whose canonical form fails _HANDLE_RE are skipped, never written.
    """
    rows = conn.execute(
        f"SELECT rowid, submitted_by FROM {table} WHERE submitted_by IS NOT NULL"
    ).fetchall()
    # Keep only rows whose canonical form differs from what is stored.
    pending = [
        (rowid, before, after)
        for rowid, before in rows
        if (after := canonicalize(before)) != before
    ]

    print(f"\n{table}: {len(pending)} rows need normalization")
    if not pending:
        return 0

    # Distribution preview: top distinct (old -> new) mappings.
    mapping = Counter((before, after) for _, before, after in pending)
    for (old, new), count in mapping.most_common(15):
        print(f" {count:>5} {old!r:40} -> {new!r}")
    if len(mapping) > 15:
        print(f" ... ({len(mapping) - 15} more distinct mappings)")

    # Partition: a NULL result or a valid handle may be written; anything
    # else is garbage and must be left alone for manual inspection.
    ok, bad = [], []
    for rowid, old, new in pending:
        if new is None or _HANDLE_RE.match(new):
            ok.append((rowid, old, new))
        else:
            bad.append((rowid, old, new))

    if bad:
        print(f"\n WARNING: {len(bad)} rows would normalize to invalid handles:")
        for rowid, old, new in bad[:10]:
            print(f" rowid={rowid} {old!r} -> {new!r}")
        print(" These rows will be SKIPPED (left as-is). Inspect manually.")

    if dry_run:
        print(f" [dry-run] would update {len(ok)} rows in {table}")
        return len(ok)

    conn.executemany(
        f"UPDATE {table} SET submitted_by = ? WHERE rowid = ?",
        [(new, rowid) for rowid, _, new in ok],
    )
    print(f" updated {len(ok)} rows in {table}")
    return len(ok)
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """CLI entry point: canonicalize submitted_by in prs and sources.

    Dry-run by default; pass --apply to commit. --apply and --dry-run are
    mutually exclusive: previously both flags were accepted together and
    --apply silently won, so an explicit --dry-run could still mutate the
    database — unacceptable for a destructive one-time backfill.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--db", default=DEFAULT_DB)
    # Mutually exclusive mode flags: argparse now rejects "--apply --dry-run"
    # instead of quietly applying.
    mode = ap.add_mutually_exclusive_group()
    mode.add_argument("--apply", action="store_true", help="Commit changes (default is dry-run)")
    mode.add_argument("--dry-run", action="store_true", help="Preview only (default)")
    args = ap.parse_args()

    dry_run = not args.apply
    print(f"DB: {args.db}")
    print(f"Mode: {'DRY-RUN' if dry_run else 'APPLY'}")

    conn = sqlite3.connect(args.db, timeout=30)
    try:
        total = 0
        total += normalize_table(conn, "prs", dry_run)
        total += normalize_table(conn, "sources", dry_run)
        if not dry_run:
            # Single commit covering both tables: the backfill is
            # all-or-nothing (closing without commit rolls back on error).
            conn.commit()
            print(f"\nCommitted. Total rows updated: {total}")
        else:
            print(f"\nDry-run complete. Run with --apply to commit ({total} rows pending).")
    finally:
        conn.close()


if __name__ == "__main__":
    main()
|
||||||
Loading…
Reference in a new issue