Some checks are pending
CI / lint-and-test (pull_request) Waiting to run
Companion / write-side fix to fix/activity-feed-canonical-handle.

The activity-feed canonicalization was a read-side guard. The bug at the source is that extract.py and two backfill scripts write decorated strings (`Vida (self-directed)`, `pipeline (reweave)`, `@m3taversal`) into prs.submitted_by and sources.submitted_by. Downstream readers (lib.contributor.insert_contribution_event, scripts/scoring_digest, diagnostics/activity_feed_api) all strip the decorator on read — but anything that reads the column verbatim (like /api/activity-feed before the read-side fix) gets a 404 on /contributors/{decorated-handle}.

Stop writing the decorator. The self-directed signal is already carried by intake_tier == research-task plus the prs.agent column; the suffix is redundant string noise that costs us correctness at every consumer that forgets to strip.

Changes:
- lib/extract.py:690 — write canonical handle via attribution.normalize_handle. Direct elif for intake_tier == research-task now stores just agent_name. `@m3taversal` -> `m3taversal`.
- diagnostics/backfill_submitted_by.py — same fix in two branches plus the reweave branch (`pipeline (reweave)` -> `pipeline`).
- scripts/backfill-research-session-attribution.py — UPDATE prs sets agent handle alone, no suffix. Docstring + log line updated.
- scripts/normalize-submitted-by.py (new) — one-time backfill that canonicalizes existing prs.submitted_by and sources.submitted_by rows. Strips trailing parenthetical decorators, lowercases, drops @. Defaults to dry-run; --apply to commit. Skips rows that would normalize to invalid handles (no garbage falls through silently).

Dry-run against live pipeline.db:
- prs: 3008 rows need normalization (clean mappings, 0 invalid)
- sources: 730 rows need normalization (clean mappings, 0 invalid)
- Total: 3738 rows. All map to existing handle column values.

After this lands + auto-deploys, the operator should run `python3 scripts/normalize-submitted-by.py --apply` once to clean historical rows.
The read-side canonicalization in diagnostics/activity_feed_api.py (fix/activity-feed-canonical-handle) becomes redundant defense-in-depth instead of load-bearing. No KB writes.
143 lines
5.1 KiB
Python
143 lines
5.1 KiB
Python
#!/usr/bin/env python3
|
|
"""One-time backfill: populate submitted_by on prs table from source archive files.
|
|
|
|
Matches PRs to sources via branch name slug → source filename.
|
|
Reads proposed_by and intake_tier from source frontmatter.
|
|
|
|
Run: python3 backfill_submitted_by.py
|
|
"""
|
|
|
|
import os
|
|
import re
|
|
import sqlite3
|
|
from pathlib import Path
|
|
|
|
# Both locations can be overridden through environment variables of the same
# name; otherwise the defaults under /opt/teleo-eval are used.
DB_PATH = os.getenv("DB_PATH", "/opt/teleo-eval/pipeline/pipeline.db")
ARCHIVE_DIR = Path(
    os.getenv("ARCHIVE_DIR", "/opt/teleo-eval/workspaces/main/inbox/archive")
)
|
|
|
|
|
|
def parse_frontmatter(path: Path) -> dict:
    """Parse the flat ``key: value`` frontmatter block of a markdown file.

    This is a minimal line-based reader, not a YAML parser: each line is split
    at its first ":", surrounding quotes are removed from the value, and empty
    or ``null`` values become ``None``. Lines without a ":" are ignored.

    Args:
        path: Markdown file to read (decoded as UTF-8, undecodable bytes
            replaced).

    Returns:
        Mapping of frontmatter keys to string values (or ``None``). Empty when
        the file does not start with ``---`` or the block is never closed.
    """
    text = path.read_text(encoding="utf-8", errors="replace")
    if not text.startswith("---"):
        return {}

    # The closing delimiter must START a line. Searching for the first "---"
    # anywhere would cut the block short at a value such as
    # `title: "A --- B"` and silently drop every key after it.
    body_lines = []
    closed = False
    for index, raw_line in enumerate(text[3:].split("\n")):
        line = raw_line.strip()
        # index 0 is the remainder of the opening delimiter line itself.
        if index > 0 and line.startswith("---"):
            closed = True
            break
        body_lines.append(line)
    if not closed:
        return {}

    fm = {}
    for line in body_lines:
        if not line or ":" not in line:
            continue
        key, _, val = line.partition(":")
        val = val.strip().strip('"').strip("'")
        # Treat an empty value and a literal "null" (any case) as missing.
        fm[key.strip()] = None if val == "" or val.lower() == "null" else val
    return fm
|
|
|
|
|
|
def slug_from_branch(branch: str) -> str:
    """Derive the source slug from a branch name.

    For a branch such as 'extract/2026-04-06-slug-hash', everything up to and
    including the first "/" is dropped, then a trailing "-" followed by four
    lowercase hex characters (e.g. -3e68, -a6af) is removed if present.
    """
    _prefix, slash, remainder = branch.partition("/")
    name = remainder if slash else branch
    return re.sub(r"-[0-9a-f]{4}$", "", name)
|
|
|
|
|
|
def _canonical_handle(raw: str) -> str:
    """Reduce a raw ``proposed_by`` value to a canonical contributor handle.

    Removes surrounding whitespace and quotes, a trailing parenthetical
    decorator such as "(self-directed)" or "(reweave)", and any leading "@",
    then lowercases. May return an empty string if nothing else was present.
    """
    handle = raw.strip().strip('"').strip("'")
    handle = re.sub(r"\s*\([^()]*\)\s*$", "", handle)
    return handle.lower().lstrip("@").strip()


def _contributor_from_source(fm: dict, branch: str) -> str:
    """Pick the contributor handle for a PR whose source frontmatter was found."""
    proposed_by = fm.get("proposed_by")
    if proposed_by:
        return _canonical_handle(proposed_by)

    if fm.get("intake_tier") == "research-task":
        # Derive agent from branch prefix
        prefix = branch.split("/", 1)[0] if "/" in branch else "unknown"
        agent_map = {
            "extract": "pipeline", "ingestion": "pipeline",
            "rio": "rio", "theseus": "theseus", "vida": "vida",
            "clay": "clay", "astra": "astra", "leo": "leo",
            "reweave": "pipeline",
        }
        return agent_map.get(prefix, prefix)

    # intake_tier == "directed", and the default: if the source exists but has
    # no proposed_by, the operator submitted it.
    return "m3taversal"


def _contributor_from_branch(branch: str) -> str:
    """Pick the contributor handle for a PR with no matching source file."""
    # Agent-named branches from overnight research sessions
    if branch.startswith(("rio/", "theseus/", "vida/", "clay/", "astra/", "leo/")):
        return branch.split("/", 1)[0]
    if branch.startswith("reweave/"):
        return "pipeline"
    # Everything else (extract/, ingestion/, unknown) → operator directed it
    return "m3taversal"


def main():
    """Backfill ``prs.submitted_by`` for rows where it is NULL.

    Each PR branch is reduced to a slug and matched against the markdown files
    in ARCHIVE_DIR, first by exact filename stem and then by substring. When a
    source with frontmatter is found, the handle comes from its ``proposed_by``
    / ``intake_tier`` fields and ``source_path`` is recorded as well; otherwise
    the handle is derived from the branch prefix alone.

    `submitted_by` is stored as a canonical handle (lowercase, no @, no
    "(self-directed)" / "(reweave)" suffix). Read consumers normalize via
    attribution.normalize_handle, so writing decorated strings produces
    downstream 404s on /contributors/{handle} (livingip-web timeline).
    """
    conn = sqlite3.connect(DB_PATH, timeout=30)
    try:
        conn.row_factory = sqlite3.Row

        # Build source index: filename stem → frontmatter. Sorted because glob
        # order is arbitrary and the substring fallback takes the first hit;
        # sorting keeps repeated runs consistent.
        source_index = {}
        if ARCHIVE_DIR.exists():
            for f in sorted(ARCHIVE_DIR.glob("*.md")):
                source_index[f.stem] = parse_frontmatter(f)
        print(f"Indexed {len(source_index)} source files from {ARCHIVE_DIR}")

        # Get all PRs without submitted_by
        prs = conn.execute(
            "SELECT number, branch FROM prs WHERE submitted_by IS NULL AND branch IS NOT NULL"
        ).fetchall()
        print(f"Found {len(prs)} PRs without submitted_by")

        updated = 0
        for pr in prs:
            branch = pr["branch"]
            slug = slug_from_branch(branch)

            # Try to match slug to a source file
            matched_stem = slug
            fm = source_index.get(slug)
            if not fm and slug:
                # Partial matching: slug might be a substring of the source
                # filename (or vice versa). An empty slug is excluded because
                # "" is a substring of every stem.
                for stem, sfm in source_index.items():
                    if slug in stem or stem in slug:
                        fm = sfm
                        matched_stem = stem
                        break

            if fm:
                contributor = _contributor_from_source(fm, branch)
                if contributor:
                    # Record the file that actually matched, not the slug: after
                    # a partial match the slug is not a real filename.
                    conn.execute(
                        "UPDATE prs SET submitted_by = ?, source_path = ? WHERE number = ?",
                        (contributor, f"inbox/archive/{matched_stem}.md", pr["number"]),
                    )
                    updated += 1
            else:
                conn.execute(
                    "UPDATE prs SET submitted_by = ? WHERE number = ?",
                    (_contributor_from_branch(branch), pr["number"]),
                )
                updated += 1

        conn.commit()
    finally:
        # Release the database handle even if indexing or an UPDATE fails;
        # uncommitted changes are discarded in that case.
        conn.close()

    print(f"Updated {updated}/{len(prs)} PRs with submitted_by")


# Run the backfill only when executed as a script.
if __name__ == "__main__":
    main()
|