"""Attribution module — shared between post_extract.py and merge.py. Owns: parsing attribution from YAML frontmatter, validating role entries, computing role counts for contributor upserts, building attribution blocks. Avoids circular dependency between post_extract.py (validates attribution at extraction time) and merge.py (records attribution at merge time). Both import from this shared module. Schema reference: schemas/attribution.md Weights reference: schemas/contribution-weights.yaml Epimetheus owns this module. Leo reviews changes. """ import logging import re from pathlib import Path logger = logging.getLogger("pipeline.attribution") VALID_ROLES = frozenset({"sourcer", "extractor", "challenger", "synthesizer", "reviewer"}) # ─── Parse attribution from claim content ────────────────────────────────── def parse_attribution(fm: dict) -> dict[str, list[dict]]: """Extract attribution block from claim frontmatter. Returns {role: [{"handle": str, "agent_id": str|None, "context": str|None}]} Handles both nested YAML format and flat field format. """ result = {role: [] for role in VALID_ROLES} attribution = fm.get("attribution") if isinstance(attribution, dict): # Nested format (from schema spec) for role in VALID_ROLES: entries = attribution.get(role, []) if isinstance(entries, list): for entry in entries: if isinstance(entry, dict) and "handle" in entry: result[role].append({ "handle": entry["handle"].strip().lower().lstrip("@"), "agent_id": entry.get("agent_id"), "context": entry.get("context"), }) elif isinstance(entry, str): result[role].append({"handle": entry.strip().lower().lstrip("@"), "agent_id": None, "context": None}) elif isinstance(entries, str): # Single entry as string result[role].append({"handle": entries.strip().lower().lstrip("@"), "agent_id": None, "context": None}) return result # Flat format fallback (attribution_sourcer, attribution_extractor, etc.) for role in VALID_ROLES: flat_val = fm.get(f"attribution_{role}") if flat_val: if isinstance(flat_val, str): result[role].append({"handle": flat_val.strip().lower().lstrip("@"), "agent_id": None, "context": None}) elif isinstance(flat_val, list): for v in flat_val: if isinstance(v, str): result[role].append({"handle": v.strip().lower().lstrip("@"), "agent_id": None, "context": None}) # Legacy fallback: infer from source field if not any(result[r] for r in VALID_ROLES): source = fm.get("source", "") if isinstance(source, str) and source: # Try to extract author handle from source string # Patterns: "@handle", "Author Name", "org, description" handle_match = re.search(r"@(\w+)", source) if handle_match: result["sourcer"].append({"handle": handle_match.group(1).lower(), "agent_id": None, "context": source}) else: # Use first word/phrase before comma as sourcer handle author = source.split(",")[0].strip().lower().replace(" ", "-") if author and len(author) > 1: result["sourcer"].append({"handle": author, "agent_id": None, "context": source}) return result def parse_attribution_from_file(filepath: str) -> dict[str, list[dict]]: """Read a claim file and extract attribution. Returns role→entries dict.""" try: content = Path(filepath).read_text() except (FileNotFoundError, PermissionError): return {role: [] for role in VALID_ROLES} from .post_extract import parse_frontmatter fm, _ = parse_frontmatter(content) if fm is None: return {role: [] for role in VALID_ROLES} return parse_attribution(fm) # ─── Validate attribution ────────────────────────────────────────────────── def validate_attribution(fm: dict, agent: str | None = None) -> list[str]: """Validate attribution block in claim frontmatter. Returns list of issues. Block on missing extractor, warn on missing sourcer. (Leo: extractor is always known, sourcer is best-effort.) If agent is provided and extractor is missing, auto-fix by setting the agent as extractor (same pattern as created-date auto-fix). Only validates if an attribution block is explicitly present. Legacy claims without attribution blocks are not blocked — they'll get attribution when enriched. New claims from v2 extraction always have attribution. """ issues = [] # Only validate if attribution block exists (don't break legacy claims) has_attribution = ( fm.get("attribution") is not None or any(fm.get(f"attribution_{role}") for role in VALID_ROLES) ) if not has_attribution: return [] # No attribution block = legacy claim, not an error attribution = parse_attribution(fm) if not attribution["extractor"]: if agent: # Auto-fix: set the processing agent as extractor attr = fm.get("attribution") if isinstance(attr, dict): attr["extractor"] = [{"handle": agent}] else: fm["attribution"] = {"extractor": [{"handle": agent}]} issues.append("fixed_missing_extractor") else: issues.append("missing_attribution_extractor") return issues # ─── Build attribution block ────────────────────────────────────────────── def build_attribution_block( agent: str, agent_id: str | None = None, source_handle: str | None = None, source_context: str | None = None, ) -> dict: """Build an attribution dict for a newly extracted claim. Called by openrouter-extract-v2.py when reconstructing claim content. """ attribution = { "extractor": [{"handle": agent}], "sourcer": [], "challenger": [], "synthesizer": [], "reviewer": [], } if agent_id: attribution["extractor"][0]["agent_id"] = agent_id if source_handle: entry = {"handle": source_handle.strip().lower().lstrip("@")} if source_context: entry["context"] = source_context attribution["sourcer"].append(entry) return attribution # ─── Compute role counts for contributor upserts ────────────────────────── def role_counts_from_attribution(attribution: dict[str, list[dict]]) -> dict[str, list[str]]: """Extract {role: [handle, ...]} for contributor table upserts. Returns a dict mapping each role to the list of contributor handles. Used by merge.py to credit contributors after merge. """ counts: dict[str, list[str]] = {} for role in VALID_ROLES: handles = [entry["handle"] for entry in attribution.get(role, []) if entry.get("handle")] if handles: counts[role] = handles return counts