Some checks failed
CI / lint-and-test (pull_request) Has been cancelled
Atomic extract-and-connect (lib/connect.py): - After extraction writes claim files, each new claim is embedded via OpenRouter, searched against Qdrant, and top-5 neighbors (cosine > 0.55) are added as `related` edges in the claim's frontmatter - Edges written on NEW claim only — avoids merge conflicts - Cross-domain connections enabled, non-fatal on Qdrant failure - Wired into openrouter-extract-v2.py post-extraction step Stale PR monitor (lib/stale_pr.py): - Every watchdog cycle checks open extract/* PRs - If open >30 min AND 0 claim files → auto-close with comment - After 2 stale closures → marks source as extraction_failed - Wired into watchdog.py as check #6 Response audit system: - response_audit table (migration v8), persistent audit conn in bot.py - 90-day retention cleanup, tool_calls JSON column - Confidence tag stripping, systemd ReadWritePaths for pipeline.db Supporting infrastructure: - reweave.py: nightly edge reconnection for orphan claims - reconcile-sources.py: source status reconciliation - backfill-domains.py: domain classification backfill - ops/reconcile-source-status.sh: operational reconciliation script - Attribution improvements, post-extract enrichments, merge improvements Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
190 lines
7.4 KiB
Python
190 lines
7.4 KiB
Python
"""Attribution module — shared between post_extract.py and merge.py.
|
|
|
|
Owns: parsing attribution from YAML frontmatter, validating role entries,
|
|
computing role counts for contributor upserts, building attribution blocks.
|
|
|
|
Avoids circular dependency between post_extract.py (validates attribution at
|
|
extraction time) and merge.py (records attribution at merge time). Both
|
|
import from this shared module.
|
|
|
|
Schema reference: schemas/attribution.md
|
|
Weights reference: schemas/contribution-weights.yaml
|
|
|
|
Epimetheus owns this module. Leo reviews changes.
|
|
"""
|
|
|
|
import logging
|
|
import re
|
|
from pathlib import Path
|
|
|
|
logger = logging.getLogger("pipeline.attribution")
|
|
|
|
VALID_ROLES = frozenset({"sourcer", "extractor", "challenger", "synthesizer", "reviewer"})
|
|
|
|
|
|
# ─── Parse attribution from claim content ──────────────────────────────────
|
|
|
|
|
|
def parse_attribution(fm: dict) -> dict[str, list[dict]]:
|
|
"""Extract attribution block from claim frontmatter.
|
|
|
|
Returns {role: [{"handle": str, "agent_id": str|None, "context": str|None}]}
|
|
Handles both nested YAML format and flat field format.
|
|
"""
|
|
result = {role: [] for role in VALID_ROLES}
|
|
|
|
attribution = fm.get("attribution")
|
|
if isinstance(attribution, dict):
|
|
# Nested format (from schema spec)
|
|
for role in VALID_ROLES:
|
|
entries = attribution.get(role, [])
|
|
if isinstance(entries, list):
|
|
for entry in entries:
|
|
if isinstance(entry, dict) and "handle" in entry:
|
|
result[role].append({
|
|
"handle": entry["handle"].strip().lower().lstrip("@"),
|
|
"agent_id": entry.get("agent_id"),
|
|
"context": entry.get("context"),
|
|
})
|
|
elif isinstance(entry, str):
|
|
result[role].append({"handle": entry.strip().lower().lstrip("@"), "agent_id": None, "context": None})
|
|
elif isinstance(entries, str):
|
|
# Single entry as string
|
|
result[role].append({"handle": entries.strip().lower().lstrip("@"), "agent_id": None, "context": None})
|
|
return result
|
|
|
|
# Flat format fallback (attribution_sourcer, attribution_extractor, etc.)
|
|
for role in VALID_ROLES:
|
|
flat_val = fm.get(f"attribution_{role}")
|
|
if flat_val:
|
|
if isinstance(flat_val, str):
|
|
result[role].append({"handle": flat_val.strip().lower().lstrip("@"), "agent_id": None, "context": None})
|
|
elif isinstance(flat_val, list):
|
|
for v in flat_val:
|
|
if isinstance(v, str):
|
|
result[role].append({"handle": v.strip().lower().lstrip("@"), "agent_id": None, "context": None})
|
|
|
|
# Legacy fallback: infer from source field
|
|
if not any(result[r] for r in VALID_ROLES):
|
|
source = fm.get("source", "")
|
|
if isinstance(source, str) and source:
|
|
# Try to extract author handle from source string
|
|
# Patterns: "@handle", "Author Name", "org, description"
|
|
handle_match = re.search(r"@(\w+)", source)
|
|
if handle_match:
|
|
result["sourcer"].append({"handle": handle_match.group(1).lower(), "agent_id": None, "context": source})
|
|
else:
|
|
# Use first word/phrase before comma as sourcer handle
|
|
author = source.split(",")[0].strip().lower().replace(" ", "-")
|
|
if author and len(author) > 1:
|
|
result["sourcer"].append({"handle": author, "agent_id": None, "context": source})
|
|
|
|
return result
|
|
|
|
|
|
def parse_attribution_from_file(filepath: str) -> dict[str, list[dict]]:
|
|
"""Read a claim file and extract attribution. Returns role→entries dict."""
|
|
try:
|
|
content = Path(filepath).read_text()
|
|
except (FileNotFoundError, PermissionError):
|
|
return {role: [] for role in VALID_ROLES}
|
|
|
|
from .post_extract import parse_frontmatter
|
|
fm, _ = parse_frontmatter(content)
|
|
if fm is None:
|
|
return {role: [] for role in VALID_ROLES}
|
|
|
|
return parse_attribution(fm)
|
|
|
|
|
|
# ─── Validate attribution ──────────────────────────────────────────────────
|
|
|
|
|
|
def validate_attribution(fm: dict, agent: str | None = None) -> list[str]:
|
|
"""Validate attribution block in claim frontmatter.
|
|
|
|
Returns list of issues. Block on missing extractor, warn on missing sourcer.
|
|
(Leo: extractor is always known, sourcer is best-effort.)
|
|
|
|
If agent is provided and extractor is missing, auto-fix by setting the
|
|
agent as extractor (same pattern as created-date auto-fix).
|
|
|
|
Only validates if an attribution block is explicitly present. Legacy claims
|
|
without attribution blocks are not blocked — they'll get attribution when
|
|
enriched. New claims from v2 extraction always have attribution.
|
|
"""
|
|
issues = []
|
|
|
|
# Only validate if attribution block exists (don't break legacy claims)
|
|
has_attribution = (
|
|
fm.get("attribution") is not None
|
|
or any(fm.get(f"attribution_{role}") for role in VALID_ROLES)
|
|
)
|
|
if not has_attribution:
|
|
return [] # No attribution block = legacy claim, not an error
|
|
|
|
attribution = parse_attribution(fm)
|
|
|
|
if not attribution["extractor"]:
|
|
if agent:
|
|
# Auto-fix: set the processing agent as extractor
|
|
attr = fm.get("attribution")
|
|
if isinstance(attr, dict):
|
|
attr["extractor"] = [{"handle": agent}]
|
|
else:
|
|
fm["attribution"] = {"extractor": [{"handle": agent}]}
|
|
issues.append("fixed_missing_extractor")
|
|
else:
|
|
issues.append("missing_attribution_extractor")
|
|
|
|
return issues
|
|
|
|
|
|
# ─── Build attribution block ──────────────────────────────────────────────
|
|
|
|
|
|
def build_attribution_block(
|
|
agent: str,
|
|
agent_id: str | None = None,
|
|
source_handle: str | None = None,
|
|
source_context: str | None = None,
|
|
) -> dict:
|
|
"""Build an attribution dict for a newly extracted claim.
|
|
|
|
Called by openrouter-extract-v2.py when reconstructing claim content.
|
|
"""
|
|
attribution = {
|
|
"extractor": [{"handle": agent}],
|
|
"sourcer": [],
|
|
"challenger": [],
|
|
"synthesizer": [],
|
|
"reviewer": [],
|
|
}
|
|
|
|
if agent_id:
|
|
attribution["extractor"][0]["agent_id"] = agent_id
|
|
|
|
if source_handle:
|
|
entry = {"handle": source_handle.strip().lower().lstrip("@")}
|
|
if source_context:
|
|
entry["context"] = source_context
|
|
attribution["sourcer"].append(entry)
|
|
|
|
return attribution
|
|
|
|
|
|
# ─── Compute role counts for contributor upserts ──────────────────────────
|
|
|
|
|
|
def role_counts_from_attribution(attribution: dict[str, list[dict]]) -> dict[str, list[str]]:
|
|
"""Extract {role: [handle, ...]} for contributor table upserts.
|
|
|
|
Returns a dict mapping each role to the list of contributor handles.
|
|
Used by merge.py to credit contributors after merge.
|
|
"""
|
|
counts: dict[str, list[str]] = {}
|
|
for role in VALID_ROLES:
|
|
handles = [entry["handle"] for entry in attribution.get(role, []) if entry.get("handle")]
|
|
if handles:
|
|
counts[role] = handles
|
|
return counts
|