teleo-infrastructure/lib/attribution.py
m3taversal d79ff60689 epimetheus: sync VPS-deployed code to repo — Mar 18-20 reliability + features
Pipeline reliability (8 fixes, reviewed by Ganymede+Rhea+Leo+Rio):
1. Merge API recovery — pre-flight approval check, transient/permanent distinction, jitter
2. Ghost PR detection — ls-remote branch check in reconciliation, network guard
3. Source status contract — directory IS status, no code change needed
4. Batch-state markers eliminated — two-gate skip (archive-check + batched branch-check)
5. Branch SHA tracking — batched ls-remote, auto-reset verdicts, dismiss stale reviews
6. Mirror pre-flight permissions — chown check in sync-mirror.sh
7. Telegram archive commit-after-write — git add/commit/push with rebase --abort fallback
8. Post-merge source archiving — queue/ → archive/{domain}/ after merge

Pipeline fixes:
- merge_cycled flag — eval attempts preserved during merge-failure cycling (Ganymede+Rhea)
- merge_failures diagnostic counter
- Startup recovery preserves eval_attempts (was incorrectly resetting to 0)
- No-diff PRs auto-closed by eval (root cause of 17 zombie PRs)
- GC threshold aligned with substantive fixer budget (was 2, now 4)
- Conflict retry with 3-attempt budget + permanent conflict handler
- Local ff-merge fallback for Forgejo 405 errors

Telegram bot:
- KB retrieval: 3-layer (entity resolution → claim search → agent context)
- Reply-to-bot handler (context.bot.id check)
- Tag regex: @teleo|@futairdbot
- Prompt rewrite for natural analyst voice
- Market data API integration (Ben's token price endpoint)
- Conversation windows (5-message unanswered counter, per-user-per-chat)
- Conversation history in prompt (last 5 exchanges)
- Worktree file lock for archive writes

Infrastructure:
- worktree_lock.py — file-based lock (flock) for main worktree coordination
- backfill-sources.py — source DB registration for Argus funnel
- batch-extract-50.sh v3 — two-gate skip, batched ls-remote, network guard
- sync-mirror.sh — auto-PR creation for mirrored GitHub branches, permission pre-flight
- Argus dashboard — conflicts + reviewing in backlog, queue count in funnel
- Enrichment-inside-frontmatter bug fix (regex anchor, not --- split)

Pentagon-Agent: Epimetheus <3D35839A-7722-4740-B93D-51157F7D5E70>
2026-03-20 20:17:27 +00:00

178 lines
6.8 KiB
Python

"""Attribution module — shared between post_extract.py and merge.py.
Owns: parsing attribution from YAML frontmatter, validating role entries,
computing role counts for contributor upserts, building attribution blocks.
Avoids circular dependency between post_extract.py (validates attribution at
extraction time) and merge.py (records attribution at merge time). Both
import from this shared module.
Schema reference: schemas/attribution.md
Weights reference: schemas/contribution-weights.yaml
Epimetheus owns this module. Leo reviews changes.
"""
import logging
import re
from pathlib import Path
logger = logging.getLogger("pipeline.attribution")
VALID_ROLES = frozenset({"sourcer", "extractor", "challenger", "synthesizer", "reviewer"})
# ─── Parse attribution from claim content ──────────────────────────────────
def parse_attribution(fm: dict) -> dict[str, list[dict]]:
"""Extract attribution block from claim frontmatter.
Returns {role: [{"handle": str, "agent_id": str|None, "context": str|None}]}
Handles both nested YAML format and flat field format.
"""
result = {role: [] for role in VALID_ROLES}
attribution = fm.get("attribution")
if isinstance(attribution, dict):
# Nested format (from schema spec)
for role in VALID_ROLES:
entries = attribution.get(role, [])
if isinstance(entries, list):
for entry in entries:
if isinstance(entry, dict) and "handle" in entry:
result[role].append({
"handle": entry["handle"].strip().lower().lstrip("@"),
"agent_id": entry.get("agent_id"),
"context": entry.get("context"),
})
elif isinstance(entry, str):
result[role].append({"handle": entry.strip().lower().lstrip("@"), "agent_id": None, "context": None})
elif isinstance(entries, str):
# Single entry as string
result[role].append({"handle": entries.strip().lower().lstrip("@"), "agent_id": None, "context": None})
return result
# Flat format fallback (attribution_sourcer, attribution_extractor, etc.)
for role in VALID_ROLES:
flat_val = fm.get(f"attribution_{role}")
if flat_val:
if isinstance(flat_val, str):
result[role].append({"handle": flat_val.strip().lower().lstrip("@"), "agent_id": None, "context": None})
elif isinstance(flat_val, list):
for v in flat_val:
if isinstance(v, str):
result[role].append({"handle": v.strip().lower().lstrip("@"), "agent_id": None, "context": None})
# Legacy fallback: infer from source field
if not any(result[r] for r in VALID_ROLES):
source = fm.get("source", "")
if isinstance(source, str) and source:
# Try to extract author handle from source string
# Patterns: "@handle", "Author Name", "org, description"
handle_match = re.search(r"@(\w+)", source)
if handle_match:
result["sourcer"].append({"handle": handle_match.group(1).lower(), "agent_id": None, "context": source})
else:
# Use first word/phrase before comma as sourcer handle
author = source.split(",")[0].strip().lower().replace(" ", "-")
if author and len(author) > 1:
result["sourcer"].append({"handle": author, "agent_id": None, "context": source})
return result
def parse_attribution_from_file(filepath: str) -> dict[str, list[dict]]:
"""Read a claim file and extract attribution. Returns role→entries dict."""
try:
content = Path(filepath).read_text()
except (FileNotFoundError, PermissionError):
return {role: [] for role in VALID_ROLES}
from .post_extract import parse_frontmatter
fm, _ = parse_frontmatter(content)
if fm is None:
return {role: [] for role in VALID_ROLES}
return parse_attribution(fm)
# ─── Validate attribution ──────────────────────────────────────────────────
def validate_attribution(fm: dict) -> list[str]:
"""Validate attribution block in claim frontmatter.
Returns list of issues. Block on missing extractor, warn on missing sourcer.
(Leo: extractor is always known, sourcer is best-effort.)
Only validates if an attribution block is explicitly present. Legacy claims
without attribution blocks are not blocked — they'll get attribution when
enriched. New claims from v2 extraction always have attribution.
"""
issues = []
# Only validate if attribution block exists (don't break legacy claims)
has_attribution = (
fm.get("attribution") is not None
or any(fm.get(f"attribution_{role}") for role in VALID_ROLES)
)
if not has_attribution:
return [] # No attribution block = legacy claim, not an error
attribution = parse_attribution(fm)
if not attribution["extractor"]:
issues.append("missing_attribution_extractor")
return issues
# ─── Build attribution block ──────────────────────────────────────────────
def build_attribution_block(
agent: str,
agent_id: str | None = None,
source_handle: str | None = None,
source_context: str | None = None,
) -> dict:
"""Build an attribution dict for a newly extracted claim.
Called by openrouter-extract-v2.py when reconstructing claim content.
"""
attribution = {
"extractor": [{"handle": agent}],
"sourcer": [],
"challenger": [],
"synthesizer": [],
"reviewer": [],
}
if agent_id:
attribution["extractor"][0]["agent_id"] = agent_id
if source_handle:
entry = {"handle": source_handle.strip().lower().lstrip("@")}
if source_context:
entry["context"] = source_context
attribution["sourcer"].append(entry)
return attribution
# ─── Compute role counts for contributor upserts ──────────────────────────
def role_counts_from_attribution(attribution: dict[str, list[dict]]) -> dict[str, list[str]]:
"""Extract {role: [handle, ...]} for contributor table upserts.
Returns a dict mapping each role to the list of contributor handles.
Used by merge.py to credit contributors after merge.
"""
counts: dict[str, list[str]] = {}
for role in VALID_ROLES:
handles = [entry["handle"] for entry in attribution.get(role, []) if entry.get("handle")]
if handles:
counts[role] = handles
return counts