# teleo-infrastructure/lib/contributor.py

"""Contributor attribution — tracks who contributed what and calculates tiers.

Extracted from merge.py (Phase 5 decomposition). Functions:
- is_knowledge_pr: diff classification (knowledge vs pipeline-only)
- refine_commit_type: extract → challenge/enrich refinement from diff content
- record_contributor_attribution: parse trailers + frontmatter, upsert contributors
- upsert_contributor: insert/update contributor record with role counts
- recalculate_tier: tier promotion based on config rules
"""

import json
import logging
import re

from . import config, db
from .forgejo import get_pr_diff

logger = logging.getLogger("pipeline.contributor")


def is_knowledge_pr(diff: str) -> bool:
    """Report whether a unified diff touches any knowledge file.

    A file counts as knowledge when its repo-relative path starts with
    ``domains/``, ``core/``, ``foundations/`` or ``decisions/``. Both sides
    of each file header are inspected (``--- a/…`` and ``+++ b/…``), so
    additions, edits and deletions of knowledge files all qualify.

    Knowledge PRs receive full CI attribution weight; pipeline-only PRs
    (inbox, entities, agents, archive) receive none. A mixed PR is treated
    as knowledge — one knowledge path is enough. (Ganymede review)

    Args:
        diff: Unified diff text of the PR.

    Returns:
        True as soon as one knowledge path is seen, otherwise False.
    """
    knowledge_roots = ("domains/", "core/", "foundations/", "decisions/")
    header_markers = ("+++ b/", "--- a/")

    for raw in diff.split("\n"):
        if not raw.startswith(header_markers):
            continue
        # Everything after the first "/" is the repo-relative path
        # (the "a" / "b" side prefix sits before that slash).
        _, _, touched = raw.partition("/")
        if touched.startswith(knowledge_roots):
            return True

    return False


def refine_commit_type(diff: str, branch_commit_type: str) -> str:
    """Narrow an 'extract' commit type using what the diff actually does.

    The branch prefix supplies the initial type. Every type other than
    'extract' (research, entity, reweave, fix, ...) is considered specific
    enough and is returned untouched. For 'extract' the diff decides:

    - extract:   at least one claim file is newly created (also the default
                 when nothing else applies)
    - challenge: no new claim files, and some added line contains
                 ``challenged_by:`` or ``challenges:``
    - enrich:    no new claim files, no challenge edge, but at least one
                 existing claim file is modified

    Claim files are those under ``domains/``, ``core/`` or ``foundations/``.

    Args:
        diff: Unified diff text of the PR.
        branch_commit_type: Commit type derived from the branch prefix.

    Returns:
        The refined commit type.
    """
    if branch_commit_type != "extract":
        return branch_commit_type

    claim_roots = ("domains/", "core/", "foundations/")
    created = 0
    edited = 0
    challenge_seen = False
    file_is_new = False  # reset at every "diff --git", set by "new file mode"

    for entry in diff.split("\n"):
        if entry.startswith("diff --git"):
            file_is_new = False
            continue
        if entry.startswith("new file"):
            file_is_new = True
            continue
        if entry.startswith("+++ b/"):
            # Strip the "+++ b/" marker to get the repo-relative path.
            if entry[6:].startswith(claim_roots):
                if file_is_new:
                    created += 1
                else:
                    edited += 1
            continue
        # Only genuinely added content lines can carry a challenge edge.
        if entry.startswith("+++") or not entry.startswith("+"):
            continue
        if "challenged_by:" in entry or "challenges:" in entry:
            challenge_seen = True

    if created:
        return "extract"
    if challenge_seen:
        return "challenge"
    return "enrich" if edited else "extract"


def _iter_frontmatter_attributions(diff: str):
    """Yield (role, handle, agent_id) for attribution entries on added diff lines.

    Only added lines ("+", excluding "+++" file headers) are considered. A line
    starting with ``<role>:`` switches the current role; a ``- handle: …`` line
    seen while a role is active yields one entry. ``agent_id`` is taken from the
    handle line itself or, failing that, from the immediately following added
    line (the usual block-YAML layout); it is None when neither carries one.
    """
    roles = ("sourcer", "extractor", "challenger", "synthesizer", "reviewer")
    lines = diff.split("\n")
    current_role = None

    for idx, line in enumerate(lines):
        if not line.startswith("+") or line.startswith("+++"):
            continue
        stripped = line[1:].strip()

        # Detect role sections in attribution block
        for role in roles:
            if stripped.startswith(f"{role}:"):
                current_role = role
                break

        # Extract handle from attribution entries
        handle_match = re.match(r'-\s*handle:\s*["\']?([^"\']+)["\']?', stripped)
        if not (handle_match and current_role):
            continue

        handle = handle_match.group(1).strip().lower()
        if not handle:
            # `- handle: ""` matches with a whitespace-only capture; never
            # create a contributor row keyed by the empty string.
            continue

        agent_id_match = re.search(r'agent_id:\s*["\']?([^"\']+)', stripped)
        if agent_id_match is None and idx + 1 < len(lines):
            # In block-style YAML the agent_id key sits on the line right
            # after "- handle:", so look one added line ahead.
            following = lines[idx + 1]
            if following.startswith("+") and not following.startswith("+++"):
                agent_id_match = re.match(
                    r'agent_id:\s*["\']?([^"\']+)', following[1:].strip(),
                )
        agent_id = agent_id_match.group(1).strip() if agent_id_match else None

        yield current_role, handle, agent_id


async def record_contributor_attribution(conn, pr_number: int, branch: str, git_fn):
    """Record contributor attribution after a successful merge.

    Parses git trailers and claim frontmatter to identify contributors
    and their roles. Upserts into contributors table. Refines commit_type
    from diff content. Pipeline-only PRs (no knowledge files) are skipped,
    as are PRs whose diff cannot be fetched (empty/None diff).

    Steps, in order:
    1. Fetch the PR diff; bail out if empty or pipeline-only.
    2. Refine ``prs.commit_type`` from the diff and persist it if it changed.
    3. Credit every ``Pentagon-Agent: <name> <uuid>`` trailer found in the
       branch's commits as an extractor.
    4. Credit every ``- handle:`` entry in added attribution frontmatter
       under the role section it appears in.
    5. If no trailer was found, credit ``prs.agent`` as extractor.

    Args:
        conn: DB connection whose rows support access by column name
            (presumably sqlite3 with a Row factory — confirm against caller).
        pr_number: PR number, used for the diff fetch and ``prs`` lookups.
        branch: Branch name; commits in ``origin/main..origin/<branch>`` are
            scanned for trailers.
        git_fn: async callable matching _git signature (for git log parsing);
            awaited here and unpacked as ``(returncode, output)``.
    """
    from datetime import date as _date

    today = _date.today().isoformat()

    # Get the PR diff to parse claim frontmatter for attribution blocks
    diff = await get_pr_diff(pr_number)
    if not diff:
        return

    # Pipeline-only PRs (inbox, entities, agents) don't count toward CI
    if not is_knowledge_pr(diff):
        logger.info("PR #%d: pipeline-only commit — skipping CI attribution", pr_number)
        return

    # Refine commit_type from diff content (branch prefix may be too broad)
    row = conn.execute("SELECT commit_type FROM prs WHERE number = ?", (pr_number,)).fetchone()
    branch_type = row["commit_type"] if row and row["commit_type"] else "extract"
    refined_type = refine_commit_type(diff, branch_type)
    if refined_type != branch_type:
        conn.execute("UPDATE prs SET commit_type = ? WHERE number = ?", (refined_type, pr_number))
        logger.info("PR #%d: commit_type refined %s → %s", pr_number, branch_type, refined_type)

    # Parse Pentagon-Agent trailer from branch commit messages
    # (%b = commit body, %N = commit notes).
    agents_found: set[str] = set()
    rc, log_output = await git_fn(
        "log", f"origin/main..origin/{branch}", "--format=%b%n%N",
        timeout=10,
    )
    if rc == 0:
        for match in re.finditer(r"Pentagon-Agent:\s*(\S+)\s*<([^>]+)>", log_output):
            agent_name = match.group(1).lower()
            agent_uuid = match.group(2)
            upsert_contributor(
                conn, agent_name, agent_uuid, "extractor", today,
            )
            agents_found.add(agent_name)

    # Parse attribution blocks from claim frontmatter in diff
    # (added lines only; see _iter_frontmatter_attributions).
    for role, handle, agent_id in _iter_frontmatter_attributions(diff):
        upsert_contributor(conn, handle, agent_id, role, today)

    # Fallback: no Pentagon-Agent trailer credited anyone, so credit the PR's
    # recorded agent as extractor.
    # NOTE(review): frontmatter handles are not tracked in agents_found, so
    # this also fires when frontmatter attribution exists — confirm intended.
    if not agents_found:
        row = conn.execute("SELECT agent FROM prs WHERE number = ?", (pr_number,)).fetchone()
        if row and row["agent"]:
            upsert_contributor(conn, row["agent"].lower(), None, "extractor", today)


def upsert_contributor(
    conn, handle: str, agent_id: str | None, role: str, date_str: str,
):
    """Upsert a contributor record, incrementing the appropriate role count.

    For an existing handle: bumps ``<role>_count``, bumps ``claims_merged``
    when the role is 'extractor' or 'sourcer', refreshes ``last_contribution``
    and ``updated_at``, and backfills ``agent_id`` if the stored value is NULL.
    For a new handle: inserts a row with the role count at 1 and
    ``first_contribution`` = ``last_contribution`` = ``date_str``.
    The contributor's tier is recalculated afterwards in both cases.

    Args:
        conn: DB connection exposing ``execute(sql, params)``.
        handle: Contributor handle used as the lookup key.
        agent_id: Optional agent identifier; never overwrites a stored one.
        role: One of sourcer, extractor, challenger, synthesizer, reviewer.
            Any other value is logged and ignored.
        date_str: Contribution date string stored as-is.
    """
    # The column name is interpolated into SQL below, so it must come from
    # this fixed whitelist — never from unchecked input.
    role_col = f"{role}_count"
    if role_col not in (
        "sourcer_count", "extractor_count", "challenger_count",
        "synthesizer_count", "reviewer_count",
    ):
        logger.warning("Unknown contributor role: %s", role)
        return

    existing = conn.execute(
        "SELECT handle FROM contributors WHERE handle = ?", (handle,)
    ).fetchone()

    if existing:
        # COALESCE keeps an already-stored agent_id and only fills a NULL one,
        # so a contributor first recorded without an id (e.g. via the
        # branch-agent fallback) picks it up once a later call supplies it.
        conn.execute(
            f"""UPDATE contributors SET
                {role_col} = {role_col} + 1,
                claims_merged = claims_merged + CASE WHEN ? IN ('extractor', 'sourcer') THEN 1 ELSE 0 END,
                agent_id = COALESCE(agent_id, ?),
                last_contribution = ?,
                updated_at = datetime('now')
            WHERE handle = ?""",
            (role, agent_id, date_str, handle),
        )
    else:
        conn.execute(
            f"""INSERT INTO contributors (handle, agent_id, first_contribution, last_contribution, {role_col}, claims_merged)
            VALUES (?, ?, ?, ?, 1, CASE WHEN ? IN ('extractor', 'sourcer') THEN 1 ELSE 0 END)""",
            (handle, agent_id, date_str, date_str, role),
        )

    # Recalculate tier
    recalculate_tier(conn, handle)


def recalculate_tier(conn, handle: str):
    """Recalculate contributor tier based on config rules and persist changes.

    Tiers, checked from highest to lowest:
    - veteran: meets all of the configured ``claims_merged``,
      ``min_days_since_first`` and ``challenges_survived`` thresholds
    - contributor: meets the configured ``claims_merged`` threshold
    - new: everything else

    When the computed tier differs from the stored one, the row is updated,
    the change is logged, and an audit record is written. Does nothing if the
    handle has no contributor row.

    Args:
        conn: DB connection whose rows support access by column name.
        handle: Contributor handle to re-evaluate.
    """
    from datetime import date as _date, datetime as _dt

    row = conn.execute(
        "SELECT claims_merged, challenges_survived, first_contribution, tier FROM contributors WHERE handle = ?",
        (handle,),
    ).fetchone()
    if not row:
        return

    current_tier = row["tier"]
    claims_merged = row["claims_merged"] or 0
    challenges_survived = row["challenges_survived"] or 0
    first_contribution = row["first_contribution"]

    days_since_first = 0
    if first_contribution:
        try:
            # Only the leading YYYY-MM-DD matters for tenure. str() + slice
            # tolerates values carrying a time component ("2024-01-05 12:00:00")
            # and date/datetime objects handed back by the driver.
            first_date = _dt.strptime(str(first_contribution)[:10], "%Y-%m-%d").date()
        except ValueError:
            # Best-effort: keep tenure at 0 rather than failing the merge
            # path, but leave a trace — a silent 0 blocks veteran promotion.
            logger.warning(
                "Contributor %s: unparseable first_contribution %r — treating tenure as 0 days",
                handle, first_contribution,
            )
        else:
            days_since_first = (_date.today() - first_date).days

    # Check veteran first (higher tier)
    tier_rules = config.CONTRIBUTOR_TIER_RULES
    vet_rules = tier_rules["veteran"]
    if (claims_merged >= vet_rules["claims_merged"]
            and days_since_first >= vet_rules["min_days_since_first"]
            and challenges_survived >= vet_rules["challenges_survived"]):
        new_tier = "veteran"
    elif claims_merged >= tier_rules["contributor"]["claims_merged"]:
        new_tier = "contributor"
    else:
        new_tier = "new"

    if new_tier != current_tier:
        conn.execute(
            "UPDATE contributors SET tier = ?, updated_at = datetime('now') WHERE handle = ?",
            (new_tier, handle),
        )
        logger.info("Contributor %s: tier %s → %s", handle, current_tier, new_tier)
        db.audit(
            conn, "contributor", "tier_change",
            json.dumps({"handle": handle, "from": current_tier, "to": new_tier}),
        )