"""Contributor attribution — tracks who contributed what and calculates tiers. Extracted from merge.py (Phase 5 decomposition). Functions: - is_knowledge_pr: diff classification (knowledge vs pipeline-only) - refine_commit_type: extract → challenge/enrich refinement from diff content - record_contributor_attribution: parse trailers + frontmatter, upsert contributors - upsert_contributor: insert/update contributor record with role counts - recalculate_tier: tier promotion based on config rules """ import json import logging import re from . import config, db from .forgejo import get_pr_diff logger = logging.getLogger("pipeline.contributor") def is_knowledge_pr(diff: str) -> bool: """Check if a PR touches knowledge files (claims, decisions, core, foundations). Knowledge PRs get full CI attribution weight. Pipeline-only PRs (inbox, entities, agents, archive) get zero CI weight. Mixed PRs count as knowledge — if a PR adds a claim, it gets attribution even if it also moves source files. Knowledge takes priority. (Ganymede review) """ knowledge_prefixes = ("domains/", "core/", "foundations/", "decisions/") for line in diff.split("\n"): if line.startswith("+++ b/") or line.startswith("--- a/"): path = line.split("/", 1)[1] if "/" in line else "" if any(path.startswith(p) for p in knowledge_prefixes): return True return False def refine_commit_type(diff: str, branch_commit_type: str) -> str: """Refine commit_type from diff content when branch prefix is ambiguous. Branch prefix gives initial classification (extract, research, entity, etc.). For 'extract' branches, diff content can distinguish: - challenge: adds challenged_by edges to existing claims - enrich: modifies existing claim frontmatter without new files - extract: creates new claim files (default for extract branches) Only refines 'extract' type — other branch types (research, entity, reweave, fix) are already specific enough. """ if branch_commit_type != "extract": return branch_commit_type new_files = 0 modified_files = 0 has_challenge_edge = False in_diff_header = False current_is_new = False for line in diff.split("\n"): if line.startswith("diff --git"): in_diff_header = True current_is_new = False elif line.startswith("new file"): current_is_new = True elif line.startswith("+++ b/"): path = line[6:] if any(path.startswith(p) for p in ("domains/", "core/", "foundations/")): if current_is_new: new_files += 1 else: modified_files += 1 in_diff_header = False elif line.startswith("+") and not line.startswith("+++"): if "challenged_by:" in line or "challenges:" in line: has_challenge_edge = True if has_challenge_edge and new_files == 0: return "challenge" if modified_files > 0 and new_files == 0: return "enrich" return "extract" async def record_contributor_attribution(conn, pr_number: int, branch: str, git_fn): """Record contributor attribution after a successful merge. Parses git trailers and claim frontmatter to identify contributors and their roles. Upserts into contributors table. Refines commit_type from diff content. Pipeline-only PRs (no knowledge files) are skipped. Args: git_fn: async callable matching _git signature (for git log parsing). """ from datetime import date as _date today = _date.today().isoformat() # Get the PR diff to parse claim frontmatter for attribution blocks diff = await get_pr_diff(pr_number) if not diff: return # Pipeline-only PRs (inbox, entities, agents) don't count toward CI if not is_knowledge_pr(diff): logger.info("PR #%d: pipeline-only commit — skipping CI attribution", pr_number) return # Refine commit_type from diff content (branch prefix may be too broad) row = conn.execute("SELECT commit_type FROM prs WHERE number = ?", (pr_number,)).fetchone() branch_type = row["commit_type"] if row and row["commit_type"] else "extract" refined_type = refine_commit_type(diff, branch_type) if refined_type != branch_type: conn.execute("UPDATE prs SET commit_type = ? WHERE number = ?", (refined_type, pr_number)) logger.info("PR #%d: commit_type refined %s → %s", pr_number, branch_type, refined_type) # Parse Pentagon-Agent trailer from branch commit messages agents_found: set[str] = set() rc, log_output = await git_fn( "log", f"origin/main..origin/{branch}", "--format=%b%n%N", timeout=10, ) if rc == 0: for match in re.finditer(r"Pentagon-Agent:\s*(\S+)\s*<([^>]+)>", log_output): agent_name = match.group(1).lower() agent_uuid = match.group(2) upsert_contributor( conn, agent_name, agent_uuid, "extractor", today, ) agents_found.add(agent_name) # Parse attribution blocks from claim frontmatter in diff # Look for added lines with attribution YAML current_role = None for line in diff.split("\n"): if not line.startswith("+") or line.startswith("+++"): continue stripped = line[1:].strip() # Detect role sections in attribution block for role in ("sourcer", "extractor", "challenger", "synthesizer", "reviewer"): if stripped.startswith(f"{role}:"): current_role = role break # Extract handle from attribution entries handle_match = re.match(r'-\s*handle:\s*["\']?([^"\']+)["\']?', stripped) if handle_match and current_role: handle = handle_match.group(1).strip().lower() agent_id_match = re.search(r'agent_id:\s*["\']?([^"\']+)', stripped) agent_id = agent_id_match.group(1).strip() if agent_id_match else None upsert_contributor(conn, handle, agent_id, current_role, today) # Fallback: if no attribution block found, credit the branch agent as extractor if not agents_found: row = conn.execute("SELECT agent FROM prs WHERE number = ?", (pr_number,)).fetchone() if row and row["agent"]: upsert_contributor(conn, row["agent"].lower(), None, "extractor", today) def upsert_contributor( conn, handle: str, agent_id: str | None, role: str, date_str: str, ): """Upsert a contributor record, incrementing the appropriate role count.""" role_col = f"{role}_count" if role_col not in ( "sourcer_count", "extractor_count", "challenger_count", "synthesizer_count", "reviewer_count", ): logger.warning("Unknown contributor role: %s", role) return existing = conn.execute( "SELECT handle FROM contributors WHERE handle = ?", (handle,) ).fetchone() if existing: conn.execute( f"""UPDATE contributors SET {role_col} = {role_col} + 1, claims_merged = claims_merged + CASE WHEN ? IN ('extractor', 'sourcer') THEN 1 ELSE 0 END, last_contribution = ?, updated_at = datetime('now') WHERE handle = ?""", (role, date_str, handle), ) else: conn.execute( f"""INSERT INTO contributors (handle, agent_id, first_contribution, last_contribution, {role_col}, claims_merged) VALUES (?, ?, ?, ?, 1, CASE WHEN ? IN ('extractor', 'sourcer') THEN 1 ELSE 0 END)""", (handle, agent_id, date_str, date_str, role), ) # Recalculate tier recalculate_tier(conn, handle) def recalculate_tier(conn, handle: str): """Recalculate contributor tier based on config rules.""" from datetime import date as _date, datetime as _dt row = conn.execute( "SELECT claims_merged, challenges_survived, first_contribution, tier FROM contributors WHERE handle = ?", (handle,), ).fetchone() if not row: return current_tier = row["tier"] claims_merged = row["claims_merged"] or 0 challenges_survived = row["challenges_survived"] or 0 first_contribution = row["first_contribution"] days_since_first = 0 if first_contribution: try: first_date = _dt.strptime(first_contribution, "%Y-%m-%d").date() days_since_first = (_date.today() - first_date).days except ValueError: pass # Check veteran first (higher tier) vet_rules = config.CONTRIBUTOR_TIER_RULES["veteran"] if (claims_merged >= vet_rules["claims_merged"] and days_since_first >= vet_rules["min_days_since_first"] and challenges_survived >= vet_rules["challenges_survived"]): new_tier = "veteran" elif claims_merged >= config.CONTRIBUTOR_TIER_RULES["contributor"]["claims_merged"]: new_tier = "contributor" else: new_tier = "new" if new_tier != current_tier: conn.execute( "UPDATE contributors SET tier = ?, updated_at = datetime('now') WHERE handle = ?", (new_tier, handle), ) logger.info("Contributor %s: tier %s → %s", handle, current_tier, new_tier) db.audit( conn, "contributor", "tier_change", json.dumps({"handle": handle, "from": current_tier, "to": new_tier}), )