teleo-codex/ops/pipeline-v2/lib/entity_batch.py
m3taversal 05d74d5e32 sync: import all VPS pipeline + diagnostics code as baseline
Imports 67 files from VPS (/opt/teleo-eval/) into repo as the single source
of truth. Previously only 8 of 67 files existed in repo — the rest were
deployed directly to VPS via SCP, causing massive drift.

Includes:
- pipeline/lib/: 33 Python modules (daemon core, extraction, evaluation, merge, cascade, cross-domain, costs, attribution, etc.)
- pipeline/: main daemon (teleo-pipeline.py), reweave.py, batch-extract-50.sh
- diagnostics/: 19 files (4-page dashboard, alerting, daily digest, review queue, tier1 metrics)
- agent-state/: bootstrap, lib-state, cascade inbox processor, schema
- systemd/: service unit files for reference
- deploy.sh: rsync-based deploy with --dry-run, syntax checks, dirty-tree gate
- research-session.sh: updated with Step 8.5 digest + cascade inbox processing

No new code written — all files are exact copies from VPS as of 2026-04-06.
From this point forward: edit in repo, commit, then deploy.sh.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-07 00:00:00 +01:00

358 lines
13 KiB
Python

"""Entity batch processor — applies queued entity operations to main.
Reads from entity_queue, applies creates/updates to the main worktree,
commits directly to main. No PR needed for entity timeline appends —
they're factual, commutative, and low-risk.
Entity creates (new entity files) go through PR review like claims.
Entity updates (timeline appends) commit directly — they're additive
and recoverable from source archives if wrong.
Runs as part of the pipeline's ingest stage or as a standalone cron.
Epimetheus owns this module. Leo reviews changes. Rhea deploys.
"""
import asyncio
import json
import logging
import os
import re
from datetime import date
from pathlib import Path
from . import config, db
from .entity_queue import cleanup, dequeue, mark_failed, mark_processed
logger = logging.getLogger("pipeline.entity_batch")
def _read_file(path: str) -> str:
try:
with open(path) as f:
return f.read()
except FileNotFoundError:
return ""
async def _git(*args, cwd: str | None = None, timeout: int = 60) -> tuple[int, str]:
    """Run a git command asynchronously.

    Args:
        *args: arguments passed to ``git`` (e.g. ``"push", "origin", "main"``).
        cwd: working directory; defaults to ``config.MAIN_WORKTREE``.
        timeout: seconds to wait for the command before killing it.

    Returns:
        (returncode, combined stdout+stderr text). On timeout the returncode
        is -1 and the message names the subcommand that timed out.
    """
    proc = await asyncio.create_subprocess_exec(
        "git", *args,
        cwd=cwd or str(config.MAIN_WORKTREE),
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE,
    )
    try:
        stdout, stderr = await asyncio.wait_for(proc.communicate(), timeout=timeout)
    except asyncio.TimeoutError:
        proc.kill()
        await proc.wait()  # reap the killed process so it doesn't linger as a zombie
        return -1, f"git {args[0]} timed out after {timeout}s"
    # errors="replace": git can emit non-UTF-8 bytes (locale messages, raw
    # filenames); a UnicodeDecodeError here would mask the command's real result.
    output = (stdout or b"").decode(errors="replace").strip()
    if stderr:
        output += "\n" + stderr.decode(errors="replace").strip()
    return proc.returncode, output
def _apply_timeline_entry(entity_path: str, timeline_entry: str) -> tuple[bool, str]:
"""Append a timeline entry to an existing entity file.
Returns (success, message).
"""
if not os.path.exists(entity_path):
return False, f"entity file not found: {entity_path}"
content = _read_file(entity_path)
if not content:
return False, f"entity file empty: {entity_path}"
# Check for duplicate timeline entry
if timeline_entry.strip() in content:
return False, "duplicate timeline entry"
# Find or create Timeline section
if "## Timeline" in content:
lines = content.split("\n")
insert_idx = len(lines)
in_timeline = False
for i, line in enumerate(lines):
if line.strip().startswith("## Timeline"):
in_timeline = True
continue
if in_timeline and line.strip().startswith("## "):
insert_idx = i
break
lines.insert(insert_idx, timeline_entry)
updated = "\n".join(lines)
else:
updated = content.rstrip() + "\n\n## Timeline\n\n" + timeline_entry + "\n"
with open(entity_path, "w") as f:
f.write(updated)
return True, "timeline entry appended"
def _apply_claim_enrichment(claim_path: str, evidence: str, pr_number: int,
original_title: str, similarity: float) -> tuple[bool, str]:
"""Append auto-enrichment evidence to an existing claim file.
Used for near-duplicate auto-conversion. (Ganymede: route through entity_batch)
"""
if not os.path.exists(claim_path):
return False, f"target claim not found: {claim_path}"
content = _read_file(claim_path)
if not content:
return False, f"target claim empty: {claim_path}"
# Dedup: skip if this PR already enriched this claim (idempotency)
if f"PR #{pr_number}" in content:
return False, f"already enriched by PR #{pr_number}"
enrichment_block = (
f"\n\n### Auto-enrichment (near-duplicate conversion, similarity={similarity:.2f})\n"
f"*Source: PR #{pr_number}\"{original_title}\"*\n"
f"*Auto-converted by substantive fixer. Review: revert if this evidence doesn't belong here.*\n\n"
f"{evidence}\n"
)
if "\n---\n" in content:
parts = content.rsplit("\n---\n", 1)
updated = parts[0] + enrichment_block + "\n---\n" + parts[1]
else:
updated = content + enrichment_block
with open(claim_path, "w") as f:
f.write(updated)
return True, "enrichment appended"
def _apply_entity_create(entity_path: str, content: str) -> tuple[bool, str]:
"""Create a new entity file. Returns (success, message)."""
if os.path.exists(entity_path):
return False, f"entity already exists: {entity_path}"
os.makedirs(os.path.dirname(entity_path), exist_ok=True)
with open(entity_path, "w") as f:
f.write(content)
return True, "entity created"
async def apply_batch(conn=None, max_entries: int = 50) -> tuple[int, int]:
    """Process the entity queue. Returns (applied, failed).

    1. Pull latest main (checkout + fetch + hard reset to origin/main)
    2. Read pending queue entries
    3. Apply each operation to the main worktree
    4. Commit all changes in one batch commit
    5. Push to origin (pull-rebase + retry); only after a successful push are
       entries marked processed, so a failed push leaves them queued for retry

    Args:
        conn: optional DB connection, used only for audit logging.
        max_entries: maximum number of queue entries to process in one batch.

    Returns:
        (applied, failed). On fetch/reset or push failure the applied count is
        reported as 0 — the worktree is reset and nothing was marked processed.
    """
    main_wt = str(config.MAIN_WORKTREE)
    # Ensure we're on main branch — batch script may have left worktree on an
    # extract branch. Return code deliberately ignored: the fetch/reset below
    # will surface any real problem.
    await _git("checkout", "main", cwd=main_wt)
    # Pull latest main. Hard reset (not merge) so the worktree exactly mirrors
    # origin/main before we write into it.
    rc, out = await _git("fetch", "origin", "main", cwd=main_wt)
    if rc != 0:
        logger.error("Failed to fetch main: %s", out)
        return 0, 0
    rc, out = await _git("reset", "--hard", "origin/main", cwd=main_wt)
    if rc != 0:
        logger.error("Failed to reset main: %s", out)
        return 0, 0
    # Read queue
    entries = dequeue(limit=max_entries)
    if not entries:
        return 0, 0
    logger.info("Processing %d entity queue entries", len(entries))
    applied_entries: list[dict] = []  # Track for post-push marking (Ganymede review)
    failed = 0
    files_changed: set[str] = set()  # repo-relative paths to stage for the batch commit
    for entry in entries:
        # Handle enrichments (from substantive fixer near-duplicate conversion)
        if entry.get("type") == "enrichment":
            target = entry.get("target_claim", "")
            evidence = entry.get("evidence", "")
            domain = entry.get("domain", "")
            if not target or not evidence:
                mark_failed(entry, "enrichment missing target or evidence")
                failed += 1
                continue
            # basename() strips any directory components the queue entry
            # carried — the claim always lives directly under domains/<domain>/
            claim_path = os.path.join(main_wt, "domains", domain, os.path.basename(target))
            rel_path = os.path.join("domains", domain, os.path.basename(target))
            try:
                ok, msg = _apply_claim_enrichment(
                    claim_path, evidence, entry.get("pr_number", 0),
                    entry.get("original_title", ""), entry.get("similarity", 0),
                )
                if ok:
                    files_changed.add(rel_path)
                    applied_entries.append(entry)
                    logger.info("Applied enrichment to %s: %s", target, msg)
                else:
                    mark_failed(entry, msg)
                    failed += 1
            except Exception as e:
                logger.exception("Failed enrichment on %s", target)
                mark_failed(entry, str(e))
                failed += 1
            continue
        # Handle entity operations
        entity = entry.get("entity", {})
        filename = entity.get("filename", "")
        domain = entity.get("domain", "")
        action = entity.get("action", "")
        if not filename or not domain:
            mark_failed(entry, "missing filename or domain")
            failed += 1
            continue
        # Sanitize filename — prevent path traversal (Ganymede review)
        filename = os.path.basename(filename)
        entity_dir = os.path.join(main_wt, "entities", domain)
        entity_path = os.path.join(entity_dir, filename)
        rel_path = os.path.join("entities", domain, filename)
        try:
            if action == "update":
                timeline = entity.get("timeline_entry", "")
                if not timeline:
                    mark_failed(entry, "update with no timeline_entry")
                    failed += 1
                    continue
                ok, msg = _apply_timeline_entry(entity_path, timeline)
                if ok:
                    files_changed.add(rel_path)
                    applied_entries.append(entry)
                    logger.debug("Applied update to %s: %s", filename, msg)
                else:
                    mark_failed(entry, msg)
                    failed += 1
            elif action == "create":
                content = entity.get("content", "")
                if not content:
                    mark_failed(entry, "create with no content")
                    failed += 1
                    continue
                # If entity already exists, try to apply as timeline update instead
                if os.path.exists(entity_path):
                    timeline = entity.get("timeline_entry", "")
                    if timeline:
                        ok, msg = _apply_timeline_entry(entity_path, timeline)
                        if ok:
                            files_changed.add(rel_path)
                            applied_entries.append(entry)
                        else:
                            mark_failed(entry, f"create→update fallback: {msg}")
                            failed += 1
                    else:
                        mark_failed(entry, "entity exists, no timeline to append")
                        failed += 1
                    continue
                ok, msg = _apply_entity_create(entity_path, content)
                if ok:
                    files_changed.add(rel_path)
                    applied_entries.append(entry)
                    logger.debug("Created entity %s", filename)
                else:
                    mark_failed(entry, msg)
                    failed += 1
            else:
                mark_failed(entry, f"unknown action: {action}")
                failed += 1
        except Exception as e:
            logger.exception("Failed to apply entity %s", filename)
            mark_failed(entry, str(e))
            failed += 1
    applied = len(applied_entries)
    # Commit and push if any files changed
    if files_changed:
        # Stage changed files
        for f in files_changed:
            await _git("add", f, cwd=main_wt)
        # Commit
        commit_msg = (
            f"entity-batch: update {len(files_changed)} entities\n\n"
            f"- Applied {applied} entity operations from queue\n"
            f"- Files: {', '.join(sorted(files_changed)[:10])}"
            f"{'...' if len(files_changed) > 10 else ''}\n\n"
            f"Pentagon-Agent: Epimetheus <968B2991-E2DF-4006-B962-F5B0A0CC8ACA>"
        )
        rc, out = await _git("commit", "-m", commit_msg, cwd=main_wt)
        if rc != 0:
            logger.error("Entity batch commit failed: %s", out)
            return applied, failed
        # Push with retry — main advances frequently from merge module.
        # Pull-rebase before each attempt to catch up with remote.
        push_ok = False
        for attempt in range(3):
            # Always pull-rebase before pushing to catch up with remote main
            rc, out = await _git("pull", "--rebase", "origin", "main", cwd=main_wt, timeout=30)
            if rc != 0:
                # Rebase conflict or fetch failure: abandon the batch commit
                # entirely, restore a clean origin/main worktree, and report
                # everything as failed so the queue retries next cycle.
                logger.warning("Entity batch pull-rebase failed (attempt %d): %s", attempt + 1, out)
                await _git("rebase", "--abort", cwd=main_wt)
                await _git("reset", "--hard", "origin/main", cwd=main_wt)
                return 0, failed + applied
            rc, out = await _git("push", "origin", "main", cwd=main_wt, timeout=30)
            if rc == 0:
                push_ok = True
                break
            logger.warning("Entity batch push failed (attempt %d), retrying: %s", attempt + 1, out[:100])
            await asyncio.sleep(2)  # Brief pause before retry
        if not push_ok:
            # Same recovery as the rebase failure: drop the local commit so the
            # next run starts from a clean mirror of origin/main.
            logger.error("Entity batch push failed after 3 attempts")
            await _git("reset", "--hard", "origin/main", cwd=main_wt)
            return 0, failed + applied
        # Push succeeded — NOW mark entries as processed (Ganymede review).
        # Marking before the push would lose entries if the push failed.
        for entry in applied_entries:
            mark_processed(entry)
        logger.info(
            "Entity batch: committed %d file changes (%d applied, %d failed)",
            len(files_changed), applied, failed,
        )
        # Audit
        if conn:
            db.audit(
                conn, "entity_batch", "batch_applied",
                json.dumps({
                    "applied": applied, "failed": failed,
                    "files": sorted(files_changed)[:20],
                }),
            )
    # Cleanup old entries
    cleanup(max_age_hours=24)
    return applied, failed
async def entity_batch_cycle(conn, max_workers=None) -> tuple[int, int]:
    """Pipeline stage entry point. Called by teleo-pipeline.py's ingest stage.

    Args:
        conn: database connection, passed through to apply_batch for auditing.
        max_workers: accepted but unused — presumably kept for signature
            compatibility with other pipeline stages; verify against the
            stage dispatcher in teleo-pipeline.py.

    Returns:
        (applied, failed) from apply_batch.
    """
    return await apply_batch(conn)