Pipeline reliability (8 fixes, reviewed by Ganymede+Rhea+Leo+Rio):
1. Merge API recovery — pre-flight approval check, transient/permanent distinction, jitter
2. Ghost PR detection — ls-remote branch check in reconciliation, network guard
3. Source status contract — directory IS status, no code change needed
4. Batch-state markers eliminated — two-gate skip (archive-check + batched branch-check)
5. Branch SHA tracking — batched ls-remote, auto-reset verdicts, dismiss stale reviews
6. Mirror pre-flight permissions — chown check in sync-mirror.sh
7. Telegram archive commit-after-write — git add/commit/push with rebase --abort fallback
8. Post-merge source archiving — queue/ → archive/{domain}/ after merge
Pipeline fixes:
- merge_cycled flag — eval attempts preserved during merge-failure cycling (Ganymede+Rhea)
- merge_failures diagnostic counter
- Startup recovery preserves eval_attempts (was incorrectly resetting to 0)
- No-diff PRs auto-closed by eval (root cause of 17 zombie PRs)
- GC threshold aligned with substantive fixer budget (was 2, now 4)
- Conflict retry with 3-attempt budget + permanent conflict handler
- Local ff-merge fallback for Forgejo 405 errors
Telegram bot:
- KB retrieval: 3-layer (entity resolution → claim search → agent context)
- Reply-to-bot handler (context.bot.id check)
- Tag regex: @teleo|@futairdbot
- Prompt rewrite for natural analyst voice
- Market data API integration (Ben's token price endpoint)
- Conversation windows (5-message unanswered counter, per-user-per-chat)
- Conversation history in prompt (last 5 exchanges)
- Worktree file lock for archive writes
Infrastructure:
- worktree_lock.py — file-based lock (flock) for main worktree coordination
- backfill-sources.py — source DB registration for Argus funnel
- batch-extract-50.sh v3 — two-gate skip, batched ls-remote, network guard
- sync-mirror.sh — auto-PR creation for mirrored GitHub branches, permission pre-flight
- Argus dashboard — conflicts + reviewing in backlog, queue count in funnel
- Enrichment-inside-frontmatter bug fix (regex anchor, not --- split)
Pentagon-Agent: Epimetheus <3D35839A-7722-4740-B93D-51157F7D5E70>
206 lines
6.8 KiB
Python
"""Entity enrichment queue — decouple entity writes from extraction branches.
|
|
|
|
Problem: Entity updates on extraction branches cause merge conflicts because
|
|
multiple extraction branches modify the same entity file (e.g., metadao.md).
|
|
83% of near_duplicate false positives come from entity file modifications.
|
|
|
|
Solution: Extraction writes entity operations to a JSON queue file on the VPS.
|
|
A separate batch process reads the queue and applies operations to main.
|
|
Entity operations are commutative (timeline appends are order-independent),
|
|
so parallel extractions never conflict.
|
|
|
|
Flow:
|
|
1. openrouter-extract-v2.py → entity_queue.enqueue() instead of direct file writes
|
|
2. entity_batch.py (cron or pipeline stage) → entity_queue.dequeue() + apply to main
|
|
3. Commit entity changes to main directly (no PR needed for timeline appends)
|
|
|
|
Epimetheus owns this module. Leo reviews changes.
|
|
"""
import json
import logging
import os
import time
from datetime import date, datetime, timezone
from pathlib import Path
# Module-level logger, namespaced under the pipeline logging hierarchy.
logger = logging.getLogger("pipeline.entity_queue")

# Default queue location (VPS); override with the ENTITY_QUEUE_DIR env var.
DEFAULT_QUEUE_DIR = "/opt/teleo-eval/entity-queue"
def _queue_dir() -> Path:
    """Return the queue directory as a Path, creating it on first use.

    The location comes from the ENTITY_QUEUE_DIR environment variable when
    set, falling back to DEFAULT_QUEUE_DIR.
    """
    queue_root = Path(os.environ.get("ENTITY_QUEUE_DIR", DEFAULT_QUEUE_DIR))
    queue_root.mkdir(parents=True, exist_ok=True)
    return queue_root
def enqueue(entity: dict, source_file: str, agent: str) -> str:
    """Add an entity operation to the queue. Returns the queue entry ID.

    Args:
        entity: dict with keys: filename, domain, action (create|update),
            entity_type, content (for creates), timeline_entry (for updates)
        source_file: path to the source that produced this entity
        agent: agent name performing extraction

    Returns:
        Queue entry ID (also the queue filename stem, for tracking)

    Raises:
        ValueError: if entity dict is missing required fields or has invalid action
    """
    # Validate required fields (Ganymede review)
    for field in ("filename", "domain", "action"):
        if not entity.get(field):
            raise ValueError(f"Entity missing required field: {field}")
    if entity["action"] not in ("create", "update"):
        raise ValueError(f"Invalid entity action: {entity['action']}")

    # Sanitize filename — prevent path traversal (Ganymede review)
    entity["filename"] = os.path.basename(entity["filename"])

    # Millisecond timestamp prefix keeps lexicographic filename sort chronological,
    # which dequeue() relies on for oldest-first ordering.
    entry_id = f"{int(time.time() * 1000)}-{entity['filename'].replace('.md', '')}"
    entry = {
        "id": entry_id,
        "entity": entity,
        "source_file": os.path.basename(source_file),
        "agent": agent,
        # Timezone-aware UTC timestamp (was an inline __import__('datetime') hack).
        "enqueued_at": datetime.now(tz=timezone.utc).isoformat(),
        "status": "pending",
    }

    queue_file = _queue_dir() / f"{entry_id}.json"
    with open(queue_file, "w") as f:
        json.dump(entry, f, indent=2)

    logger.info("Enqueued entity operation: %s (%s)", entity["filename"], entity.get("action", "?"))
    return entry_id
def dequeue(limit: int = 50) -> list[dict]:
    """Read pending queue entries, oldest first. Returns list of entry dicts.

    Ordering relies on entry filenames starting with a millisecond timestamp,
    so a plain lexicographic sort is chronological.

    Does NOT remove entries — caller marks them processed after successful apply.

    Args:
        limit: maximum number of pending entries to return.
    """
    qdir = _queue_dir()
    entries: list[dict] = []

    for f in sorted(qdir.glob("*.json")):
        try:
            with open(f) as fh:
                entry = json.load(fh)
            if entry.get("status") == "pending":
                # Internal tracking field so mark_processed() can find the file.
                entry["_queue_path"] = str(f)
                entries.append(entry)
                if len(entries) >= limit:
                    break
        except (OSError, json.JSONDecodeError, KeyError) as e:
            # OSError: the file may vanish between glob() and open() when
            # cleanup() runs concurrently — skip it instead of crashing the batch.
            logger.warning("Skipping malformed queue entry %s: %s", f.name, e)

    return entries
def mark_processed(entry: dict, result: str = "applied"):
    """Mark a queue entry as processed (or failed).

    Args:
        entry: queue entry dict as returned by dequeue() (must carry the
            internal "_queue_path" key; silently no-ops otherwise).
        result: terminal status to record, e.g. "applied" or "failed".

    Uses atomic write (tmp + rename) to prevent race conditions. (Ganymede review)
    """
    queue_path = entry.get("_queue_path")
    if not queue_path or not os.path.exists(queue_path):
        # Entry was never persisted or has already been cleaned up — nothing to do.
        return

    entry["status"] = result
    entry["processed_at"] = datetime.now(tz=timezone.utc).isoformat()
    # Remove internal tracking field before writing
    entry.pop("_queue_path", None)

    # Atomic write: tmp file + replace (Ganymede review — prevents race condition).
    # os.replace (not os.rename) is atomic and overwrite-safe on all platforms.
    tmp_path = queue_path + ".tmp"
    with open(tmp_path, "w") as f:
        json.dump(entry, f, indent=2)
    os.replace(tmp_path, queue_path)
def mark_failed(entry: dict, error: str):
    """Record *error* on the entry, then finalize it with status "failed"."""
    entry.update(last_error=error)
    mark_processed(entry, result="failed")
def queue_enrichment(
    target_claim: str,
    evidence: str,
    pr_number: int,
    original_title: str,
    similarity: float,
    domain: str,
) -> str:
    """Queue an enrichment for an existing claim. Applied by entity_batch alongside entity updates.

    Used by the substantive fixer for near-duplicate auto-conversion.
    Single writer pattern — avoids race conditions with direct main writes. (Ganymede)

    Args:
        target_claim: path of the existing claim file to enrich.
        evidence: evidence text for the enrichment.
        pr_number: PR number that produced the near-duplicate.
        original_title: title of the duplicate claim being converted.
        similarity: similarity score that triggered the conversion.
        domain: knowledge-base domain of the claim.

    Returns:
        Queue entry ID (for tracking).
    """
    # Millisecond timestamp prefix keeps lexicographic filename sort chronological.
    entry_id = f"{int(time.time() * 1000)}-enrichment-{os.path.basename(target_claim).replace('.md', '')}"
    entry = {
        "id": entry_id,
        "type": "enrichment",
        "target_claim": target_claim,
        "evidence": evidence,
        "pr_number": pr_number,
        "original_title": original_title,
        "similarity": similarity,
        "domain": domain,
        # Timezone-aware UTC timestamp (was an inline __import__('datetime') hack).
        "enqueued_at": datetime.now(tz=timezone.utc).isoformat(),
        "status": "pending",
    }

    queue_file = _queue_dir() / f"{entry_id}.json"
    with open(queue_file, "w") as f:
        json.dump(entry, f, indent=2)

    logger.info("Enqueued enrichment: PR #%d → %s (sim=%.2f)", pr_number, target_claim, similarity)
    return entry_id
def cleanup(max_age_hours: int = 24):
    """Remove processed/failed entries older than max_age_hours.

    Args:
        max_age_hours: minimum age (by file mtime) before a terminal entry
            is deleted.

    Returns:
        Number of entries removed.
    """
    qdir = _queue_dir()
    cutoff = time.time() - (max_age_hours * 3600)
    removed = 0

    for f in qdir.glob("*.json"):
        try:
            with open(f) as fh:
                entry = json.load(fh)
            # Only terminal states are eligible; pending entries are never GC'd.
            if entry.get("status") in ("applied", "failed"):
                if f.stat().st_mtime < cutoff:
                    f.unlink()
                    removed += 1
        except (OSError, json.JSONDecodeError) as e:
            # Best-effort GC: a concurrently-removed or malformed file is skipped,
            # but logged so persistent garbage stays visible (was a silent bare pass).
            logger.debug("cleanup: skipping %s: %s", f.name, e)

    if removed:
        logger.info("Cleaned up %d old queue entries", removed)
    return removed
def queue_stats() -> dict:
    """Get queue statistics for health monitoring.

    Returns:
        dict mapping each observed status to its count, plus "total", e.g.
        {"pending": 3, "applied": 10, "failed": 1, "total": 14}. The three
        known statuses are always present (zero-filled).
    """
    qdir = _queue_dir()
    stats = {"pending": 0, "applied": 0, "failed": 0, "total": 0}

    for f in qdir.glob("*.json"):
        try:
            with open(f) as fh:
                entry = json.load(fh)
        except (OSError, json.JSONDecodeError) as e:
            # Unreadable entries are excluded from counts but surfaced at debug level
            # (was a silent bare pass that hid corruption).
            logger.debug("queue_stats: skipping %s: %s", f.name, e)
            continue
        status = entry.get("status", "unknown")
        stats[status] = stats.get(status, 0) + 1
        stats["total"] += 1

    return stats