teleo-codex/ops/pipeline-v2/lib/entity_batch.py
m3taversal 05d74d5e32 sync: import all VPS pipeline + diagnostics code as baseline
Imports 67 files from VPS (/opt/teleo-eval/) into repo as the single source
of truth. Previously only 8 of 67 files existed in repo — the rest were
deployed directly to VPS via SCP, causing massive drift.

Includes:
- pipeline/lib/: 33 Python modules (daemon core, extraction, evaluation, merge, cascade, cross-domain, costs, attribution, etc.)
- pipeline/: main daemon (teleo-pipeline.py), reweave.py, batch-extract-50.sh
- diagnostics/: 19 files (4-page dashboard, alerting, daily digest, review queue, tier1 metrics)
- agent-state/: bootstrap, lib-state, cascade inbox processor, schema
- systemd/: service unit files for reference
- deploy.sh: rsync-based deploy with --dry-run, syntax checks, dirty-tree gate
- research-session.sh: updated with Step 8.5 digest + cascade inbox processing

No new code written — all files are exact copies from VPS as of 2026-04-06.
From this point forward: edit in repo, commit, then deploy.sh.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-07 00:00:00 +01:00

358 lines
13 KiB
Python

"""Entity batch processor — applies queued entity operations to main.
Reads from entity_queue, applies creates/updates to the main worktree,
commits directly to main. No PR needed for entity timeline appends —
they're factual, commutative, and low-risk.
Entity creates (new entity files) go through PR review like claims.
Entity updates (timeline appends) commit directly — they're additive
and recoverable from source archives if wrong.
Runs as part of the pipeline's ingest stage or as a standalone cron.
Epimetheus owns this module. Leo reviews changes. Rhea deploys.
"""
import asyncio
import json
import logging
import os
import re
from datetime import date
from pathlib import Path
from . import config, db
from .entity_queue import cleanup, dequeue, mark_failed, mark_processed
logger = logging.getLogger("pipeline.entity_batch")
def _read_file(path: str) -> str:
try:
with open(path) as f:
return f.read()
except FileNotFoundError:
return ""
async def _git(*args, cwd: str | None = None, timeout: int = 60) -> tuple[int, str]:
    """Run a git command asynchronously.

    Args:
        *args: arguments passed to ``git`` (e.g. ``"push", "origin", "main"``).
        cwd: working directory; defaults to ``config.MAIN_WORKTREE``.
        timeout: seconds to wait for the command before killing it.

    Returns:
        (returncode, combined stdout+stderr text). On timeout the returncode
        is -1 and the message names the subcommand that timed out.
    """
    proc = await asyncio.create_subprocess_exec(
        "git", *args,
        cwd=cwd or str(config.MAIN_WORKTREE),
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE,
    )
    try:
        stdout, stderr = await asyncio.wait_for(proc.communicate(), timeout=timeout)
    except asyncio.TimeoutError:
        proc.kill()
        await proc.wait()  # reap the killed process so it doesn't linger as a zombie
        return -1, f"git {args[0]} timed out after {timeout}s"
    # errors="replace": git can emit non-UTF-8 bytes (locale messages, raw
    # filenames); a UnicodeDecodeError here would mask the command's real result.
    output = (stdout or b"").decode(errors="replace").strip()
    if stderr:
        output += "\n" + stderr.decode(errors="replace").strip()
    return proc.returncode, output
def _apply_timeline_entry(entity_path: str, timeline_entry: str) -> tuple[bool, str]:
"""Append a timeline entry to an existing entity file.
Returns (success, message).
"""
if not os.path.exists(entity_path):
return False, f"entity file not found: {entity_path}"
content = _read_file(entity_path)
if not content:
return False, f"entity file empty: {entity_path}"
# Check for duplicate timeline entry
if timeline_entry.strip() in content:
return False, "duplicate timeline entry"
# Find or create Timeline section
if "## Timeline" in content:
lines = content.split("\n")
insert_idx = len(lines)
in_timeline = False
for i, line in enumerate(lines):
if line.strip().startswith("## Timeline"):
in_timeline = True
continue
if in_timeline and line.strip().startswith("## "):
insert_idx = i
break
lines.insert(insert_idx, timeline_entry)
updated = "\n".join(lines)
else:
updated = content.rstrip() + "\n\n## Timeline\n\n" + timeline_entry + "\n"
with open(entity_path, "w") as f:
f.write(updated)
return True, "timeline entry appended"
def _apply_claim_enrichment(claim_path: str, evidence: str, pr_number: int,
original_title: str, similarity: float) -> tuple[bool, str]:
"""Append auto-enrichment evidence to an existing claim file.
Used for near-duplicate auto-conversion. (Ganymede: route through entity_batch)
"""
if not os.path.exists(claim_path):
return False, f"target claim not found: {claim_path}"
content = _read_file(claim_path)
if not content:
return False, f"target claim empty: {claim_path}"
# Dedup: skip if this PR already enriched this claim (idempotency)
if f"PR #{pr_number}" in content:
return False, f"already enriched by PR #{pr_number}"
enrichment_block = (
f"\n\n### Auto-enrichment (near-duplicate conversion, similarity={similarity:.2f})\n"
f"*Source: PR #{pr_number}\"{original_title}\"*\n"
f"*Auto-converted by substantive fixer. Review: revert if this evidence doesn't belong here.*\n\n"
f"{evidence}\n"
)
if "\n---\n" in content:
parts = content.rsplit("\n---\n", 1)
updated = parts[0] + enrichment_block + "\n---\n" + parts[1]
else:
updated = content + enrichment_block
with open(claim_path, "w") as f:
f.write(updated)
return True, "enrichment appended"
def _apply_entity_create(entity_path: str, content: str) -> tuple[bool, str]:
"""Create a new entity file. Returns (success, message)."""
if os.path.exists(entity_path):
return False, f"entity already exists: {entity_path}"
os.makedirs(os.path.dirname(entity_path), exist_ok=True)
with open(entity_path, "w") as f:
f.write(content)
return True, "entity created"
async def apply_batch(conn=None, max_entries: int = 50) -> tuple[int, int]:
    """Process the entity queue. Returns (applied, failed).

    1. Pull latest main (checkout + fetch + hard reset to origin/main)
    2. Read pending queue entries
    3. Apply each operation to the main worktree
    4. Commit all changes in one batch commit
    5. Push to origin (pull-rebase + retry); only after a successful push are
       entries marked processed, so a failed push leaves them queued for retry

    Args:
        conn: optional DB connection, used only for audit logging.
        max_entries: maximum number of queue entries to process in one batch.

    Returns:
        (applied, failed). On fetch/reset or push failure the applied count is
        reported as 0 — the worktree is reset and nothing was marked processed.
    """
    main_wt = str(config.MAIN_WORKTREE)
    # Ensure we're on main branch — batch script may have left worktree on an
    # extract branch. Return code deliberately ignored: the fetch/reset below
    # will surface any real problem.
    await _git("checkout", "main", cwd=main_wt)
    # Pull latest main. Hard reset (not merge) so the worktree exactly mirrors
    # origin/main before we write into it.
    rc, out = await _git("fetch", "origin", "main", cwd=main_wt)
    if rc != 0:
        logger.error("Failed to fetch main: %s", out)
        return 0, 0
    rc, out = await _git("reset", "--hard", "origin/main", cwd=main_wt)
    if rc != 0:
        logger.error("Failed to reset main: %s", out)
        return 0, 0
    # Read queue
    entries = dequeue(limit=max_entries)
    if not entries:
        return 0, 0
    logger.info("Processing %d entity queue entries", len(entries))
    applied_entries: list[dict] = []  # Track for post-push marking (Ganymede review)
    failed = 0
    files_changed: set[str] = set()  # repo-relative paths to stage for the batch commit
    for entry in entries:
        # Handle enrichments (from substantive fixer near-duplicate conversion)
        if entry.get("type") == "enrichment":
            target = entry.get("target_claim", "")
            evidence = entry.get("evidence", "")
            domain = entry.get("domain", "")
            if not target or not evidence:
                mark_failed(entry, "enrichment missing target or evidence")
                failed += 1
                continue
            # basename() strips any directory components the queue entry
            # carried — the claim always lives directly under domains/<domain>/
            claim_path = os.path.join(main_wt, "domains", domain, os.path.basename(target))
            rel_path = os.path.join("domains", domain, os.path.basename(target))
            try:
                ok, msg = _apply_claim_enrichment(
                    claim_path, evidence, entry.get("pr_number", 0),
                    entry.get("original_title", ""), entry.get("similarity", 0),
                )
                if ok:
                    files_changed.add(rel_path)
                    applied_entries.append(entry)
                    logger.info("Applied enrichment to %s: %s", target, msg)
                else:
                    mark_failed(entry, msg)
                    failed += 1
            except Exception as e:
                logger.exception("Failed enrichment on %s", target)
                mark_failed(entry, str(e))
                failed += 1
            continue
        # Handle entity operations
        entity = entry.get("entity", {})
        filename = entity.get("filename", "")
        domain = entity.get("domain", "")
        action = entity.get("action", "")
        if not filename or not domain:
            mark_failed(entry, "missing filename or domain")
            failed += 1
            continue
        # Sanitize filename — prevent path traversal (Ganymede review)
        filename = os.path.basename(filename)
        entity_dir = os.path.join(main_wt, "entities", domain)
        entity_path = os.path.join(entity_dir, filename)
        rel_path = os.path.join("entities", domain, filename)
        try:
            if action == "update":
                timeline = entity.get("timeline_entry", "")
                if not timeline:
                    mark_failed(entry, "update with no timeline_entry")
                    failed += 1
                    continue
                ok, msg = _apply_timeline_entry(entity_path, timeline)
                if ok:
                    files_changed.add(rel_path)
                    applied_entries.append(entry)
                    logger.debug("Applied update to %s: %s", filename, msg)
                else:
                    mark_failed(entry, msg)
                    failed += 1
            elif action == "create":
                content = entity.get("content", "")
                if not content:
                    mark_failed(entry, "create with no content")
                    failed += 1
                    continue
                # If entity already exists, try to apply as timeline update instead
                if os.path.exists(entity_path):
                    timeline = entity.get("timeline_entry", "")
                    if timeline:
                        ok, msg = _apply_timeline_entry(entity_path, timeline)
                        if ok:
                            files_changed.add(rel_path)
                            applied_entries.append(entry)
                        else:
                            mark_failed(entry, f"create→update fallback: {msg}")
                            failed += 1
                    else:
                        mark_failed(entry, "entity exists, no timeline to append")
                        failed += 1
                    continue
                ok, msg = _apply_entity_create(entity_path, content)
                if ok:
                    files_changed.add(rel_path)
                    applied_entries.append(entry)
                    logger.debug("Created entity %s", filename)
                else:
                    mark_failed(entry, msg)
                    failed += 1
            else:
                mark_failed(entry, f"unknown action: {action}")
                failed += 1
        except Exception as e:
            logger.exception("Failed to apply entity %s", filename)
            mark_failed(entry, str(e))
            failed += 1
    applied = len(applied_entries)
    # Commit and push if any files changed
    if files_changed:
        # Stage changed files
        for f in files_changed:
            await _git("add", f, cwd=main_wt)
        # Commit
        commit_msg = (
            f"entity-batch: update {len(files_changed)} entities\n\n"
            f"- Applied {applied} entity operations from queue\n"
            f"- Files: {', '.join(sorted(files_changed)[:10])}"
            f"{'...' if len(files_changed) > 10 else ''}\n\n"
            f"Pentagon-Agent: Epimetheus <968B2991-E2DF-4006-B962-F5B0A0CC8ACA>"
        )
        rc, out = await _git("commit", "-m", commit_msg, cwd=main_wt)
        if rc != 0:
            logger.error("Entity batch commit failed: %s", out)
            return applied, failed
        # Push with retry — main advances frequently from merge module.
        # Pull-rebase before each attempt to catch up with remote.
        push_ok = False
        for attempt in range(3):
            # Always pull-rebase before pushing to catch up with remote main
            rc, out = await _git("pull", "--rebase", "origin", "main", cwd=main_wt, timeout=30)
            if rc != 0:
                # Rebase conflict or fetch failure: abandon the batch commit
                # entirely, restore a clean origin/main worktree, and report
                # everything as failed so the queue retries next cycle.
                logger.warning("Entity batch pull-rebase failed (attempt %d): %s", attempt + 1, out)
                await _git("rebase", "--abort", cwd=main_wt)
                await _git("reset", "--hard", "origin/main", cwd=main_wt)
                return 0, failed + applied
            rc, out = await _git("push", "origin", "main", cwd=main_wt, timeout=30)
            if rc == 0:
                push_ok = True
                break
            logger.warning("Entity batch push failed (attempt %d), retrying: %s", attempt + 1, out[:100])
            await asyncio.sleep(2)  # Brief pause before retry
        if not push_ok:
            # Same recovery as the rebase failure: drop the local commit so the
            # next run starts from a clean mirror of origin/main.
            logger.error("Entity batch push failed after 3 attempts")
            await _git("reset", "--hard", "origin/main", cwd=main_wt)
            return 0, failed + applied
        # Push succeeded — NOW mark entries as processed (Ganymede review).
        # Marking before the push would lose entries if the push failed.
        for entry in applied_entries:
            mark_processed(entry)
        logger.info(
            "Entity batch: committed %d file changes (%d applied, %d failed)",
            len(files_changed), applied, failed,
        )
        # Audit
        if conn:
            db.audit(
                conn, "entity_batch", "batch_applied",
                json.dumps({
                    "applied": applied, "failed": failed,
                    "files": sorted(files_changed)[:20],
                }),
            )
    # Cleanup old entries
    cleanup(max_age_hours=24)
    return applied, failed
async def entity_batch_cycle(conn, max_workers=None) -> tuple[int, int]:
    """Pipeline stage entry point. Called by teleo-pipeline.py's ingest stage.

    Args:
        conn: database connection, passed through to apply_batch for auditing.
        max_workers: accepted but unused — presumably kept for signature
            compatibility with other pipeline stages; verify against the
            stage dispatcher in teleo-pipeline.py.

    Returns:
        (applied, failed) from apply_batch.
    """
    return await apply_batch(conn)