teleo-codex/ops/pipeline-v2/lib/cross_domain.py
m3taversal 2c0d428dc0 Add Phase 1+2 instrumentation: review records, cascade automation, cross-domain index, agent state
Phase 1 — Audit logging infrastructure:
- review_records table (migration v12) capturing every eval verdict with outcome, rejection reason, disagreement type
- Cascade automation: auto-flag dependent beliefs/positions when merged claims change
- Merge frontmatter stamps: last_review metadata on merged claim files

Phase 2 — Cross-domain and state tracking:
- Cross-domain citation index: entity overlap detection across domains on every merge
- Agent-state schema v1: file-backed state for VPS agents (memory, tasks, inbox, metrics)
- Cascade completion tracking: process-cascade-inbox.py logs review outcomes
- research-session.sh: state hooks + cascade processing integration

All changes are live on VPS. This commit brings the code under version control for review.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-02 10:50:49 +00:00

230 lines
7.9 KiB
Python

"""Cross-domain citation index — detect entity overlap across domains.
Hook point: called from merge.py after cascade_after_merge.
After a claim merges, checks if its referenced entities also appear in claims
from other domains. Logs connections to audit_log for silo detection.
Two detection methods:
1. Entity name matching — entity names appearing in claim body text (word-boundary)
2. Source overlap — claims citing the same source archive files
At ~600 claims and ~100 entities, full scan per merge takes <1 second.
"""
import asyncio
import json
import logging
import os
import re
from pathlib import Path
logger = logging.getLogger("pipeline.cross_domain")
# Minimum entity name length to avoid false positives (ORE, QCX, etc)
MIN_ENTITY_NAME_LEN = 4
# Entity names that are common English words — skip to avoid false positives
ENTITY_STOPLIST = {"versus", "island", "loyal", "saber", "nebula", "helium", "coal", "snapshot", "dropout"}
def _build_entity_names(worktree: Path) -> dict[str, str]:
    """Map entity slug (filename stem) -> display name, read from entity files.

    Scans ``entities/**/*.md`` under *worktree* for the first ``name:`` line
    in each file. Names shorter than MIN_ENTITY_NAME_LEN or present in
    ENTITY_STOPLIST are dropped to limit false-positive text matches.
    Unreadable files and underscore-prefixed files are skipped.
    """
    result: dict[str, str] = {}
    entity_dir = worktree / "entities"
    if not entity_dir.exists():
        return result
    for path in entity_dir.rglob("*.md"):
        if path.name.startswith("_"):
            continue
        try:
            text = path.read_text(encoding="utf-8")
        except (OSError, UnicodeDecodeError):
            continue
        for raw_line in text.split("\n"):
            if not raw_line.startswith("name:"):
                continue
            display = raw_line.split(":", 1)[1].strip().strip('"').strip("'")
            if len(display) >= MIN_ENTITY_NAME_LEN and display.lower() not in ENTITY_STOPLIST:
                result[path.stem] = display
            # Only the first name: line per file is considered.
            break
    return result
def _compile_entity_patterns(entity_names: dict[str, str]) -> dict[str, re.Pattern]:
"""Pre-compile word-boundary regex for each entity name."""
patterns = {}
for slug, name in entity_names.items():
try:
patterns[slug] = re.compile(r'\b' + re.escape(name) + r'\b', re.IGNORECASE)
except re.error:
continue
return patterns
def _extract_source_refs(content: str) -> set[str]:
"""Extract source archive references ([[YYYY-MM-DD-...]]) from content."""
return set(re.findall(r"\[\[(20\d{2}-\d{2}-\d{2}-[^\]]+)\]\]", content))
def _find_entity_mentions(content: str, patterns: dict[str, re.Pattern]) -> set[str]:
"""Find entity slugs whose names appear in the content (word-boundary match)."""
found = set()
for slug, pat in patterns.items():
if pat.search(content):
found.add(slug)
return found
def _scan_domain_claims(worktree: Path, patterns: dict[str, re.Pattern]) -> dict[str, list[dict]]:
    """Index every claim under ``domains/``: domain name -> list of claim records.

    Each record holds the claim slug plus the entity slugs and source refs
    found in its body. Underscore-prefixed files and the per-domain
    directory.md index are skipped, as are files that fail to read as UTF-8.
    Returns an empty mapping when the domains directory is absent.
    """
    index: dict[str, list[dict]] = {}
    domains_dir = worktree / "domains"
    if not domains_dir.exists():
        return index
    for domain_dir in domains_dir.iterdir():
        if not domain_dir.is_dir():
            continue
        records: list[dict] = []
        for path in domain_dir.glob("*.md"):
            if path.name.startswith("_") or path.name == "directory.md":
                continue
            try:
                body = path.read_text(encoding="utf-8")
            except (OSError, UnicodeDecodeError):
                continue
            records.append({
                "slug": path.stem,
                "entities": _find_entity_mentions(body, patterns),
                "sources": _extract_source_refs(body),
            })
        index[domain_dir.name] = records
    return index
async def cross_domain_after_merge(
    main_sha: str,
    branch_sha: str,
    pr_num: int,
    main_worktree: Path,
    conn=None,
) -> int:
    """Detect cross-domain entity/source overlap for claims changed in this merge.

    Args:
        main_sha: Diff base commit (main before the merge).
        branch_sha: Diff target commit (merged branch head).
        pr_num: PR number, recorded in the audit payload and log lines.
        main_worktree: Checkout root containing ``entities/`` and ``domains/``.
        conn: Optional DB connection exposing ``execute``; when provided,
            findings are inserted into audit_log. When None, detection still
            runs but nothing is persisted.

    Returns:
        The number of cross-domain connections found; 0 on any git failure
        (timeout or non-zero exit) or when there is nothing to compare.
    """
    # 1. Ask git which files the merge touched (added/copied/modified/renamed).
    proc = await asyncio.create_subprocess_exec(
        "git", "diff", "--name-only", "--diff-filter=ACMR",
        main_sha, branch_sha,
        cwd=str(main_worktree),
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE,
    )
    try:
        stdout, _ = await asyncio.wait_for(proc.communicate(), timeout=10)
    except asyncio.TimeoutError:
        proc.kill()
        await proc.wait()
        logger.warning("cross_domain: git diff timed out")
        return 0
    if proc.returncode != 0:
        # Non-fatal: bad SHAs or repo state — skip detection for this merge.
        return 0
    diff_files = [f for f in stdout.decode().strip().split("\n") if f]
    # 2. Keep only claim files: domains/<domain>/<slug>.md, excluding
    #    underscore-prefixed files and the per-domain directory.md index.
    changed_claims = []
    for fpath in diff_files:
        if not fpath.endswith(".md") or not fpath.startswith("domains/"):
            continue
        parts = fpath.split("/")
        if len(parts) < 3:
            continue
        basename = os.path.basename(fpath)
        if basename.startswith("_") or basename == "directory.md":
            continue
        changed_claims.append({"path": fpath, "domain": parts[1], "slug": Path(basename).stem})
    if not changed_claims:
        return 0
    # 3. Build entity patterns and index every claim in every domain.
    entity_names = _build_entity_names(main_worktree)
    if not entity_names:
        return 0
    patterns = _compile_entity_patterns(entity_names)
    domain_claims = _scan_domain_claims(main_worktree, patterns)
    # 4. For each changed claim, find overlapping claims in OTHER domains.
    total_connections = 0
    all_connections = []
    for claim in changed_claims:
        claim_path = main_worktree / claim["path"]
        try:
            content = claim_path.read_text(encoding="utf-8")
        except (OSError, UnicodeDecodeError):
            continue
        my_entities = _find_entity_mentions(content, patterns)
        my_sources = _extract_source_refs(content)
        if not my_entities and not my_sources:
            continue
        connections = []
        for other_domain, other_claims in domain_claims.items():
            if other_domain == claim["domain"]:
                continue  # only cross-domain links count
            for other in other_claims:
                shared_entities = my_entities & other["entities"]
                shared_sources = my_sources & other["sources"]
                # Threshold: >=2 shared entities, OR 1 entity + 1 source.
                entity_count = len(shared_entities)
                source_count = len(shared_sources)
                if entity_count >= 2 or (entity_count >= 1 and source_count >= 1):
                    connections.append({
                        "other_claim": other["slug"],
                        "other_domain": other_domain,
                        # Cap list sizes to keep the audit payload bounded.
                        "shared_entities": sorted(shared_entities)[:5],
                        "shared_sources": sorted(shared_sources)[:3],
                    })
        if connections:
            total_connections += len(connections)
            all_connections.append({
                "claim": claim["slug"],
                "domain": claim["domain"],
                "connections": connections[:10],
            })
            logger.info(
                "cross_domain: %s (%s) has %d cross-domain connections",
                claim["slug"], claim["domain"], len(connections),
            )
    # 5. Persist findings to audit_log (best-effort — detection result is
    #    returned regardless of whether the write succeeds).
    if all_connections and conn is not None:
        try:
            conn.execute(
                "INSERT INTO audit_log (stage, event, detail) VALUES (?, ?, ?)",
                ("cross_domain", "connections_found", json.dumps({
                    "pr": pr_num,
                    "total_connections": total_connections,
                    "claims_with_connections": len(all_connections),
                    "details": all_connections[:10],
                })),
            )
        except Exception:
            logger.exception("cross_domain: audit_log write failed (non-fatal)")
    if total_connections:
        # FIX: original format string "PR #%d%d connections" fused the PR
        # number and the count together (e.g. PR 5 with 12 connections
        # rendered as "PR #512 connections"). Separator added.
        logger.info(
            "cross_domain: PR #%d: %d connections across %d claims",
            pr_num, total_connections, len(all_connections),
        )
    return total_connections