teleo-codex/ops/pipeline-v2/lib/extract.py
m3taversal 9925576c13 ship: add contributor attribution tracing to PR lifecycle
- Migration v19: submitted_by column on prs + sources tables
- extract.py: propagates proposed_by from source frontmatter → PR record
- merge.py: sets submitted_by from Forgejo author for human PRs
- dashboard_prs.py: redesigned with Contributor column, improved claim
  visibility in expanded rows, cost estimates, evaluator chain display
- dashboard_routes.py: submitted_by + source_path in pr-lifecycle API
- backfill_submitted_by.py: one-time backfill (1525/1777 PRs matched)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-07 14:56:03 +00:00

799 lines
28 KiB
Python

"""Extraction stage — automated claim extraction from queued sources.
Replaces extract-cron.sh with a Python module inside the pipeline daemon.
Processes unprocessed sources in inbox/queue/, extracts claims via LLM,
creates PRs on Forgejo, and archives sources on main.
Flow per source:
1. Read source frontmatter (domain, author, rationale)
2. Pre-screen: Haiku identifies themes, Qdrant finds prior art
3. Build KB index for dedup
4. Build extraction prompt (extraction_prompt.py)
5. Call Sonnet via OpenRouter
6. Parse JSON response
7. Post-extraction validation (post_extract.py)
8. Create branch, write claim/entity files, commit, push
9. Create PR on Forgejo via agent token
10. Archive source on main (worktree lock)
Design: one source at a time (sequential), up to MAX_SOURCES per cycle.
Uses the main worktree for reading + archival, extract worktree for branches.
Epimetheus owns this module. Leo reviews changes.
"""
import asyncio
import json
import logging
import os
import re
import secrets
from datetime import date
from pathlib import Path
from . import config
from .costs import record_usage
from .domains import agent_for_domain
from .extraction_prompt import build_extraction_prompt
from .forgejo import api as forgejo_api
from .llm import openrouter_call
from .post_extract import load_existing_claims_from_repo, validate_and_fix_claims
from .worktree_lock import async_main_worktree_lock
# Module-level logger for the extraction stage.
logger = logging.getLogger("pipeline.extract")
# Extraction worktree (separate from main to avoid conflicts)
EXTRACT_WORKTREE = config.BASE_DIR / "workspaces" / "extract"
# Max sources per cycle (overridable via environment for ops tuning)
MAX_SOURCES = int(os.environ.get("MAX_EXTRACT_SOURCES", "3"))
# KB index cache (rebuilt once per cycle, not per source)
_kb_index_cache: dict[str, str] = {}
_kb_index_timestamp: float = 0  # epoch seconds of the last cache reset
KB_INDEX_TTL = 300 # 5 minutes
def _parse_source_frontmatter(content: str) -> dict:
"""Parse source file frontmatter. Returns dict of fields."""
if not content.startswith("---"):
return {}
end = content.find("---", 3)
if end == -1:
return {}
raw = content[3:end]
fm = {}
for line in raw.strip().split("\n"):
line = line.strip()
if not line or ":" not in line:
continue
key, _, val = line.partition(":")
key = key.strip()
val = val.strip().strip('"').strip("'")
if val.lower() == "null" or val == "":
val = None
fm[key] = val
return fm
def _get_kb_index(domain: str) -> str:
    """Return the KB index text for *domain*, via a TTL-bounded cache.

    Prefers a pre-generated /tmp/kb-indexes/<domain>.txt file; otherwise
    falls back to listing claim files under domains/<domain>/ in the main
    worktree. The whole cache is flushed once per TTL window.
    """
    import time
    global _kb_index_cache, _kb_index_timestamp

    now = time.time()
    # Expire the entire cache at once rather than per entry.
    if now - _kb_index_timestamp > KB_INDEX_TTL:
        _kb_index_cache.clear()
        _kb_index_timestamp = now

    cached = _kb_index_cache.get(domain)
    if cached is not None:
        return cached

    # Fast path: pre-generated index file.
    pregenerated = Path(f"/tmp/kb-indexes/{domain}.txt")
    if pregenerated.exists():
        text = pregenerated.read_text(encoding="utf-8")
    else:
        # Fallback: enumerate claim files directly from the repo,
        # skipping underscore-prefixed (non-claim) files.
        domain_root = config.MAIN_WORKTREE / "domains" / domain
        entries = sorted(
            f"- {path.name}"
            for path in (domain_root.glob("*.md") if domain_root.is_dir() else [])
            if not path.name.startswith("_")
        )
        text = f"## Claims in domains/{domain}/\n" + "\n".join(entries)

    _kb_index_cache[domain] = text
    return text
async def _git(*args, cwd: str | None = None, timeout: int = 60) -> tuple[int, str]:
    """Run a git command async. Returns (returncode, stdout+stderr).

    Args:
        *args: git subcommand and arguments (e.g. "checkout", "-b", branch).
        cwd: working directory; defaults to the extraction worktree.
        timeout: seconds to wait before killing the process.

    Returns:
        (returncode, combined stdout+stderr text). On timeout the process
        is killed and (-1, <timeout message>) is returned.
    """
    proc = await asyncio.create_subprocess_exec(
        "git", *args,
        cwd=cwd or str(EXTRACT_WORKTREE),
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE,
    )
    try:
        stdout, stderr = await asyncio.wait_for(proc.communicate(), timeout=timeout)
    except asyncio.TimeoutError:
        # Kill and reap the stuck process so it doesn't linger as a zombie.
        proc.kill()
        await proc.wait()
        return -1, f"git {args[0]} timed out after {timeout}s"
    # Fold stderr into the output so callers get one combined transcript.
    output = (stdout or b"").decode().strip()
    if stderr:
        output += "\n" + stderr.decode().strip()
    return proc.returncode, output
async def _pre_screen(source_content: str, source_title: str) -> str | None:
    """Run pre-screening: identify themes and find prior art.

    Uses pre_screen.identify_themes (Haiku, per the module docstring) to pull
    themes from the source, then searches each theme (plus the title) against
    Qdrant and formats sufficiently-similar hits as a bulleted list.

    Returns formatted prior art text, or None if pre-screening fails/unavailable.
    Non-fatal — extraction proceeds without prior art if this fails.
    """
    try:
        from .pre_screen import identify_themes, PRIOR_ART_THRESHOLD
        from .search import search
        key_file = config.SECRETS_DIR / "openrouter-key"
        if not key_file.exists():
            return None
        api_key = key_file.read_text().strip()
        themes = identify_themes(source_content, api_key, source_title)
        if not themes:
            return None
        # Search each theme against Qdrant
        results = []
        search_queries = themes + ([source_title] if source_title else [])
        # Cap at 5 queries to bound latency and search cost.
        for query in search_queries[:5]:
            try:
                hits = search(query, limit=3, score_threshold=PRIOR_ART_THRESHOLD)
                for hit in hits:
                    title = hit.get("title", hit.get("filename", ""))
                    score = hit.get("score", 0)
                    domain = hit.get("domain", "")
                    if title and score >= PRIOR_ART_THRESHOLD:
                        results.append(f"- [{score:.2f}] {title} (domain: {domain})")
            except Exception:
                # One failed query shouldn't sink the remaining queries.
                continue
        if not results:
            return None
        # Deduplicate while preserving first-seen order.
        seen = set()
        unique = []
        for r in results:
            if r not in seen:
                seen.add(r)
                unique.append(r)
        return "\n".join(unique[:15])
    except Exception:
        logger.debug("Pre-screening failed (non-fatal)", exc_info=True)
        return None
def _parse_extraction_json(text: str) -> dict | None:
"""Parse extraction JSON from LLM response. Handles markdown fencing."""
if not text:
return None
# Strip markdown code fences
text = text.strip()
if text.startswith("```"):
# Remove opening fence (```json or ```)
first_newline = text.index("\n") if "\n" in text else len(text)
text = text[first_newline + 1:]
if text.endswith("```"):
text = text[:-3]
text = text.strip()
try:
return json.loads(text)
except json.JSONDecodeError as e:
logger.warning("Failed to parse extraction JSON: %s", e)
# Try to find JSON object in text
match = re.search(r"\{[\s\S]+\}", text)
if match:
try:
return json.loads(match.group())
except json.JSONDecodeError:
pass
return None
def _build_claim_content(claim: dict, agent: str) -> str:
"""Build claim markdown file content from extraction JSON."""
today = date.today().isoformat()
domain = claim.get("domain", "")
title = claim.get("title", claim.get("filename", "").replace("-", " ").replace(".md", ""))
description = claim.get("description", "")
confidence = claim.get("confidence", "experimental")
source_ref = claim.get("source", "")
body = claim.get("body", "")
scope = claim.get("scope", "")
sourcer = claim.get("sourcer", "")
related = claim.get("related_claims", [])
lines = [
"---",
"type: claim",
f"domain: {domain}",
f'title: "{title}"',
f'description: "{description}"',
f"confidence: {confidence}",
f'source: "{source_ref}"',
f"created: {today}",
f"agent: {agent}",
]
if scope:
lines.append(f"scope: {scope}")
if sourcer:
lines.append(f'sourcer: "{sourcer}"')
if related:
lines.append("related_claims:")
for r in related:
lines.append(f' - "[[{r}]]"')
lines.append("---")
lines.append("")
lines.append(f"# {title}")
lines.append("")
if body:
lines.append(body)
lines.append("")
return "\n".join(lines)
def _build_entity_content(entity: dict, domain: str) -> str:
"""Build entity markdown file content from extraction JSON."""
today = date.today().isoformat()
entity_type = entity.get("entity_type", "company")
description = entity.get("content", "")
if description:
return description
name = entity.get("filename", "").replace("-", " ").replace(".md", "").title()
return f"""---
type: entity
entity_type: {entity_type}
domain: {domain}
description: ""
created: {today}
---
# {name}
## Timeline
{entity.get("timeline_entry", "")}
"""
async def _extract_one_source(
    conn,
    source_path: str,
    source_content: str,
    fm: dict,
    existing_claims: set[str],
    feedback: dict | None = None,
) -> tuple[int, int]:
    """Extract claims from a single source. Returns (succeeded, errors).

    Args:
        conn: sqlite connection used for usage accounting and PR attribution.
        source_path: repo-relative path of the source file.
        source_content: full text of the source file.
        fm: parsed source frontmatter (domain, title, rationale, ...).
        existing_claims: claim identifiers already in the KB, for dedup.
        feedback: optional evaluator feedback from a previous failed attempt,
            forwarded into the extraction prompt on re-extraction.
    """
    source_file = os.path.basename(source_path)
    domain = fm.get("domain", "")
    agent_name = agent_for_domain(domain)
    agent_lower = agent_name.lower()
    title = fm.get("title", source_file)
    rationale = fm.get("rationale")
    intake_tier = fm.get("intake_tier")
    proposed_by = fm.get("proposed_by")
    logger.info("Extracting: %s (domain: %s, agent: %s)", source_file, domain, agent_name)
    # 1. Pre-screen (non-fatal)
    prior_art = await _pre_screen(source_content, title)
    if prior_art:
        logger.info("Pre-screening found %d prior art items", prior_art.count("\n") + 1)
    # 2. Build KB index
    kb_index = _get_kb_index(domain)
    # 3. Build extraction prompt
    prompt = build_extraction_prompt(
        source_file=source_path,
        source_content=source_content,
        domain=domain,
        agent=agent_name,
        kb_index=kb_index,
        rationale=rationale,
        intake_tier=intake_tier,
        proposed_by=proposed_by,
        prior_art=prior_art,
        previous_feedback=feedback,
    )
    # 4. Call LLM (OpenRouter — not Claude Max CLI)
    # EXTRACT_MODEL is "sonnet" (CLI name), use MODEL_SONNET_OR for OpenRouter
    extract_model = config.MODEL_SONNET_OR
    response, usage = await openrouter_call(
        model=extract_model,
        prompt=prompt,
        timeout_sec=config.EXTRACT_TIMEOUT,
        max_tokens=8192,
    )
    # Record usage (best-effort; accounting failures never block extraction)
    try:
        record_usage(
            conn,
            model=extract_model,
            stage="extract",
            input_tokens=usage.get("prompt_tokens", 0),
            output_tokens=usage.get("completion_tokens", 0),
            backend="api",
        )
    except Exception:
        logger.debug("Failed to record extraction usage", exc_info=True)
    if not response:
        logger.error("LLM extraction failed for %s — no response", source_file)
        return 0, 1
    # 5. Parse JSON
    extraction = _parse_extraction_json(response)
    if not extraction:
        logger.error("Failed to parse extraction JSON for %s", source_file)
        return 0, 1
    claims_raw = extraction.get("claims", [])
    entities_raw = extraction.get("entities", [])
    enrichments = extraction.get("enrichments", [])
    decisions = extraction.get("decisions", [])
    facts = extraction.get("facts", [])
    notes = extraction.get("extraction_notes", "")
    logger.info(
        "Extraction result for %s: %d claims, %d enrichments, %d entities, %d decisions",
        source_file, len(claims_raw), len(enrichments), len(entities_raw), len(decisions),
    )
    # 6. Build claim file contents (entries without a filename are skipped)
    claim_files = []
    for c in claims_raw:
        filename = c.get("filename", "")
        if not filename:
            continue
        if not filename.endswith(".md"):
            filename += ".md"
        content = _build_claim_content(c, agent_lower)
        claim_files.append({"filename": filename, "domain": c.get("domain", domain), "content": content})
    # Build entity file contents (only "create" actions produce files here)
    entity_files = []
    for e in entities_raw:
        filename = e.get("filename", "")
        if not filename:
            continue
        if not filename.endswith(".md"):
            filename += ".md"
        action = e.get("action", "create")
        if action == "create":
            content = _build_entity_content(e, domain)
            entity_files.append({"filename": filename, "domain": domain, "content": content})
    # 7. Post-extraction validation
    if claim_files:
        kept_claims, rejected_claims, stats = validate_and_fix_claims(
            claim_files, domain, agent_lower, existing_claims,
            repo_root=str(config.MAIN_WORKTREE),
        )
        if rejected_claims:
            logger.info(
                "Post-extract rejected %d/%d claims for %s: %s",
                len(rejected_claims), len(claim_files), source_file,
                stats.get("rejections", [])[:5],
            )
        claim_files = kept_claims
    if not claim_files and not entity_files:
        logger.info("No valid claims/entities after validation for %s — archiving as null-result", source_file)
        await _archive_source(source_path, domain, "null-result")
        return 0, 0
    # 8. Create branch, write files, commit, push
    slug = Path(source_file).stem
    # Random 4-hex-char suffix keeps branch names unique across re-runs.
    branch = f"extract/{slug}-{secrets.token_hex(2)}"
    # Prepare extract worktree: sync to origin/main before branching.
    rc, _ = await _git("fetch", "origin", "main", cwd=str(EXTRACT_WORKTREE))
    rc, _ = await _git("checkout", "main", cwd=str(EXTRACT_WORKTREE))
    rc, _ = await _git("reset", "--hard", "origin/main", cwd=str(EXTRACT_WORKTREE))
    rc, _ = await _git("checkout", "-b", branch, cwd=str(EXTRACT_WORKTREE))
    if rc != 0:
        # Branch might already exist — delete it and retry once.
        await _git("branch", "-D", branch, cwd=str(EXTRACT_WORKTREE))
        rc, out = await _git("checkout", "-b", branch, cwd=str(EXTRACT_WORKTREE))
        if rc != 0:
            logger.error("Failed to create branch %s: %s", branch, out)
            return 0, 1
    # Write claim files
    worktree = EXTRACT_WORKTREE
    files_written = []
    for cf in claim_files:
        domain_dir = worktree / "domains" / cf["domain"]
        domain_dir.mkdir(parents=True, exist_ok=True)
        fpath = domain_dir / cf["filename"]
        fpath.write_text(cf["content"], encoding="utf-8")
        files_written.append(f"domains/{cf['domain']}/{cf['filename']}")
    for ef in entity_files:
        entity_dir = worktree / "entities" / domain
        entity_dir.mkdir(parents=True, exist_ok=True)
        fpath = entity_dir / ef["filename"]
        fpath.write_text(ef["content"], encoding="utf-8")
        files_written.append(f"entities/{domain}/{ef['filename']}")
    if not files_written:
        logger.info("No files written for %s — cleaning up", source_file)
        await _git("checkout", "main", cwd=str(EXTRACT_WORKTREE))
        await _git("branch", "-D", branch, cwd=str(EXTRACT_WORKTREE))
        await _archive_source(source_path, domain, "null-result")
        return 0, 0
    # Stage and commit
    for f in files_written:
        await _git("add", f, cwd=str(EXTRACT_WORKTREE))
    commit_msg = (
        f"{agent_lower}: extract claims from {slug}\n\n"
        f"- Source: {source_path}\n"
        f"- Domain: {domain}\n"
        f"- Claims: {len(claim_files)}, Entities: {len(entity_files)}\n"
        f"- Enrichments: {len(enrichments)}\n"
        f"- Extracted by: pipeline ingest (OpenRouter {extract_model})\n\n"
        f"Pentagon-Agent: {agent_name} <PIPELINE>"
    )
    rc, out = await _git("commit", "-m", commit_msg, cwd=str(EXTRACT_WORKTREE))
    if rc != 0:
        logger.error("Commit failed for %s: %s", branch, out)
        await _git("checkout", "main", cwd=str(EXTRACT_WORKTREE))
        await _git("branch", "-D", branch, cwd=str(EXTRACT_WORKTREE))
        return 0, 1
    # Push branch
    rc, out = await _git("push", "-u", "origin", branch, cwd=str(EXTRACT_WORKTREE))
    if rc != 0:
        logger.error("Push failed for %s: %s", branch, out)
        await _git("checkout", "main", cwd=str(EXTRACT_WORKTREE))
        await _git("branch", "-D", branch, cwd=str(EXTRACT_WORKTREE))
        return 0, 1
    # 9. Create PR on Forgejo (fall back to Leo's token if the agent has none)
    agent_token_file = config.SECRETS_DIR / f"forgejo-{agent_lower}-token"
    if not agent_token_file.exists():
        agent_token_file = config.SECRETS_DIR / "forgejo-leo-token"
    agent_token = agent_token_file.read_text().strip()
    pr_title = f"{agent_lower}: extract claims from {slug}"
    pr_body = (
        f"## Automated Extraction\n\n"
        f"**Source:** `{source_path}`\n"
        f"**Domain:** {domain}\n"
        f"**Agent:** {agent_name}\n"
        f"**Model:** {extract_model}\n\n"
        f"### Extraction Summary\n"
        f"- **Claims:** {len(claim_files)}\n"
        f"- **Entities:** {len(entity_files)}\n"
        f"- **Enrichments:** {len(enrichments)}\n"
        f"- **Decisions:** {len(decisions)}\n"
        f"- **Facts:** {len(facts)}\n\n"
        f"{notes}\n\n"
        f"---\n"
        f"*Extracted by pipeline ingest stage (replaces extract-cron.sh)*"
    )
    pr_result = await forgejo_api(
        "POST",
        f"/repos/{config.FORGEJO_OWNER}/{config.FORGEJO_REPO}/pulls",
        body={"title": pr_title, "body": pr_body, "base": "main", "head": branch},
        token=agent_token,
    )
    if pr_result and pr_result.get("number"):
        pr_num = pr_result["number"]
        logger.info("PR #%d created for %s (%d claims, %d entities)", pr_num, source_file, len(claim_files), len(entity_files))
        # Store contributor attribution: who submitted this source?
        # Priority: proposed_by field → intake_tier inference → "unknown"
        if proposed_by:
            contributor = proposed_by.strip().strip('"').strip("'")
        elif intake_tier == "research-task":
            contributor = f"{agent_name} (self-directed)"
        elif intake_tier == "directed":
            contributor = "directed (unknown)"
        else:
            contributor = "unknown"
        # Build pipe-separated claim titles for the description field
        claim_titles = " | ".join(
            c.get("title", c.get("filename", "").replace("-", " ").replace(".md", ""))
            for c in claims_raw if c.get("title") or c.get("filename")
        )
        # Upsert: if discover_external_prs already created the row, update it;
        # if not, create a partial row that discover will complete.
        try:
            conn.execute(
                """INSERT INTO prs (number, branch, status, submitted_by, source_path, description)
                VALUES (?, ?, 'open', ?, ?, ?)
                ON CONFLICT(number) DO UPDATE SET
                submitted_by = excluded.submitted_by,
                source_path = excluded.source_path,
                description = COALESCE(excluded.description, prs.description)""",
                (pr_num, branch, contributor, source_path, claim_titles),
            )
            conn.commit()
        except Exception:
            logger.debug("Failed to upsert submitted_by for PR #%d", pr_num, exc_info=True)
        # Also store on source record
        try:
            conn.execute(
                "UPDATE sources SET submitted_by = ? WHERE path = ?",
                (contributor, source_path),
            )
            conn.commit()
        except Exception:
            logger.debug("Failed to update source submitted_by", exc_info=True)
    else:
        # NOTE: source is still archived as "processed" below even when the
        # PR call failed — the pushed branch survives for manual recovery.
        logger.warning("PR creation may have failed for %s — response: %s", source_file, pr_result)
    # Clean up extract worktree
    await _git("checkout", "main", cwd=str(EXTRACT_WORKTREE))
    # 10. Archive source on main
    await _archive_source(source_path, domain, "processed", agent_lower)
    return 1, 0
async def _archive_source(
    source_path: str,
    domain: str,
    status: str,
    agent: str | None = None,
) -> None:
    """Move source from inbox/queue/ to archive (or null-result) on main.

    Destination is inbox/null-result/ for status "null-result", otherwise
    inbox/archive/<domain>/. The source frontmatter is updated in place
    (status, processed_by/processed_date, extraction_model) before the move
    is committed and pushed. Uses worktree lock to avoid conflicts with other
    main-writing processes. Archival is best-effort: all failures are logged,
    never raised.

    Args:
        source_path: repo-relative path of the source file.
        domain: KB domain used to pick the archive subdirectory.
        status: new frontmatter status (e.g. "processed" or "null-result").
        agent: extracting agent recorded as processed_by, when given.
    """
    source_file = os.path.basename(source_path)
    main = str(config.MAIN_WORKTREE)
    try:
        async with async_main_worktree_lock():
            # Pull latest so the move applies on top of current main.
            await _git("pull", "--rebase", "origin", "main", cwd=main, timeout=30)
            queue_path = Path(main) / "inbox" / "queue" / source_file
            if not queue_path.exists():
                logger.warning("Source %s not found in queue — may have been archived already", source_file)
                return
            if status == "null-result":
                dest_dir = Path(main) / "inbox" / "null-result"
            else:
                dest_dir = Path(main) / "inbox" / "archive" / (domain or "unknown")
            dest_dir.mkdir(parents=True, exist_ok=True)
            dest_path = dest_dir / source_file
            # Read and update frontmatter
            content = queue_path.read_text(encoding="utf-8")
            today = date.today().isoformat()
            content = re.sub(r"^status: unprocessed", f"status: {status}", content, flags=re.MULTILINE)
            if agent and "processed_by:" not in content:
                content = re.sub(
                    r"(^status: \w+)",
                    rf"\1\nprocessed_by: {agent}\nprocessed_date: {today}",
                    content,
                    count=1,
                    flags=re.MULTILINE,
                )
            if "extraction_model:" not in content:
                content = re.sub(
                    r"(^status: \w+.*?)(\n---)",
                    rf'\1\nextraction_model: "{config.MODEL_SONNET_OR}"\2',
                    content,
                    count=1,
                    flags=re.MULTILINE | re.DOTALL,
                )
            dest_path.write_text(content, encoding="utf-8")
            queue_path.unlink()
            # Git add, commit, push
            await _git("add", "inbox/", cwd=main)
            # Fix: original message concatenated filename and status with no
            # separator ("foo.mdprocessed"); use an arrow between them.
            commit_msg = (
                f"source: {source_file} → {status}\n\n"
                f"Pentagon-Agent: Epimetheus <PIPELINE>"
            )
            await _git("commit", "-m", commit_msg, cwd=main)
            # Push with retry: rebase onto origin/main on rejection, try again.
            for attempt in range(3):
                rc, out = await _git("push", "origin", "main", cwd=main, timeout=30)
                if rc == 0:
                    break
                logger.warning("Push attempt %d failed: %s", attempt + 1, out)
                await _git("pull", "--rebase", "origin", "main", cwd=main, timeout=30)
            else:
                logger.error("Failed to push source archival after 3 attempts")
    except Exception:
        logger.exception("Failed to archive source %s", source_file)
async def extract_cycle(conn, max_workers=None) -> tuple[int, int]:
    """Main extraction cycle — called by the pipeline daemon's ingest stage.

    Finds unprocessed sources in inbox/queue/, extracts claims, creates PRs.
    Before processing the fresh queue it retries sources marked
    needs_reextraction (carrying evaluator feedback). Sources that already
    have an open extract/ PR on Forgejo are skipped.

    Args:
        conn: sqlite connection; rows are accessed as row["path"], so a
            dict-like row factory is assumed — TODO confirm at the caller.
        max_workers: accepted but never used — sources run sequentially.
            Presumably kept for interface compatibility with the daemon.

    Returns:
        (succeeded, errors) for circuit breaker tracking.
    """
    main = config.MAIN_WORKTREE
    # Find unprocessed sources
    queue_dir = main / "inbox" / "queue"
    if not queue_dir.exists():
        return 0, 0
    unprocessed = []
    for f in sorted(queue_dir.glob("*.md")):
        try:
            content = f.read_text(encoding="utf-8")
            fm = _parse_source_frontmatter(content)
            if fm.get("status") == "unprocessed":
                unprocessed.append((str(f.relative_to(main)), content, fm))
        except Exception:
            logger.debug("Failed to read source %s", f, exc_info=True)
    if not unprocessed:
        return 0, 0
    # Filter out sources that already have open extraction PRs
    open_pr_slugs = set()
    try:
        prs = await forgejo_api(
            "GET",
            f"/repos/{config.FORGEJO_OWNER}/{config.FORGEJO_REPO}/pulls?state=open&limit=50",
        )
        if prs:
            for pr in prs:
                head = pr.get("head", {}).get("ref", "")
                if head.startswith("extract/"):
                    # Extract the source slug from branch name (extract/{slug}-{nonce})
                    slug_part = head[len("extract/"):]
                    # Remove the random suffix (last 5 chars: -{4-hex-chars})
                    if len(slug_part) > 5 and slug_part[-5] == "-":
                        slug_part = slug_part[:-5]
                    open_pr_slugs.add(slug_part)
    except Exception:
        logger.debug("Failed to check open PRs for dedup", exc_info=True)
    if open_pr_slugs:
        before = len(unprocessed)
        unprocessed = [
            (sp, c, f) for sp, c, f in unprocessed
            if Path(sp).stem not in open_pr_slugs
        ]
        skipped = before - len(unprocessed)
        if skipped:
            logger.info("Skipped %d source(s) with existing open PRs", skipped)
        if not unprocessed:
            return 0, 0
    logger.info("Extract cycle: %d unprocessed source(s) found, processing up to %d", len(unprocessed), MAX_SOURCES)
    # Load existing claims for dedup
    existing_claims = load_existing_claims_from_repo(str(main))
    # Ensure extract worktree exists and is clean
    if not EXTRACT_WORKTREE.exists():
        logger.error("Extract worktree not found at %s", EXTRACT_WORKTREE)
        return 0, 1
    total_ok = 0
    total_err = 0
    # ── Re-extraction: pick up sources that failed eval and have feedback ──
    # At least one re-extraction slot is reserved even when the queue is full.
    reextract_rows = conn.execute(
        """SELECT path, feedback FROM sources
        WHERE status = 'needs_reextraction' AND feedback IS NOT NULL
        ORDER BY updated_at ASC LIMIT ?""",
        (max(1, MAX_SOURCES - len(unprocessed)),),
    ).fetchall()
    for row in reextract_rows:
        reex_path = row["path"]
        # Source was archived — read from archive location
        archive_base = main / "inbox" / "archive"
        # Try to find the file in archive subdirs
        # NOTE(review): archive_base.iterdir() raises if inbox/archive/ is
        # missing, and this runs outside the try below — confirm the
        # directory is guaranteed to exist by this point.
        reex_file = None
        for subdir in archive_base.iterdir():
            candidate = subdir / Path(reex_path).name
            if candidate.exists():
                reex_file = candidate
                break
        if not reex_file:
            # Try original path as fallback
            candidate = main / reex_path
            if candidate.exists():
                reex_file = candidate
        if not reex_file:
            logger.warning("Re-extraction: source %s not found on disk — skipping", reex_path)
            continue
        try:
            reex_content = reex_file.read_text(encoding="utf-8")
            reex_fm = _parse_source_frontmatter(reex_content)
            reex_feedback = json.loads(row["feedback"]) if row["feedback"] else {}
            logger.info("Re-extracting %s with feedback: %s", reex_path, list(reex_feedback.get("issues", [])))
            conn.execute(
                "UPDATE sources SET status = 'extracting', updated_at = datetime('now') WHERE path = ?",
                (reex_path,),
            )
            conn.commit()
            ok, err = await _extract_one_source(conn, reex_path, reex_content, reex_fm, existing_claims, feedback=reex_feedback)
            total_ok += ok
            total_err += err
            if ok:
                conn.execute(
                    "UPDATE sources SET status = 'extracted', updated_at = datetime('now') WHERE path = ?",
                    (reex_path,),
                )
            else:
                conn.execute(
                    "UPDATE sources SET status = 'error', last_error = 're-extraction failed', updated_at = datetime('now') WHERE path = ?",
                    (reex_path,),
                )
            conn.commit()
        except Exception:
            logger.exception("Re-extraction failed for %s", reex_path)
            total_err += 1
    # Fresh-queue pass: sequential, capped at MAX_SOURCES per cycle.
    for source_path, content, fm in unprocessed[:MAX_SOURCES]:
        try:
            ok, err = await _extract_one_source(conn, source_path, content, fm, existing_claims)
            total_ok += ok
            total_err += err
        except Exception:
            logger.exception("Unhandled error extracting %s", source_path)
            total_err += 1
        # Brief pause between sources
        await asyncio.sleep(2)
    logger.info("Extract cycle complete: %d succeeded, %d errors", total_ok, total_err)
    return total_ok, total_err