fix: close cooldown-dependence gaps in extract.py (Ganymede review)
Some checks are pending
CI / lint-and-test (push) Waiting to run
Three targeted fixes from Ganymede's review of commit 469cb7f:
BUG #1 — Success path now updates sources.status='extracting' before PR
creation, so queue scan's DB-authoritative filter catches sources between
PR creation and merge. Previously the cooldown gate was load-bearing for
this window, not belt-and-suspenders as claimed.
BUG #2 — Second null-result path (line 573, triggered when enrichments
existed but all targets were missing in worktree) now updates DB. Without
this, that path created no PR, no DB mark, and would have re-entered the
runaway loop 4h later when the cooldown window expired.
NIT #6 — 4h cooldown moved to config.EXTRACTION_COOLDOWN_HOURS. Tunable
without code change. Log format now shows the configured hours.
Also backfilled 59 pre-existing zombie queue-path rows where the file
was already archived but DB status said 'unprocessed' — these would have
leaked past the DB filter once the 4h cooldown expired.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
469cb7f2da
commit
97b590acd6
2 changed files with 35 additions and 6 deletions
|
|
@ -200,6 +200,9 @@ MERGE_INTERVAL = 30
|
|||
FIX_INTERVAL = 60
|
||||
HEALTH_CHECK_INTERVAL = 60
|
||||
|
||||
# --- Extraction gates ---
|
||||
EXTRACTION_COOLDOWN_HOURS = 4 # Skip sources with any PR activity in this window. Defense-in-depth for DB-status filter.
|
||||
|
||||
# --- Retrieval (Telegram bot) ---
|
||||
RETRIEVAL_RRF_K = 20 # RRF smoothing constant — tuned for 5-10 results per source
|
||||
RETRIEVAL_ENTITY_BOOST = 1.5 # RRF score multiplier for claims wiki-linked from matched entities
|
||||
|
|
|
|||
|
|
@ -572,6 +572,18 @@ async def _extract_one_source(
|
|||
|
||||
if not files_written:
|
||||
logger.info("No files written for %s — cleaning up", source_file)
|
||||
# Path B null-result: enrichments existed but all targets missing in worktree.
|
||||
# No PR, no cooldown match — without DB update this re-extracts every 60s.
|
||||
# (Ganymede review, commit 469cb7f follow-up.)
|
||||
try:
|
||||
conn.execute(
|
||||
"""INSERT INTO sources (path, status, updated_at) VALUES (?, 'null_result', datetime('now'))
|
||||
ON CONFLICT(path) DO UPDATE SET status='null_result', updated_at=datetime('now')""",
|
||||
(source_path,),
|
||||
)
|
||||
conn.commit()
|
||||
except Exception:
|
||||
logger.debug("Failed to mark source as null_result (path B)", exc_info=True)
|
||||
await _git("checkout", "main", cwd=str(EXTRACT_WORKTREE))
|
||||
await _git("branch", "-D", branch, cwd=str(EXTRACT_WORKTREE))
|
||||
await _archive_source(source_path, domain, "null-result")
|
||||
|
|
@ -688,6 +700,19 @@ async def _extract_one_source(
|
|||
for c in claims_raw if c.get("title") or c.get("filename")
|
||||
)
|
||||
|
||||
# Success path: mark source as 'extracting' so queue scan's DB-status filter
|
||||
# skips it between PR creation and merge. Without this, cooldown is load-bearing
|
||||
# (Ganymede review, commit 469cb7f follow-up).
|
||||
try:
|
||||
conn.execute(
|
||||
"""INSERT INTO sources (path, status, updated_at) VALUES (?, 'extracting', datetime('now'))
|
||||
ON CONFLICT(path) DO UPDATE SET status='extracting', updated_at=datetime('now')""",
|
||||
(source_path,),
|
||||
)
|
||||
conn.commit()
|
||||
except Exception:
|
||||
logger.debug("Failed to mark source as extracting", exc_info=True)
|
||||
|
||||
# Upsert: if discover_external_prs already created the row, update it;
|
||||
# if not, create a partial row that discover will complete.
|
||||
source_channel = classify_source_channel(branch)
|
||||
|
|
@ -931,16 +956,17 @@ async def extract_cycle(conn, max_workers=None) -> tuple[int, int]:
|
|||
if skipped:
|
||||
logger.info("Skipped %d source(s) with existing open PRs", skipped)
|
||||
|
||||
# Emergency cooldown: skip sources with ANY PR (merged/closed/open) in last 4h.
|
||||
# Prevents runaway re-extraction when archive step lags behind merge.
|
||||
# Root cause: source file remains in inbox/queue/ after PR merges until
|
||||
# _archive_source moves it — next daemon tick extracts the same source again.
|
||||
# Cooldown: skip sources with ANY PR in last EXTRACTION_COOLDOWN_HOURS.
|
||||
# Defense-in-depth for DB-status filter — catches the window between PR
|
||||
# creation and DB status update if anything races.
|
||||
if unprocessed:
|
||||
cooldown_hours = config.EXTRACTION_COOLDOWN_HOURS
|
||||
recent_source_paths = {
|
||||
r["source_path"] for r in conn.execute(
|
||||
"""SELECT DISTINCT source_path FROM prs
|
||||
WHERE source_path IS NOT NULL
|
||||
AND created_at > datetime('now', '-4 hours')"""
|
||||
AND created_at > datetime('now', ? || ' hours')""",
|
||||
(f"-{cooldown_hours}",),
|
||||
).fetchall() if r["source_path"]
|
||||
}
|
||||
if recent_source_paths:
|
||||
|
|
@ -951,7 +977,7 @@ async def extract_cycle(conn, max_workers=None) -> tuple[int, int]:
|
|||
]
|
||||
cooled = before - len(unprocessed)
|
||||
if cooled:
|
||||
logger.info("Cooldown: skipped %d source(s) with PRs in last 4h", cooled)
|
||||
logger.info("Cooldown: skipped %d source(s) with PRs in last %dh", cooled, cooldown_hours)
|
||||
|
||||
# ── Check for re-extraction sources (must run even when queue is empty) ──
|
||||
reextract_rows = conn.execute(
|
||||
|
|
|
|||
Loading…
Reference in a new issue