From c763c9991061f6ea3ffc00094e6dc9649dde052e Mon Sep 17 00:00:00 2001 From: m3taversal Date: Thu, 16 Apr 2026 14:04:49 +0100 Subject: [PATCH] fix: re-extraction loop runs even when queue is empty The re-extraction check was below an early return that fires when unprocessed queue is empty. Sources in needs_reextraction state were never picked up unless new sources happened to arrive simultaneously. Move re-extraction query above the gate so both paths run independently. Co-Authored-By: Claude Opus 4.6 (1M context) --- lib/extract.py | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/lib/extract.py b/lib/extract.py index 17d3e7a..1da8515 100644 --- a/lib/extract.py +++ b/lib/extract.py @@ -776,10 +776,21 @@ async def extract_cycle(conn, max_workers=None) -> tuple[int, int]: if skipped: logger.info("Skipped %d source(s) with existing open PRs", skipped) - if not unprocessed: + # ── Check for re-extraction sources (must run even when queue is empty) ── + reextract_rows = conn.execute( + """SELECT path, feedback FROM sources + WHERE status = 'needs_reextraction' AND feedback IS NOT NULL + ORDER BY updated_at ASC LIMIT ?""", + (max(1, MAX_SOURCES - len(unprocessed)),), + ).fetchall() + + if not unprocessed and not reextract_rows: return 0, 0 - logger.info("Extract cycle: %d unprocessed source(s) found, processing up to %d", len(unprocessed), MAX_SOURCES) + if unprocessed: + logger.info("Extract cycle: %d unprocessed source(s) found, processing up to %d", len(unprocessed), MAX_SOURCES) + if reextract_rows: + logger.info("Extract cycle: %d source(s) queued for re-extraction", len(reextract_rows)) # Load existing claims for dedup existing_claims = load_existing_claims_from_repo(str(main)) @@ -792,14 +803,6 @@ async def extract_cycle(conn, max_workers=None) -> tuple[int, int]: total_ok = 0 total_err = 0 - # ── Re-extraction: pick up sources that failed eval and have feedback ── - reextract_rows = conn.execute( - """SELECT path, feedback FROM sources - WHERE status = 'needs_reextraction' AND feedback IS NOT NULL - ORDER BY updated_at ASC LIMIT ?""", - (max(1, MAX_SOURCES - len(unprocessed)),), - ).fetchall() - for row in reextract_rows: reex_path = row["path"] # Source was archived — read from archive location