fix: close cooldown-dependence gaps in extract.py (Ganymede review)
Some checks are pending
CI / lint-and-test (push) Waiting to run
Three targeted fixes from Ganymede's review of commit 469cb7f:
BUG #1 — Success path now updates sources.status='extracting' before PR
creation, so queue scan's DB-authoritative filter catches sources between
PR creation and merge. Previously the cooldown gate was load-bearing for
this window, not belt-and-suspenders as claimed.
BUG #2 — Second null-result path (line 573, triggered when enrichments
existed but all targets were missing in worktree) now updates DB. Without
this, that path created no PR, no DB mark, and would have re-entered the
runaway loop 4h later when the cooldown window expired.
NIT #6 — 4h cooldown moved to config.EXTRACTION_COOLDOWN_HOURS. Tunable
without code change. Log format now shows the configured hours.
Also backfilled 59 pre-existing zombie queue-path rows where the file
was already archived but DB status said 'unprocessed' — these would have
leaked past the DB filter once the 4h cooldown expired.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
469cb7f2da
commit
97b590acd6
2 changed files with 35 additions and 6 deletions
|
|
@ -200,6 +200,9 @@ MERGE_INTERVAL = 30
|
|||
FIX_INTERVAL = 60
|
||||
HEALTH_CHECK_INTERVAL = 60
|
||||
|
||||
# --- Extraction gates ---
|
||||
EXTRACTION_COOLDOWN_HOURS = 4 # Skip sources with any PR activity in this window. Defense-in-depth for DB-status filter.
|
||||
|
||||
# --- Retrieval (Telegram bot) ---
|
||||
RETRIEVAL_RRF_K = 20 # RRF smoothing constant — tuned for 5-10 results per source
|
||||
RETRIEVAL_ENTITY_BOOST = 1.5 # RRF score multiplier for claims wiki-linked from matched entities
|
||||
|
|
|
|||
|
|
@ -572,6 +572,18 @@ async def _extract_one_source(
|
|||
|
||||
if not files_written:
|
||||
logger.info("No files written for %s — cleaning up", source_file)
|
||||
# Path B null-result: enrichments existed but all targets missing in worktree.
|
||||
# No PR, no cooldown match — without DB update this re-extracts every 60s.
|
||||
# (Ganymede review, commit 469cb7f follow-up.)
|
||||
try:
|
||||
conn.execute(
|
||||
"""INSERT INTO sources (path, status, updated_at) VALUES (?, 'null_result', datetime('now'))
|
||||
ON CONFLICT(path) DO UPDATE SET status='null_result', updated_at=datetime('now')""",
|
||||
(source_path,),
|
||||
)
|
||||
conn.commit()
|
||||
except Exception:
|
||||
logger.debug("Failed to mark source as null_result (path B)", exc_info=True)
|
||||
await _git("checkout", "main", cwd=str(EXTRACT_WORKTREE))
|
||||
await _git("branch", "-D", branch, cwd=str(EXTRACT_WORKTREE))
|
||||
await _archive_source(source_path, domain, "null-result")
|
||||
|
|
@ -688,6 +700,19 @@ async def _extract_one_source(
|
|||
for c in claims_raw if c.get("title") or c.get("filename")
|
||||
)
|
||||
|
||||
# Success path: mark source as 'extracting' so queue scan's DB-status filter
|
||||
# skips it between PR creation and merge. Without this, cooldown is load-bearing
|
||||
# (Ganymede review, commit 469cb7f follow-up).
|
||||
try:
|
||||
conn.execute(
|
||||
"""INSERT INTO sources (path, status, updated_at) VALUES (?, 'extracting', datetime('now'))
|
||||
ON CONFLICT(path) DO UPDATE SET status='extracting', updated_at=datetime('now')""",
|
||||
(source_path,),
|
||||
)
|
||||
conn.commit()
|
||||
except Exception:
|
||||
logger.debug("Failed to mark source as extracting", exc_info=True)
|
||||
|
||||
# Upsert: if discover_external_prs already created the row, update it;
|
||||
# if not, create a partial row that discover will complete.
|
||||
source_channel = classify_source_channel(branch)
|
||||
|
|
@ -931,16 +956,17 @@ async def extract_cycle(conn, max_workers=None) -> tuple[int, int]:
|
|||
if skipped:
|
||||
logger.info("Skipped %d source(s) with existing open PRs", skipped)
|
||||
|
||||
# Emergency cooldown: skip sources with ANY PR (merged/closed/open) in last 4h.
|
||||
# Prevents runaway re-extraction when archive step lags behind merge.
|
||||
# Root cause: source file remains in inbox/queue/ after PR merges until
|
||||
# _archive_source moves it — next daemon tick extracts the same source again.
|
||||
# Cooldown: skip sources with ANY PR in last EXTRACTION_COOLDOWN_HOURS.
|
||||
# Defense-in-depth for DB-status filter — catches the window between PR
|
||||
# creation and DB status update if anything races.
|
||||
if unprocessed:
|
||||
cooldown_hours = config.EXTRACTION_COOLDOWN_HOURS
|
||||
recent_source_paths = {
|
||||
r["source_path"] for r in conn.execute(
|
||||
"""SELECT DISTINCT source_path FROM prs
|
||||
WHERE source_path IS NOT NULL
|
||||
AND created_at > datetime('now', '-4 hours')"""
|
||||
AND created_at > datetime('now', ? || ' hours')""",
|
||||
(f"-{cooldown_hours}",),
|
||||
).fetchall() if r["source_path"]
|
||||
}
|
||||
if recent_source_paths:
|
||||
|
|
@ -951,7 +977,7 @@ async def extract_cycle(conn, max_workers=None) -> tuple[int, int]:
|
|||
]
|
||||
cooled = before - len(unprocessed)
|
||||
if cooled:
|
||||
logger.info("Cooldown: skipped %d source(s) with PRs in last 4h", cooled)
|
||||
logger.info("Cooldown: skipped %d source(s) with PRs in last %dh", cooled, cooldown_hours)
|
||||
|
||||
# ── Check for re-extraction sources (must run even when queue is empty) ──
|
||||
reextract_rows = conn.execute(
|
||||
|
|
|
|||
Loading…
Reference in a new issue