From 97b590acd6a134618990cf3cd520534f8a8f5575 Mon Sep 17 00:00:00 2001
From: m3taversal <m3taversal@gmail.com>
Date: Wed, 22 Apr 2026 11:33:10 +0100
Subject: [PATCH] fix: close cooldown-dependence gaps in extract.py (Ganymede
 review)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Three targeted fixes from Ganymede's review of commit 469cb7f:

BUG #1 — Success path now updates sources.status='extracting' before PR
creation, so queue scan's DB-authoritative filter catches sources between
PR creation and merge. Previously the cooldown gate was load-bearing for
this window, not belt-and-suspenders as claimed.

BUG #2 — Second null-result path (line 573, triggered when enrichments
existed but all targets were missing in worktree) now updates DB. Without
this, that path created no PR, no DB mark, and would have re-entered the
runaway loop 4h later when the cooldown window expired.

NIT #6 — 4h cooldown moved to config.EXTRACTION_COOLDOWN_HOURS. Tunable
without code change. Log format now shows the configured hours.

Also backfilled 59 pre-existing zombie queue-path rows where the file
was already archived but DB status said 'unprocessed' — these would have
leaked past the DB filter once the 4h cooldown expired.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 lib/config.py  |  3 +++
 lib/extract.py | 38 ++++++++++++++++++++++++++++++++------
 2 files changed, 35 insertions(+), 6 deletions(-)

diff --git a/lib/config.py b/lib/config.py
index d5bbf7f..9f1b401 100644
--- a/lib/config.py
+++ b/lib/config.py
@@ -200,6 +200,9 @@ MERGE_INTERVAL = 30
 FIX_INTERVAL = 60
 HEALTH_CHECK_INTERVAL = 60
 
+# --- Extraction gates ---
+EXTRACTION_COOLDOWN_HOURS = 4  # Skip sources with any PR activity in this window. Defense-in-depth for DB-status filter.
+
 # --- Retrieval (Telegram bot) ---
 RETRIEVAL_RRF_K = 20  # RRF smoothing constant — tuned for 5-10 results per source
 RETRIEVAL_ENTITY_BOOST = 1.5  # RRF score multiplier for claims wiki-linked from matched entities
diff --git a/lib/extract.py b/lib/extract.py
index a1bdc78..a1e017f 100644
--- a/lib/extract.py
+++ b/lib/extract.py
@@ -572,6 +572,18 @@ async def _extract_one_source(
 
     if not files_written:
         logger.info("No files written for %s — cleaning up", source_file)
+        # Path B null-result: enrichments existed but all targets missing in worktree.
+        # No PR, no cooldown match — without DB update this re-extracts every 60s.
+        # (Ganymede review, commit 469cb7f follow-up.)
+        try:
+            conn.execute(
+                """INSERT INTO sources (path, status, updated_at) VALUES (?, 'null_result', datetime('now'))
+                   ON CONFLICT(path) DO UPDATE SET status='null_result', updated_at=datetime('now')""",
+                (source_path,),
+            )
+            conn.commit()
+        except Exception:
+            logger.debug("Failed to mark source as null_result (path B)", exc_info=True)
         await _git("checkout", "main", cwd=str(EXTRACT_WORKTREE))
         await _git("branch", "-D", branch, cwd=str(EXTRACT_WORKTREE))
         await _archive_source(source_path, domain, "null-result")
@@ -688,6 +700,19 @@ async def _extract_one_source(
             for c in claims_raw if c.get("title") or c.get("filename")
         )
 
+        # Success path: mark source as 'extracting' so queue scan's DB-status filter
+        # skips it between PR creation and merge. Without this, cooldown is load-bearing
+        # (Ganymede review, commit 469cb7f follow-up).
+        try:
+            conn.execute(
+                """INSERT INTO sources (path, status, updated_at) VALUES (?, 'extracting', datetime('now'))
+                   ON CONFLICT(path) DO UPDATE SET status='extracting', updated_at=datetime('now')""",
+                (source_path,),
+            )
+            conn.commit()
+        except Exception:
+            logger.debug("Failed to mark source as extracting", exc_info=True)
+
         # Upsert: if discover_external_prs already created the row, update it;
         # if not, create a partial row that discover will complete.
         source_channel = classify_source_channel(branch)
@@ -931,16 +956,17 @@ async def extract_cycle(conn, max_workers=None) -> tuple[int, int]:
         if skipped:
             logger.info("Skipped %d source(s) with existing open PRs", skipped)
 
-    # Emergency cooldown: skip sources with ANY PR (merged/closed/open) in last 4h.
-    # Prevents runaway re-extraction when archive step lags behind merge.
-    # Root cause: source file remains in inbox/queue/ after PR merges until
-    # _archive_source moves it — next daemon tick extracts the same source again.
+    # Cooldown: skip sources with ANY PR in last EXTRACTION_COOLDOWN_HOURS.
+    # Defense-in-depth for DB-status filter — catches the window between PR
+    # creation and DB status update if anything races.
     if unprocessed:
+        cooldown_hours = config.EXTRACTION_COOLDOWN_HOURS
         recent_source_paths = {
             r["source_path"] for r in conn.execute(
                 """SELECT DISTINCT source_path FROM prs
                    WHERE source_path IS NOT NULL
-                   AND created_at > datetime('now', '-4 hours')"""
+                   AND created_at > datetime('now', ? || ' hours')""",
+                (f"-{cooldown_hours}",),
             ).fetchall() if r["source_path"]
         }
         if recent_source_paths:
@@ -951,7 +977,7 @@ async def extract_cycle(conn, max_workers=None) -> tuple[int, int]:
             ]
             cooled = before - len(unprocessed)
             if cooled:
-                logger.info("Cooldown: skipped %d source(s) with PRs in last 4h", cooled)
+                logger.info("Cooldown: skipped %d source(s) with PRs in last %dh", cooled, cooldown_hours)
 
     # ── Check for re-extraction sources (must run even when queue is empty) ──
     reextract_rows = conn.execute(