From a053a8ebf9f9efb7419e1fbab4455ef5cd9ebaca Mon Sep 17 00:00:00 2001 From: m3taversal Date: Wed, 22 Apr 2026 21:29:33 +0100 Subject: [PATCH] fix(backfill): don't regress terminal source statuses to unprocessed MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit backfill-sources.py runs every 15 minutes and derives sources.status purely from directory location. If a source file is in inbox/queue/, it blindly overwrites the DB status to 'unprocessed' — even when the DB already had 'extracted' or 'null_result'. This is why the 43 zombies kept coming back after manual backfill: cron re-reset them every 15 minutes, then each 4h cooldown expiry re-triggered runaway extraction on the same source. Fix: never regress from a terminal status (extracted, null_result, error, ghost_no_file) to 'unprocessed'. File location is ambiguous (legitimately new vs. zombie from failed archive); DB is authoritative. Legitimate re-extraction still works — it goes through the needs_reextraction path which is unaffected by this gate. Co-Authored-By: Claude Opus 4.7 (1M context) --- scripts/backfill-sources.py | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/scripts/backfill-sources.py b/scripts/backfill-sources.py index 667d379..0dd08f5 100644 --- a/scripts/backfill-sources.py +++ b/scripts/backfill-sources.py @@ -104,14 +104,22 @@ def main(): claims_count = 0 if rel_path in existing: - # Update status if different + # Update status if different — but never regress from terminal states. + # If DB says 'extracted' or 'null_result' and file happens to be in queue/ + # (e.g., failed archive push, zombie file), the DB is authoritative. + # Downgrading to 'unprocessed' triggers the runaway re-extraction loop. current = conn.execute("SELECT status FROM sources WHERE path = ?", (rel_path,)).fetchone() + TERMINAL_STATUSES = {"extracted", "null_result", "error", "ghost_no_file"} if current and current["status"] != status: - conn.execute( - "UPDATE sources SET status = ?, updated_at = datetime('now') WHERE path = ?", - (status, rel_path), - ) - updated += 1 + if current["status"] in TERMINAL_STATUSES and status == "unprocessed": + # Don't regress terminal → unprocessed. DB wins. + pass + else: + conn.execute( + "UPDATE sources SET status = ?, updated_at = datetime('now') WHERE path = ?", + (status, rel_path), + ) + updated += 1 else: conn.execute( """INSERT INTO sources (path, status, priority, claims_count, created_at, updated_at)