diff --git a/scripts/backfill-sources.py b/scripts/backfill-sources.py index 667d379..0dd08f5 100644 --- a/scripts/backfill-sources.py +++ b/scripts/backfill-sources.py @@ -104,14 +104,22 @@ def main(): claims_count = 0 if rel_path in existing: - # Update status if different + # Update status if different — but never regress from terminal states. + # If DB says 'extracted' or 'null_result' and file happens to be in queue/ + # (e.g., failed archive push, zombie file), the DB is authoritative. + # Downgrading to 'unprocessed' triggers the runaway re-extraction loop. current = conn.execute("SELECT status FROM sources WHERE path = ?", (rel_path,)).fetchone() + TERMINAL_STATUSES = {"extracted", "null_result", "error", "ghost_no_file"} if current and current["status"] != status: - conn.execute( - "UPDATE sources SET status = ?, updated_at = datetime('now') WHERE path = ?", - (status, rel_path), - ) - updated += 1 + if current["status"] in TERMINAL_STATUSES and status == "unprocessed": + # Don't regress terminal → unprocessed. DB wins. + pass + else: + conn.execute( + "UPDATE sources SET status = ?, updated_at = datetime('now') WHERE path = ?", + (status, rel_path), + ) + updated += 1 else: conn.execute( """INSERT INTO sources (path, status, priority, claims_count, created_at, updated_at)