diff --git a/lib/extract.py b/lib/extract.py index c73e29f..6f825f4 100644 --- a/lib/extract.py +++ b/lib/extract.py @@ -930,11 +930,18 @@ async def extract_cycle(conn, max_workers=None) -> tuple[int, int]: # copy is the source of truth — if a file with this basename is in archive, # the source is processed regardless of queue state. Single archive scan # per cycle, cheap (~1k files). + # + # Assumes basename uniqueness across queue+archive — current naming + # convention (date-prefix + topic-slug) makes collisions vanishingly + # rare. If short generic names like "notes.md" enter the queue, this + # filter silently false-positives. if unprocessed: archive_dir = main / "inbox" / "archive" archived_basenames: set[str] = set() if archive_dir.exists(): for af in archive_dir.rglob("*.md"): + if af.name.startswith("_"): + continue archived_basenames.add(af.name) if archived_basenames: before = len(unprocessed)