From 923454c9ea19590e4fe291b3a54fab4f88527a77 Mon Sep 17 00:00:00 2001
From: m3taversal
Date: Thu, 30 Apr 2026 11:09:19 +0100
Subject: [PATCH] extract: document basename-uniqueness invariant + skip _-prefixed archive files
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two nits from Ganymede review of ed4af4d:

1. Archive-basename filter depends on basename-uniqueness across
   queue+archive. Current naming (date-prefix + topic-slug) makes
   collisions rare, but if short generic names like "notes.md" enter the
   queue, the filter silently false-positives. Comment block names the
   assumption.

2. Archive walk now skips _-prefixed files, matching the standing
   convention everywhere else (search.py STRUCTURAL_FILES, reweave
   wiki-link skip, Layer 0 entity exclusion). Defensive — no _*.md exists
   under inbox/archive/ today, but consistent with codebase convention if
   a future operator drops _README.md to document the directory.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 lib/extract.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/lib/extract.py b/lib/extract.py
index c73e29f..6f825f4 100644
--- a/lib/extract.py
+++ b/lib/extract.py
@@ -930,11 +930,18 @@ async def extract_cycle(conn, max_workers=None) -> tuple[int, int]:
     # copy is the source of truth — if a file with this basename is in archive,
     # the source is processed regardless of queue state. Single archive scan
     # per cycle, cheap (~1k files).
+    #
+    # Assumes basename uniqueness across queue+archive — current naming
+    # convention (date-prefix + topic-slug) makes collisions vanishingly
+    # rare. If short generic names like "notes.md" enter the queue, this
+    # filter silently false-positives.
     if unprocessed:
         archive_dir = main / "inbox" / "archive"
         archived_basenames: set[str] = set()
         if archive_dir.exists():
             for af in archive_dir.rglob("*.md"):
+                if af.name.startswith("_"):
+                    continue
                 archived_basenames.add(af.name)
         if archived_basenames:
             before = len(unprocessed)