diff --git a/lib/extract.py b/lib/extract.py
index a1e017f..c73e29f 100644
--- a/lib/extract.py
+++ b/lib/extract.py
@@ -923,6 +923,29 @@ async def extract_cycle(conn, max_workers=None) -> tuple[int, int]:
             except Exception:
                 logger.debug("Failed to read source %s", f, exc_info=True)
 
+    # Archive-basename filter: skip queue files whose basename already exists
+    # in inbox/archive/. Research-session commits on agent branches occasionally
+    # re-introduce already-archived queue files when the branch is re-merged,
+    # producing same-source re-extractions every cooldown cycle. The archive
+    # copy is the source of truth — a basename present in archive means the
+    # source is treated as already processed, regardless of queue state.
+    # Single archive scan per cycle, cheap (~1k files).
+    if unprocessed:
+        archive_dir = main / "inbox" / "archive"
+        archived_basenames: set[str] = set()
+        if archive_dir.exists():
+            for af in archive_dir.rglob("*.md"):
+                archived_basenames.add(af.name)
+        if archived_basenames:
+            before = len(unprocessed)
+            unprocessed = [
+                (sp, c, f) for sp, c, f in unprocessed
+                if Path(sp).name not in archived_basenames
+            ]
+            skipped = before - len(unprocessed)
+            if skipped:
+                logger.info("Skipped %d queue source(s) — basename already in inbox/archive/", skipped)
+
     # Don't early-return here — re-extraction sources may exist even when queue is empty
     # (the re-extraction check runs after open-PR filtering below)
 