From dde055fdbf715bacafb0e92362b0a733d1e6c900 Mon Sep 17 00:00:00 2001
From: m3taversal <m3taversal@gmail.com>
Date: Sat, 25 Apr 2026 13:24:15 +0100
Subject: [PATCH] fix(metadao-scrape): STAT_BLEED word boundaries + min-render
 gate
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Ganymede review on PR #6 (commit 800d1d8 → this commit):

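- WARNING: STAT_BLEED_RE false-positives on common words. The original
  pattern matched standalone stat-keyword tokens, clipping legitimate
  titles like "Engage with Pantera and Active Capital" → trimmed at
  " Active". Fix: match stat keywords only as whole words (\b...\b) and
  require numeric/symbolic context (\$, +, -, \d) AFTER the stat-word,
  so word-only sequences pass through unchanged. The keyword-free
  alternatives are tightened as well: a bare "\$" now needs a following
  digit, "\+\d" needs two or more digits, the standalone "-\d" is
  dropped, and the long-digit-run threshold rises from 4 to 5 digits.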

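- _clean_title_candidate now uses finditer + first-match-past-offset-10
  instead of re.search. re.search returns only the leftmost match, and
  the DP-NNNNN digit sequence always wins that position; because it sits
  at or before offset 10 it fails the guard, so nothing was trimmed at
  all. We now walk every match and cut at the first POST-title bleed
  match instead.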

- NIT 3: minimum-render gate before write. Skip partial renders rather
  than archiving stubs that yield null results in downstream extraction.
  Threshold: body_text shorter than 500 characters AND no DP-N in the
  title → skip and retry next run.

Verified 10/10 on test grid: real bleed is trimmed, while titles that
were previously mid-word or word-only false-positives are left intact
(Compass, Active Capital, Live Streaming, Encompass, Activate, Passage,
Failure all pass through unchanged).

NIT 1 (--headless no-op flag) and NIT 2 (futardio tag provenance noise):
deferred — both are cosmetic and will be batched with a future change.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 scripts/metadao-scrape.py | 24 ++++++++++++++++++++----
 1 file changed, 20 insertions(+), 4 deletions(-)

diff --git a/scripts/metadao-scrape.py b/scripts/metadao-scrape.py
index 7373686..8909882 100755
--- a/scripts/metadao-scrape.py
+++ b/scripts/metadao-scrape.py
@@ -173,16 +173,23 @@ DP_STRICT_RE = re.compile(r"DP-\d+\s*\([A-Z]+\)\s*[:\-]\s*[^\n\r]+", re.MULTILIN
 # Loose pattern: any line starting with DP-NNNNN followed by something.
 DP_LOOSE_RE = re.compile(r"DP-\d+\s*(?:\([A-Z]+\))?\s*[:\-]?\s*[^\n\r]+", re.MULTILINE)
 STAT_BLEED_RE = re.compile(
-    r"\s*(?:Pass|Fail|Passed|Failed|Active|Pending|Ended|Live|TOTAL|VOLUME|STATUS|MCAP|PRICE|SPOT|\$|\+\d|-\d|\d+\.\d+%|\d{4,})",
+    # Stat keywords only bleed when followed by a numeric/symbolic stat token,
+    # so word-only sequences like "Active Capital" or "Live Streaming Service" pass.
+    r"\s+\b(?:Pass|Fail|Passed|Failed|Active|Pending|Ended|Live|TOTAL|VOLUME|STATUS|MCAP|PRICE|SPOT)\b\s+(?:\$|\+|-|\d)"
+    r"|\s*(?:\$\d|\+\d{2,}|\d+\.\d+%|\d{5,})",
     re.IGNORECASE,
 )
 
 
 def _clean_title_candidate(line: str) -> str:
     line = line.strip()
-    bleed = STAT_BLEED_RE.search(line)
-    if bleed and bleed.start() > 10:  # require some title before the bleed
-        line = line[: bleed.start()].rstrip(" :-—")
+    # Find first bleed match past offset 10. re.search returns leftmost, but the
+    # DP-NNNNN digit sequence always wins first place; we want the first POST-title
+    # match instead. Walk all matches and trim at the earliest one past the guard.
+    for bleed in STAT_BLEED_RE.finditer(line):
+        if bleed.start() > 10:
+            line = line[: bleed.start()].rstrip(" :-—")
+            break
     return line.strip()[:200]
 
 
@@ -446,6 +453,15 @@ def main() -> int:
                 if not proposal_data:
                     continue
 
+                # Minimum-render gate: skip partial renders rather than archiving stubs.
+                # Successful captures are 20KB+; require either a real body or a DP-N title.
+                body_len = len(proposal_data.get("body_text") or "")
+                has_dp_match = bool(re.search(r"DP-\d+", proposal_data.get("title", "") or ""))
+                if body_len < 500 and not has_dp_match:
+                    log.warning("  skip (insufficient render): %s body=%dB title=%r",
+                                addr, body_len, proposal_data.get("title", ""))
+                    continue
+
                 fname = build_filename(slug, proposal_data, today)
 
                 if Path(fname).stem in seen_basenames: