fix(metadao-scrape): STAT_BLEED word boundaries + min-render gate

Ganymede review on PR #6 (commit 800d1d8 → this commit): - WARNING: STAT_BLEED_RE produced false positives on common words. The original pattern matched standalone stat-keyword tokens, clipping legitimate titles like "Engage with Pantera and Active Capital" → trimmed at " Active". Fix: require numeric/symbolic context (\$, +, -, \d) AFTER the stat-word, so word-only sequences pass through unchanged. - _clean_title_candidate now uses finditer + first-match-past-offset-10 instead of re.search. The DP-NNNNN digit sequence always wins leftmost position; we want the first POST-title bleed match instead. - NIT 3: minimum-render gate before write. Skip partial renders rather than archiving stubs whose downstream extraction yields null results. Threshold: body < 500B AND no DP-N in title → skip and retry next run. Verified 10/10 on test grid: real bleed is trimmed, while whole-word and mid-word false-positive candidates are left untrimmed (Compass, Active Capital, Live Streaming, Encompass, Activate, Passage, Failure all pass through unchanged). NIT 1 (--headless no-op flag) and NIT 2 (futardio tag provenance noise): deferred — cosmetic, to be batched with a future touch. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-25 13:24:15 +01:00 · 2026-04-25 13:24:15 +01:00 · dde055fdbf
commit dde055fdbf
parent 800d1d8b8e
1 changed file with 20 additions and 4 deletions
--- a/scripts/metadao-scrape.py
+++ b/scripts/metadao-scrape.py
@@ -173,16 +173,23 @@ DP_STRICT_RE = re.compile(r"DP-\d+\s*\([A-Z]+\)\s*[:\-]\s*[^\n\r]+", re.MULTILIN
 # Loose pattern: any line starting with DP-NNNNN followed by something.
 DP_LOOSE_RE = re.compile(r"DP-\d+\s*(?:\([A-Z]+\))?\s*[:\-]?\s*[^\n\r]+", re.MULTILINE)
 STAT_BLEED_RE = re.compile(
-    r"\s*(?:Pass|Fail|Passed|Failed|Active|Pending|Ended|Live|TOTAL|VOLUME|STATUS|MCAP|PRICE|SPOT|\$|\+\d|-\d|\d+\.\d+%|\d{4,})",
+    # Stat keywords only bleed when followed by a numeric/symbolic stat token,
+    # so word-only sequences like "Active Capital" or "Live Streaming Service" pass.
+    r"\s+\b(?:Pass|Fail|Passed|Failed|Active|Pending|Ended|Live|TOTAL|VOLUME|STATUS|MCAP|PRICE|SPOT)\b\s+(?:\$|\+|-|\d)"
+    r"|\s*(?:\$\d|\+\d{2,}|\d+\.\d+%|\d{5,})",
    re.IGNORECASE,
 )


 def _clean_title_candidate(line: str) -> str:
    line = line.strip()
-    bleed = STAT_BLEED_RE.search(line)
-    if bleed and bleed.start() > 10:  # require some title before the bleed
-        line = line[: bleed.start()].rstrip(" :-—")
+    # Find first bleed match past offset 10. re.search returns leftmost, but the
+    # DP-NNNNN digit sequence always wins first place; we want the first POST-title
+    # match instead. Walk all matches and trim at the earliest one past the guard.
+    for bleed in STAT_BLEED_RE.finditer(line):
+        if bleed.start() > 10:
+            line = line[: bleed.start()].rstrip(" :-—")
+            break
    return line.strip()[:200]


@@ -446,6 +453,15 @@ def main() -> int:
                if not proposal_data:
                    continue

+                # Minimum-render gate: skip partial renders rather than archiving stubs.
+                # Successful captures are 20KB+; require either a real body or a DP-N title.
+                body_len = len(proposal_data.get("body_text") or "")
+                has_dp_match = bool(re.search(r"DP-\d+", proposal_data.get("title", "") or ""))
+                if body_len < 500 and not has_dp_match:
+                    log.warning("  skip (insufficient render): %s body=%dB title=%r",
+                                addr, body_len, proposal_data.get("title", ""))
+                    continue
+
                fname = build_filename(slug, proposal_data, today)

                if Path(fname).stem in seen_basenames: