From dde055fdbf715bacafb0e92362b0a733d1e6c900 Mon Sep 17 00:00:00 2001 From: m3taversal Date: Sat, 25 Apr 2026 13:24:15 +0100 Subject: [PATCH] fix(metadao-scrape): STAT_BLEED word boundaries + min-render gate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Ganymede review on PR #6 (commit 800d1d8 → this commit): - WARNING: STAT_BLEED_RE produces false positives on common words. The original pattern matched standalone stat-keyword tokens, clipping legitimate titles like "Engage with Pantera and Active Capital" → trimmed at " Active". Fix: require numeric/symbolic context (\$, +, -, \d) AFTER the stat-word, so word-only sequences pass through unchanged. - _clean_title_candidate now uses finditer + first-match-past-offset-10 instead of re.search. The DP-NNNNN digit sequence always wins leftmost position; we want the first POST-title bleed match instead. - NIT 3: minimum-render gate before write. Skip partial renders rather than archiving stubs whose downstream extraction yields null results. Threshold: body < 500B AND no DP-N in title → skip and retry next run. Verified 10/10 on test grid: real bleed trimmed; titles that merely contain a stat keyword, as a whole word or mid-word, are left intact (Compass, Active Capital, Live Streaming, Encompass, Activate, Passage, Failure all pass through unchanged). NIT 1 (--headless no-op flag) and NIT 2 (futardio tag provenance noise): deferred — cosmetic, batch with future touch. Co-Authored-By: Claude Opus 4.7 (1M context) --- scripts/metadao-scrape.py | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/scripts/metadao-scrape.py b/scripts/metadao-scrape.py index 7373686..8909882 100755 --- a/scripts/metadao-scrape.py +++ b/scripts/metadao-scrape.py @@ -173,16 +173,23 @@ DP_STRICT_RE = re.compile(r"DP-\d+\s*\([A-Z]+\)\s*[:\-]\s*[^\n\r]+", re.MULTILIN # Loose pattern: any line starting with DP-NNNNN followed by something. 
DP_LOOSE_RE = re.compile(r"DP-\d+\s*(?:\([A-Z]+\))?\s*[:\-]?\s*[^\n\r]+", re.MULTILINE) STAT_BLEED_RE = re.compile( - r"\s*(?:Pass|Fail|Passed|Failed|Active|Pending|Ended|Live|TOTAL|VOLUME|STATUS|MCAP|PRICE|SPOT|\$|\+\d|-\d|\d+\.\d+%|\d{4,})", + # Stat keywords only bleed when followed by a numeric/symbolic stat token, + # so word-only sequences like "Active Capital" or "Live Streaming Service" pass. + r"\s+\b(?:Pass|Fail|Passed|Failed|Active|Pending|Ended|Live|TOTAL|VOLUME|STATUS|MCAP|PRICE|SPOT)\b\s+(?:\$|\+|-|\d)" + r"|\s*(?:\$\d|\+\d{2,}|\d+\.\d+%|\d{5,})", re.IGNORECASE, ) def _clean_title_candidate(line: str) -> str: line = line.strip() - bleed = STAT_BLEED_RE.search(line) - if bleed and bleed.start() > 10: # require some title before the bleed - line = line[: bleed.start()].rstrip(" :-—") + # Find first bleed match past offset 10. re.search returns leftmost, but the + # DP-NNNNN digit sequence always wins first place; we want the first POST-title + # match instead. Walk all matches and trim at the earliest one past the guard. + for bleed in STAT_BLEED_RE.finditer(line): + if bleed.start() > 10: + line = line[: bleed.start()].rstrip(" :-—") + break return line.strip()[:200] @@ -446,6 +453,15 @@ def main() -> int: if not proposal_data: continue + # Minimum-render gate: skip partial renders rather than archiving stubs. + # Successful captures are 20KB+; require either a real body or a DP-N title. + body_len = len(proposal_data.get("body_text") or "") + has_dp_match = bool(re.search(r"DP-\d+", proposal_data.get("title", "") or "")) + if body_len < 500 and not has_dp_match: + log.warning(" skip (insufficient render): %s body=%dB title=%r", + addr, body_len, proposal_data.get("title", "")) + continue + fname = build_filename(slug, proposal_data, today) if Path(fname).stem in seen_basenames: