NOTE(review): this patch was recovered from a whitespace-collapsed copy. Indentation,
spacing inside context/removed lines and string literals, and the position of blank
context lines were reconstructed (blank lines placed so each hunk matches its @@
counts) -- confirm against scripts/metadao-scrape.py before applying. The post-image
hash on the "index" line predates the review change to STAT_BLEED_RE.

diff --git a/scripts/metadao-scrape.py b/scripts/metadao-scrape.py
index 7373686..8909882 100755
--- a/scripts/metadao-scrape.py
+++ b/scripts/metadao-scrape.py
@@ -173,16 +173,24 @@ DP_STRICT_RE = re.compile(r"DP-\d+\s*\([A-Z]+\)\s*[:\-]\s*[^\n\r]+", re.MULTILIN
 # Loose pattern: any line starting with DP-NNNNN followed by something.
 DP_LOOSE_RE = re.compile(r"DP-\d+\s*(?:\([A-Z]+\))?\s*[:\-]?\s*[^\n\r]+", re.MULTILINE)
 STAT_BLEED_RE = re.compile(
-    r"\s*(?:Pass|Fail|Passed|Failed|Active|Pending|Ended|Live|TOTAL|VOLUME|STATUS|MCAP|PRICE|SPOT|\$|\+\d|-\d|\d+\.\d+%|\d{4,})",
+    # Stat keywords only bleed when followed by a stat value: an optionally signed
+    # digit or "$". Word-only sequences like "Active Capital" or "Live Streaming
+    # Service" pass, and so do separators such as "Markets Live - Phase 2".
+    r"\s+\b(?:Pass|Fail|Passed|Failed|Active|Pending|Ended|Live|TOTAL|VOLUME|STATUS|MCAP|PRICE|SPOT)\b\s+[+-]?[$\d]"
+    r"|\s*(?:\$\d|\+\d{2,}|\d+\.\d+%|\d{5,})",
     re.IGNORECASE,
 )
 
 
 def _clean_title_candidate(line: str) -> str:
     line = line.strip()
-    bleed = STAT_BLEED_RE.search(line)
-    if bleed and bleed.start() > 10:  # require some title before the bleed
-        line = line[: bleed.start()].rstrip(" :-—")
+    # Find first bleed match past offset 10. re.search returns leftmost, but the
+    # DP-NNNNN digit sequence always wins first place; we want the first POST-title
+    # match instead. Walk all matches and trim at the earliest one past the guard.
+    for bleed in STAT_BLEED_RE.finditer(line):
+        if bleed.start() > 10:
+            line = line[: bleed.start()].rstrip(" :-—")
+            break
     return line.strip()[:200]
 
 
@@ -446,6 +454,15 @@ def main() -> int:
         if not proposal_data:
             continue
 
+        # Minimum-render gate: skip partial renders rather than archiving stubs.
+        # Successful captures are 20KB+; require either a real body or a DP-N title.
+        body_len = len(proposal_data.get("body_text") or "")
+        has_dp_match = bool(re.search(r"DP-\d+", proposal_data.get("title", "") or ""))
+        if body_len < 500 and not has_dp_match:
+            log.warning(" skip (insufficient render): %s body=%dB title=%r",
+                        addr, body_len, proposal_data.get("title", ""))
+            continue
+
         fname = build_filename(slug, proposal_data, today)
 
         if Path(fname).stem in seen_basenames: