From 800d1d8b8e9f0667ba890e0f31ccb19ed6b5ebf0 Mon Sep 17 00:00:00 2001 From: m3taversal Date: Sat, 25 Apr 2026 13:19:06 +0100 Subject: [PATCH] fix(metadao-scrape): YAML escape + URL regex + dry_run consistency MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Ganymede review on PR #6: - WARNING: title and project["name"] flowed unescaped into YAML, would corrupt frontmatter on quote-bearing inputs (e.g. 'Adopt "Conservative" Pricing'). New _yaml_str helper routes free-text values through json.dumps (JSON strings are valid YAML strings). Applied to title, author, url, project_slug, proposal_address, proposal_status, squads_proposal, squads_status. - NIT: URL_ADDR_RE didn't match new metadao.fi URLs — pattern segment couldn't span /projects/{slug}/proposal/. Added (?:/[^/...]*)*? for variable path depth. Verified against three URL shapes. - NIT: dry_run key was omitted from JSON output on early --limit exit but present on normal exit. Trivial consistency fix. - NIT (deferred): STAT_BLEED_RE protection is accidental rather than designed; only matters if MetaDAO breaks DP-NNNNN naming convention. Per Ganymede 'optional — current behavior fine.' Verified: URL regex matches futard.io legacy + metadao.fi new + hypothetical no-slug shapes. YAML escape survives embedded quotes, newlines, backslashes, em-dashes. --- scripts/metadao-scrape.py | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/scripts/metadao-scrape.py b/scripts/metadao-scrape.py index 3d2c648..7373686 100755 --- a/scripts/metadao-scrape.py +++ b/scripts/metadao-scrape.py @@ -57,6 +57,11 @@ def slugify(text: str, max_len: int = 60) -> str: return s.strip("-")[:max_len].rstrip("-") +def _yaml_str(s: str) -> str: + """Quote-safe YAML string. JSON strings are valid YAML strings.""" + return json.dumps(s, ensure_ascii=False) + + def existing_basenames(*dirs: Path) -> set[str]: """Collect all .md basenames (without extension) across the given dirs (recursive).""" seen: set[str] = set() @@ -69,7 +74,7 @@ def existing_basenames(*dirs: Path) -> set[str]: PROP_ADDR_RE = re.compile(r"proposal_address:\s*[\"']?([A-Za-z0-9]{32,44})[\"']?") -URL_ADDR_RE = re.compile(r"(?:futard\.io|metadao\.fi)/[^/\s\"']*proposal/([A-Za-z0-9]{32,44})") +URL_ADDR_RE = re.compile(r"(?:futard\.io|metadao\.fi)(?:/[^/\s\"']*)*?/proposal/([A-Za-z0-9]{32,44})") def existing_proposal_addresses(*dirs: Path) -> set[str]: @@ -306,28 +311,31 @@ def build_source_markdown(project: dict, proposal: dict, today: str) -> str: body_text = (proposal.get("body_text") or "").strip() decoded = proposal.get("decoded") or {} - # Build YAML frontmatter + # Build YAML frontmatter — all free-text values escaped via _yaml_str (json.dumps). + # project_slug is constrained to [a-z0-9-] by slugify upstream, but pass through + # the same path for consistency. + full_title = f"MetaDAO: {project['name']} — {title}" fm_lines = [ "---", "type: source", - f'title: "MetaDAO: {project["name"]} — {title}"', - 'author: "metadao.fi"', - f'url: "{proposal["url"]}"', + f"title: {_yaml_str(full_title)}", + f"author: {_yaml_str('metadao.fi')}", + f"url: {_yaml_str(proposal['url'])}", f"date: {today}", "domain: internet-finance", "format: data", "status: unprocessed", - f'tags: [futardio, metadao, futarchy, solana, governance, {project["slug"]}]', + f"tags: [futardio, metadao, futarchy, solana, governance, {project['slug']}]", "event_type: proposal", - f'project_slug: "{project["slug"]}"', - f'proposal_address: "{proposal["address"]}"', + f"project_slug: {_yaml_str(project['slug'])}", + f"proposal_address: {_yaml_str(proposal['address'])}", ] if proposal.get("status"): - fm_lines.append(f'proposal_status: "{proposal["status"]}"') + fm_lines.append(f"proposal_status: {_yaml_str(proposal['status'])}") if decoded.get("squadsProposal"): - fm_lines.append(f'squads_proposal: "{decoded["squadsProposal"]}"') + fm_lines.append(f"squads_proposal: {_yaml_str(decoded['squadsProposal'])}") if decoded.get("squadsStatus"): - fm_lines.append(f'squads_status: "{decoded["squadsStatus"]}"') + fm_lines.append(f"squads_status: {_yaml_str(decoded['squadsStatus'])}") fm_lines.append("---") fm_lines.append("") @@ -458,7 +466,7 @@ def main() -> int: if args.limit and len(written) >= args.limit: log.info("hit limit=%d, stopping", args.limit) browser.close() - print(json.dumps({"written": written, "skipped_existing": skipped_existing})) + print(json.dumps({"written": written, "skipped_existing": skipped_existing, "dry_run": args.dry_run})) return 0 browser.close()