diff --git a/scripts/metadao-scrape.py b/scripts/metadao-scrape.py index 3d2c648..7373686 100755 --- a/scripts/metadao-scrape.py +++ b/scripts/metadao-scrape.py @@ -57,6 +57,11 @@ def slugify(text: str, max_len: int = 60) -> str: return s.strip("-")[:max_len].rstrip("-") +def _yaml_str(s: str) -> str: + """Quote-safe YAML string. JSON strings are valid YAML strings.""" + return json.dumps(s, ensure_ascii=False) + + def existing_basenames(*dirs: Path) -> set[str]: """Collect all .md basenames (without extension) across the given dirs (recursive).""" seen: set[str] = set() @@ -69,7 +74,7 @@ def existing_basenames(*dirs: Path) -> set[str]: PROP_ADDR_RE = re.compile(r"proposal_address:\s*[\"']?([A-Za-z0-9]{32,44})[\"']?") -URL_ADDR_RE = re.compile(r"(?:futard\.io|metadao\.fi)/[^/\s\"']*proposal/([A-Za-z0-9]{32,44})") +URL_ADDR_RE = re.compile(r"(?:futard\.io|metadao\.fi)(?:/[^/\s\"']*)*?/proposal/([A-Za-z0-9]{32,44})") def existing_proposal_addresses(*dirs: Path) -> set[str]: @@ -306,28 +311,31 @@ def build_source_markdown(project: dict, proposal: dict, today: str) -> str: body_text = (proposal.get("body_text") or "").strip() decoded = proposal.get("decoded") or {} - # Build YAML frontmatter + # Build YAML frontmatter — all free-text values escaped via _yaml_str (json.dumps). + # project_slug is constrained to [a-z0-9-] by slugify upstream, but pass through + # the same path for consistency. + full_title = f"MetaDAO: {project['name']} — {title}" fm_lines = [ "---", "type: source", - f'title: "MetaDAO: {project["name"]} — {title}"', - 'author: "metadao.fi"', - f'url: "{proposal["url"]}"', + f"title: {_yaml_str(full_title)}", + f"author: {_yaml_str('metadao.fi')}", + f"url: {_yaml_str(proposal['url'])}", f"date: {today}", "domain: internet-finance", "format: data", "status: unprocessed", - f'tags: [futardio, metadao, futarchy, solana, governance, {project["slug"]}]', + f"tags: [futardio, metadao, futarchy, solana, governance, {project['slug']}]", "event_type: proposal", - f'project_slug: "{project["slug"]}"', - f'proposal_address: "{proposal["address"]}"', + f"project_slug: {_yaml_str(project['slug'])}", + f"proposal_address: {_yaml_str(proposal['address'])}", ] if proposal.get("status"): - fm_lines.append(f'proposal_status: "{proposal["status"]}"') + fm_lines.append(f"proposal_status: {_yaml_str(proposal['status'])}") if decoded.get("squadsProposal"): - fm_lines.append(f'squads_proposal: "{decoded["squadsProposal"]}"') + fm_lines.append(f"squads_proposal: {_yaml_str(decoded['squadsProposal'])}") if decoded.get("squadsStatus"): - fm_lines.append(f'squads_status: "{decoded["squadsStatus"]}"') + fm_lines.append(f"squads_status: {_yaml_str(decoded['squadsStatus'])}") fm_lines.append("---") fm_lines.append("") @@ -458,7 +466,7 @@ def main() -> int: if args.limit and len(written) >= args.limit: log.info("hit limit=%d, stopping", args.limit) browser.close() - print(json.dumps({"written": written, "skipped_existing": skipped_existing})) + print(json.dumps({"written": written, "skipped_existing": skipped_existing, "dry_run": args.dry_run})) return 0 browser.close()