fix(metadao-scrape): YAML escape + URL regex + dry_run consistency
Some checks are pending
CI / lint-and-test (pull_request) Waiting to run

Ganymede review on PR #6:
- WARNING: title and project["name"] flowed unescaped into YAML, would
  corrupt frontmatter on quote-bearing inputs (e.g. 'Adopt "Conservative"
  Pricing'). New _yaml_str helper routes free-text values through
  json.dumps (JSON strings are valid YAML strings). Applied to title,
  author, url, project_slug, proposal_address, proposal_status,
  squads_proposal, squads_status.
- NIT: URL_ADDR_RE didn't match new metadao.fi URLs — pattern segment
  couldn't span /projects/{slug}/proposal/. Added (?:/[^/\s"']*)*? for
  variable path depth. Verified against three URL shapes.
- NIT: dry_run key was omitted from JSON output on early --limit exit
  but present on normal exit. Trivial consistency fix.
- NIT (deferred): STAT_BLEED_RE protection is accidental rather than
  designed; only matters if MetaDAO breaks DP-NNNNN naming convention.
  Per Ganymede 'optional — current behavior fine.'

Verified: URL regex matches futard.io legacy + metadao.fi new + hypothetical
no-slug shapes. YAML escape survives embedded quotes, newlines, backslashes,
em-dashes.
This commit is contained in:
m3taversal 2026-04-25 13:19:06 +01:00
parent b8fba8195f
commit 800d1d8b8e

View file

@ -57,6 +57,11 @@ def slugify(text: str, max_len: int = 60) -> str:
return s.strip("-")[:max_len].rstrip("-")
def _yaml_str(s: str) -> str:
"""Quote-safe YAML string. JSON strings are valid YAML strings."""
return json.dumps(s, ensure_ascii=False)
def existing_basenames(*dirs: Path) -> set[str]:
"""Collect all .md basenames (without extension) across the given dirs (recursive)."""
seen: set[str] = set()
@ -69,7 +74,7 @@ def existing_basenames(*dirs: Path) -> set[str]:
PROP_ADDR_RE = re.compile(r"proposal_address:\s*[\"']?([A-Za-z0-9]{32,44})[\"']?")
URL_ADDR_RE = re.compile(r"(?:futard\.io|metadao\.fi)/[^/\s\"']*proposal/([A-Za-z0-9]{32,44})")
URL_ADDR_RE = re.compile(r"(?:futard\.io|metadao\.fi)(?:/[^/\s\"']*)*?/proposal/([A-Za-z0-9]{32,44})")
def existing_proposal_addresses(*dirs: Path) -> set[str]:
@ -306,28 +311,31 @@ def build_source_markdown(project: dict, proposal: dict, today: str) -> str:
body_text = (proposal.get("body_text") or "").strip()
decoded = proposal.get("decoded") or {}
# Build YAML frontmatter
# Build YAML frontmatter — all free-text values escaped via _yaml_str (json.dumps).
# project_slug is constrained to [a-z0-9-] by slugify upstream, but pass through
# the same path for consistency.
full_title = f"MetaDAO: {project['name']}{title}"
fm_lines = [
"---",
"type: source",
f'title: "MetaDAO: {project["name"]}{title}"',
'author: "metadao.fi"',
f'url: "{proposal["url"]}"',
f"title: {_yaml_str(full_title)}",
f"author: {_yaml_str('metadao.fi')}",
f"url: {_yaml_str(proposal['url'])}",
f"date: {today}",
"domain: internet-finance",
"format: data",
"status: unprocessed",
f'tags: [futardio, metadao, futarchy, solana, governance, {project["slug"]}]',
f"tags: [futardio, metadao, futarchy, solana, governance, {project['slug']}]",
"event_type: proposal",
f'project_slug: "{project["slug"]}"',
f'proposal_address: "{proposal["address"]}"',
f"project_slug: {_yaml_str(project['slug'])}",
f"proposal_address: {_yaml_str(proposal['address'])}",
]
if proposal.get("status"):
fm_lines.append(f'proposal_status: "{proposal["status"]}"')
fm_lines.append(f"proposal_status: {_yaml_str(proposal['status'])}")
if decoded.get("squadsProposal"):
fm_lines.append(f'squads_proposal: "{decoded["squadsProposal"]}"')
fm_lines.append(f"squads_proposal: {_yaml_str(decoded['squadsProposal'])}")
if decoded.get("squadsStatus"):
fm_lines.append(f'squads_status: "{decoded["squadsStatus"]}"')
fm_lines.append(f"squads_status: {_yaml_str(decoded['squadsStatus'])}")
fm_lines.append("---")
fm_lines.append("")
@ -458,7 +466,7 @@ def main() -> int:
if args.limit and len(written) >= args.limit:
log.info("hit limit=%d, stopping", args.limit)
browser.close()
print(json.dumps({"written": written, "skipped_existing": skipped_existing}))
print(json.dumps({"written": written, "skipped_existing": skipped_existing, "dry_run": args.dry_run}))
return 0
browser.close()