fix(metadao-scrape): YAML escape + URL regex + dry_run consistency
Some checks are pending
CI / lint-and-test (pull_request) Waiting to run
Some checks are pending
CI / lint-and-test (pull_request) Waiting to run
Ganymede review on PR #6:
- WARNING: title and project["name"] flowed unescaped into YAML and would corrupt the frontmatter on quote-bearing inputs (e.g. 'Adopt "Conservative" Pricing'). The new _yaml_str helper routes free-text values through json.dumps (JSON strings are valid YAML strings). Applied to title, author, url, project_slug, proposal_address, proposal_status, squads_proposal, squads_status.
- NIT: URL_ADDR_RE didn't match the new metadao.fi URLs — the single path segment in the old pattern couldn't span /projects/{slug}/proposal/. Added (?:/[^/\s"']*)*? to allow variable path depth. Verified against three URL shapes.
- NIT: the dry_run key was omitted from the JSON output on the early --limit exit but present on the normal exit. Trivial consistency fix.
- NIT (deferred): STAT_BLEED_RE protection is accidental rather than designed; it only matters if MetaDAO breaks the DP-NNNNN naming convention. Per Ganymede: 'optional — current behavior fine.'

Verified: the URL regex matches the legacy futard.io shape, the new metadao.fi shape, and a hypothetical no-slug shape. The YAML escape survives embedded quotes, newlines, backslashes, and em-dashes.
This commit is contained in:
parent
b8fba8195f
commit
800d1d8b8e
1 changed file with 20 additions and 12 deletions
|
|
@ -57,6 +57,11 @@ def slugify(text: str, max_len: int = 60) -> str:
|
||||||
return s.strip("-")[:max_len].rstrip("-")
|
return s.strip("-")[:max_len].rstrip("-")
|
||||||
|
|
||||||
|
|
||||||
|
def _yaml_str(s: str) -> str:
|
||||||
|
"""Quote-safe YAML string. JSON strings are valid YAML strings."""
|
||||||
|
return json.dumps(s, ensure_ascii=False)
|
||||||
|
|
||||||
|
|
||||||
def existing_basenames(*dirs: Path) -> set[str]:
|
def existing_basenames(*dirs: Path) -> set[str]:
|
||||||
"""Collect all .md basenames (without extension) across the given dirs (recursive)."""
|
"""Collect all .md basenames (without extension) across the given dirs (recursive)."""
|
||||||
seen: set[str] = set()
|
seen: set[str] = set()
|
||||||
|
|
@ -69,7 +74,7 @@ def existing_basenames(*dirs: Path) -> set[str]:
|
||||||
|
|
||||||
|
|
||||||
PROP_ADDR_RE = re.compile(r"proposal_address:\s*[\"']?([A-Za-z0-9]{32,44})[\"']?")
|
PROP_ADDR_RE = re.compile(r"proposal_address:\s*[\"']?([A-Za-z0-9]{32,44})[\"']?")
|
||||||
URL_ADDR_RE = re.compile(r"(?:futard\.io|metadao\.fi)/[^/\s\"']*proposal/([A-Za-z0-9]{32,44})")
|
URL_ADDR_RE = re.compile(r"(?:futard\.io|metadao\.fi)(?:/[^/\s\"']*)*?/proposal/([A-Za-z0-9]{32,44})")
|
||||||
|
|
||||||
|
|
||||||
def existing_proposal_addresses(*dirs: Path) -> set[str]:
|
def existing_proposal_addresses(*dirs: Path) -> set[str]:
|
||||||
|
|
@ -306,28 +311,31 @@ def build_source_markdown(project: dict, proposal: dict, today: str) -> str:
|
||||||
body_text = (proposal.get("body_text") or "").strip()
|
body_text = (proposal.get("body_text") or "").strip()
|
||||||
decoded = proposal.get("decoded") or {}
|
decoded = proposal.get("decoded") or {}
|
||||||
|
|
||||||
# Build YAML frontmatter
|
# Build YAML frontmatter — all free-text values escaped via _yaml_str (json.dumps).
|
||||||
|
# project_slug is constrained to [a-z0-9-] by slugify upstream, but pass through
|
||||||
|
# the same path for consistency.
|
||||||
|
full_title = f"MetaDAO: {project['name']} — {title}"
|
||||||
fm_lines = [
|
fm_lines = [
|
||||||
"---",
|
"---",
|
||||||
"type: source",
|
"type: source",
|
||||||
f'title: "MetaDAO: {project["name"]} — {title}"',
|
f"title: {_yaml_str(full_title)}",
|
||||||
'author: "metadao.fi"',
|
f"author: {_yaml_str('metadao.fi')}",
|
||||||
f'url: "{proposal["url"]}"',
|
f"url: {_yaml_str(proposal['url'])}",
|
||||||
f"date: {today}",
|
f"date: {today}",
|
||||||
"domain: internet-finance",
|
"domain: internet-finance",
|
||||||
"format: data",
|
"format: data",
|
||||||
"status: unprocessed",
|
"status: unprocessed",
|
||||||
f'tags: [futardio, metadao, futarchy, solana, governance, {project["slug"]}]',
|
f"tags: [futardio, metadao, futarchy, solana, governance, {project['slug']}]",
|
||||||
"event_type: proposal",
|
"event_type: proposal",
|
||||||
f'project_slug: "{project["slug"]}"',
|
f"project_slug: {_yaml_str(project['slug'])}",
|
||||||
f'proposal_address: "{proposal["address"]}"',
|
f"proposal_address: {_yaml_str(proposal['address'])}",
|
||||||
]
|
]
|
||||||
if proposal.get("status"):
|
if proposal.get("status"):
|
||||||
fm_lines.append(f'proposal_status: "{proposal["status"]}"')
|
fm_lines.append(f"proposal_status: {_yaml_str(proposal['status'])}")
|
||||||
if decoded.get("squadsProposal"):
|
if decoded.get("squadsProposal"):
|
||||||
fm_lines.append(f'squads_proposal: "{decoded["squadsProposal"]}"')
|
fm_lines.append(f"squads_proposal: {_yaml_str(decoded['squadsProposal'])}")
|
||||||
if decoded.get("squadsStatus"):
|
if decoded.get("squadsStatus"):
|
||||||
fm_lines.append(f'squads_status: "{decoded["squadsStatus"]}"')
|
fm_lines.append(f"squads_status: {_yaml_str(decoded['squadsStatus'])}")
|
||||||
fm_lines.append("---")
|
fm_lines.append("---")
|
||||||
fm_lines.append("")
|
fm_lines.append("")
|
||||||
|
|
||||||
|
|
@ -458,7 +466,7 @@ def main() -> int:
|
||||||
if args.limit and len(written) >= args.limit:
|
if args.limit and len(written) >= args.limit:
|
||||||
log.info("hit limit=%d, stopping", args.limit)
|
log.info("hit limit=%d, stopping", args.limit)
|
||||||
browser.close()
|
browser.close()
|
||||||
print(json.dumps({"written": written, "skipped_existing": skipped_existing}))
|
print(json.dumps({"written": written, "skipped_existing": skipped_existing, "dry_run": args.dry_run}))
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
browser.close()
|
browser.close()
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue