feat(ingestion): metadao.fi scraper to replace broken futard.io ingestion #6
1 changed file with 20 additions and 12 deletions
|
|
@ -57,6 +57,11 @@ def slugify(text: str, max_len: int = 60) -> str:
|
|||
return s.strip("-")[:max_len].rstrip("-")
|
||||
|
||||
|
||||
# Characters that json.dumps(ensure_ascii=False) emits raw but that are unsafe
# inside a YAML double-quoted scalar: DEL and the C1 controls fall outside
# YAML's printable set, and NEL (U+0085), LS (U+2028) and PS (U+2029) count as
# line breaks under YAML 1.1, so a parser may reject or fold them.
_YAML_UNSAFE_CHARS_RE = re.compile(r"[\x7f-\x9f\u2028\u2029]")


def _yaml_str(s: str) -> str:
    """Return *s* as a double-quoted scalar that is valid JSON and YAML.

    The value is serialized with ``json.dumps(..., ensure_ascii=False)`` so
    ordinary non-ASCII text (e.g. an em dash) stays human-readable. The few
    characters JSON leaves raw but YAML parsers may reject or reinterpret
    are then rewritten as ``\\uXXXX`` escapes, which both formats decode
    back to the original character.

    Args:
        s: Free-text value to embed in YAML frontmatter.

    Returns:
        The quoted, escaped representation, including the surrounding
        double quotes.
    """
    quoted = json.dumps(s, ensure_ascii=False)
    # Backslashes already present in `quoted` are never matched by the
    # pattern, so existing JSON escapes are left intact.
    return _YAML_UNSAFE_CHARS_RE.sub(
        lambda m: f"\\u{ord(m.group()):04x}", quoted
    )
|
||||
|
||||
|
||||
def existing_basenames(*dirs: Path) -> set[str]:
|
||||
"""Collect all .md basenames (without extension) across the given dirs (recursive)."""
|
||||
seen: set[str] = set()
|
||||
|
|
@ -69,7 +74,7 @@ def existing_basenames(*dirs: Path) -> set[str]:
|
|||
|
||||
|
||||
PROP_ADDR_RE = re.compile(r"proposal_address:\s*[\"']?([A-Za-z0-9]{32,44})[\"']?")
|
||||
URL_ADDR_RE = re.compile(r"(?:futard\.io|metadao\.fi)/[^/\s\"']*proposal/([A-Za-z0-9]{32,44})")
|
||||
URL_ADDR_RE = re.compile(r"(?:futard\.io|metadao\.fi)(?:/[^/\s\"']*)*?/proposal/([A-Za-z0-9]{32,44})")
|
||||
|
||||
|
||||
def existing_proposal_addresses(*dirs: Path) -> set[str]:
|
||||
|
|
@ -306,28 +311,31 @@ def build_source_markdown(project: dict, proposal: dict, today: str) -> str:
|
|||
body_text = (proposal.get("body_text") or "").strip()
|
||||
decoded = proposal.get("decoded") or {}
|
||||
|
||||
# Build YAML frontmatter
|
||||
# Build YAML frontmatter — all free-text values escaped via _yaml_str (json.dumps).
|
||||
# project_slug is constrained to [a-z0-9-] by slugify upstream, but pass through
|
||||
# the same path for consistency.
|
||||
full_title = f"MetaDAO: {project['name']} — {title}"
|
||||
fm_lines = [
|
||||
"---",
|
||||
"type: source",
|
||||
f'title: "MetaDAO: {project["name"]} — {title}"',
|
||||
'author: "metadao.fi"',
|
||||
f'url: "{proposal["url"]}"',
|
||||
f"title: {_yaml_str(full_title)}",
|
||||
f"author: {_yaml_str('metadao.fi')}",
|
||||
f"url: {_yaml_str(proposal['url'])}",
|
||||
f"date: {today}",
|
||||
"domain: internet-finance",
|
||||
"format: data",
|
||||
"status: unprocessed",
|
||||
f'tags: [futardio, metadao, futarchy, solana, governance, {project["slug"]}]',
|
||||
f"tags: [futardio, metadao, futarchy, solana, governance, {project['slug']}]",
|
||||
"event_type: proposal",
|
||||
f'project_slug: "{project["slug"]}"',
|
||||
f'proposal_address: "{proposal["address"]}"',
|
||||
f"project_slug: {_yaml_str(project['slug'])}",
|
||||
f"proposal_address: {_yaml_str(proposal['address'])}",
|
||||
]
|
||||
if proposal.get("status"):
|
||||
fm_lines.append(f'proposal_status: "{proposal["status"]}"')
|
||||
fm_lines.append(f"proposal_status: {_yaml_str(proposal['status'])}")
|
||||
if decoded.get("squadsProposal"):
|
||||
fm_lines.append(f'squads_proposal: "{decoded["squadsProposal"]}"')
|
||||
fm_lines.append(f"squads_proposal: {_yaml_str(decoded['squadsProposal'])}")
|
||||
if decoded.get("squadsStatus"):
|
||||
fm_lines.append(f'squads_status: "{decoded["squadsStatus"]}"')
|
||||
fm_lines.append(f"squads_status: {_yaml_str(decoded['squadsStatus'])}")
|
||||
fm_lines.append("---")
|
||||
fm_lines.append("")
|
||||
|
||||
|
|
@ -458,7 +466,7 @@ def main() -> int:
|
|||
if args.limit and len(written) >= args.limit:
|
||||
log.info("hit limit=%d, stopping", args.limit)
|
||||
browser.close()
|
||||
print(json.dumps({"written": written, "skipped_existing": skipped_existing}))
|
||||
print(json.dumps({"written": written, "skipped_existing": skipped_existing, "dry_run": args.dry_run}))
|
||||
return 0
|
||||
|
||||
browser.close()
|
||||
|
|
|
|||
Loading…
Reference in a new issue