4 functions + 2 constants extracted to lib/frontmatter.py: - parse_yaml_frontmatter, union_edge_lists, serialize_edge_fields, serialize_frontmatter, REWEAVE_EDGE_FIELDS, RECIPROCAL_EDGE_MAP merge.py: 1678 → 1562 lines (−116). test_reweave_merge.py: replaced local function copies with imports from frontmatter.py — fixes missing challenged_by in test's REWEAVE_EDGE_FIELDS. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
133 lines
4.9 KiB
Python
133 lines
4.9 KiB
Python
"""Pure YAML frontmatter parsing and serialization for claim/entity files.

Shared by merge (reweave merge, reciprocal edges) and reweave scripts.
All functions are pure — zero I/O, zero async, zero DB.

Extracted from merge.py Phase 6 of decomposition (Ganymede-approved plan).
"""
|
|
|
|
import json

import yaml
|
|
|
|
|
|
# Frontmatter keys that hold claim-to-claim edges.
# The tuple order is significant: serialize_edge_fields appends fields that
# are new to a file in exactly this order.
REWEAVE_EDGE_FIELDS = (
    "supports",
    "challenges",
    "challenged_by",
    "depends_on",
    "related",
    "reweave_edges",
)

# Reciprocal edge mapping: if A carries edge_type -> B, then B receives the
# mapped edge type -> A.
#   supports   -> supports       (treated as approximately symmetric)
#   challenges -> challenged_by  (directional, NOT symmetric)
#   related    -> related
#   depends_on -> related        (B is merely related to A; not symmetric)
RECIPROCAL_EDGE_MAP = {
    "supports": "supports",
    "challenges": "challenged_by",
    "related": "related",
    "depends_on": "related",
}
|
|
|
|
|
|
def parse_yaml_frontmatter(text: str) -> tuple[dict | None, str, str]:
|
|
"""Parse YAML frontmatter from markdown text.
|
|
|
|
Returns (frontmatter_dict, raw_fm_text, body_text_including_closing_delimiter).
|
|
Returns (None, "", text) if no valid frontmatter found.
|
|
raw_fm_text is the text between the --- delimiters (no delimiters, no leading newline).
|
|
"""
|
|
if not text.startswith("---"):
|
|
return None, "", text
|
|
end = text.find("\n---", 3)
|
|
if end == -1:
|
|
return None, "", text
|
|
try:
|
|
raw_fm_text = text[4:end] # skip "---\n", stop before "\n---"
|
|
fm = yaml.safe_load(raw_fm_text)
|
|
body = text[end:] # includes closing \n--- and body
|
|
return (fm if isinstance(fm, dict) else None), raw_fm_text, body
|
|
except Exception:
|
|
return None, "", text
|
|
|
|
|
|
def union_edge_lists(main_edges: list, branch_edges: list) -> list:
|
|
"""Union two edge lists, preserving order from main (append new at end).
|
|
|
|
Deduplicates by lowercase slug. Main's order is preserved; branch-only
|
|
edges are appended in their original order.
|
|
"""
|
|
seen = set()
|
|
result = []
|
|
for edge in main_edges:
|
|
key = str(edge).strip().lower()
|
|
if key not in seen:
|
|
seen.add(key)
|
|
result.append(edge)
|
|
for edge in branch_edges:
|
|
key = str(edge).strip().lower()
|
|
if key not in seen:
|
|
seen.add(key)
|
|
result.append(edge)
|
|
return result
|
|
|
|
|
|
def serialize_edge_fields(raw_fm_text: str, merged_edges: dict[str, list]) -> str:
    """Splice merged edge fields into raw frontmatter text.

    Only the fields named in REWEAVE_EDGE_FIELDS are rewritten. Every other
    frontmatter line (title, confidence, type, etc.) is copied through
    unchanged, so there is no yaml.dump reformatting of unrelated fields.

    An edge field already present in the source is replaced in place. Its old
    value is taken to be the header line plus every directly following line
    that starts with a space or a dash. A field with no edges in
    ``merged_edges`` is dropped. Fields that were not in the source are
    appended at the end in REWEAVE_EDGE_FIELDS order. If the source repeats
    an edge field key, the merged value is written at the first occurrence
    only and the later occurrences are removed.

    String edge values that would not load back as the same string when
    written bare (for example ones containing a colon followed by a space)
    are emitted double-quoted; all other values are written as before.

    Args:
        raw_fm_text: The raw YAML text between the --- delimiters (no
            delimiters included).
        merged_edges: {field_name: [edge_values]} for each edge field that
            should be present.

    Returns:
        The frontmatter text with the edge fields replaced.
    """
    source_lines = raw_fm_text.split("\n")
    output: list[str] = []
    handled: set[str] = set()
    pos = 0

    while pos < len(source_lines):
        line = source_lines[pos]
        field = next(
            (name for name in REWEAVE_EDGE_FIELDS if line.startswith(f"{name}:")),
            None,
        )
        if field is None:
            output.append(line)
            pos += 1
            continue

        # Drop the old field: its header line plus the list items beneath it
        # (items may be indented with spaces or start directly with a dash).
        pos += 1
        while pos < len(source_lines) and source_lines[pos].startswith((" ", "-")):
            pos += 1

        # Emit the merged value once, even if the source repeats the key,
        # so the result never contains duplicate mapping keys.
        if field not in handled:
            handled.add(field)
            output.extend(_edge_field_lines(field, merged_edges.get(field)))

    # Append edge fields that did not exist in the original text.
    for field in REWEAVE_EDGE_FIELDS:
        if field not in handled:
            output.extend(_edge_field_lines(field, merged_edges.get(field)))

    return "\n".join(output)


# Characters that cannot begin a plain (unquoted) YAML scalar.
_YAML_LEADING_INDICATORS = frozenset("!&*[]{}|>@`\"'#%,")

# Bare words that YAML 1.1 loaders resolve to null/bool instead of a string.
_YAML_RESERVED_WORDS = frozenset(
    {"null", "~", "true", "false", "yes", "no", "on", "off"}
)


def _edge_field_lines(field, edges):
    """Return the YAML lines for one edge field, or [] when it has no edges."""
    if not edges:
        return []
    return [f"{field}:", *(_edge_item_line(edge) for edge in edges)]


def _edge_item_line(edge):
    """Format one edge as a block-sequence item, quoting only when required."""
    if isinstance(edge, str) and _needs_yaml_quotes(edge):
        # A JSON string literal is also a valid YAML double-quoted scalar.
        return f"- {json.dumps(edge, ensure_ascii=False)}"
    return f"- {edge}"


def _needs_yaml_quotes(value):
    """Return True if *value* written bare would not load back as the same string."""
    if not value or value != value.strip():
        return True
    if value.lower() in _YAML_RESERVED_WORDS:
        return True
    if value[0] in _YAML_LEADING_INDICATORS:
        return True
    # "-", "?" and ":" are only special when alone or followed by a space.
    if value[0] in "-?:" and (len(value) == 1 or value[1] == " "):
        return True
    # ": " starts a mapping value, a trailing ":" a mapping key, " #" a comment.
    if value.endswith(":") or ": " in value or " #" in value:
        return True
    return any(ch in value for ch in "\n\r\t")
|
|
|
|
|
|
def serialize_frontmatter(raw_fm_text: str, merged_edges: dict[str, list], body: str) -> str:
    """Reassemble a markdown file from its frontmatter text, merged edges and body.

    The edge fields are spliced into the raw frontmatter text by
    serialize_edge_fields; the result is placed after an opening ``---`` line
    and followed by ``body``. This is string-level surgery: nothing is passed
    through yaml.dump.

    Args:
        raw_fm_text: Raw YAML text between the --- delimiters.
        merged_edges: {field_name: [edge_values]} to splice in.
        body: Remainder of the file. It normally begins with a newline
            followed by the closing --- delimiter; when it does not begin
            with a newline, one is inserted before it.

    Returns:
        The full file text.
    """
    header = "---\n" + serialize_edge_fields(raw_fm_text, merged_edges)
    # Only add a separating newline when body does not already supply one.
    separator = "" if body.startswith("\n") else "\n"
    return header + separator + body
|