diff --git a/lib/extract.py b/lib/extract.py index b1d06f2..b54c522 100644 --- a/lib/extract.py +++ b/lib/extract.py @@ -235,7 +235,13 @@ def _build_claim_content(claim: dict, agent: str, source_format: str | None = No domain = claim.get("domain", "") title = claim.get("title", claim.get("filename", "").replace("-", " ").replace(".md", "")) description = claim.get("description", "") - confidence = claim.get("confidence", "experimental") + raw_confidence = claim.get("confidence", "experimental") + _CONFIDENCE_MAP = { + "proven": "proven", "likely": "likely", "experimental": "experimental", + "speculative": "speculative", "high": "likely", "medium": "experimental", + "low": "speculative", "very high": "proven", "moderate": "experimental", + } + confidence = _CONFIDENCE_MAP.get(raw_confidence.lower().strip(), "experimental") if isinstance(raw_confidence, str) else "experimental" source_ref = claim.get("source", "") body = claim.get("body", "") scope = claim.get("scope", "") @@ -252,8 +258,8 @@ def _build_claim_content(claim: dict, agent: str, source_format: str | None = No if target not in edge_fields[rel]: edge_fields[rel].append(target) for r in related_claims[:5]: - r_clean = r.replace(".md", "") - if r_clean not in edge_fields["related"]: + r_clean = r.replace(".md", "").strip("[]").strip() + if r_clean and r_clean not in edge_fields["related"]: edge_fields["related"].append(r_clean) edge_lines = [] @@ -301,6 +307,14 @@ def _build_entity_content(entity: dict, domain: str) -> str: description = entity.get("content", "") if description: + # Strip code fences the LLM may have wrapped the content in + description = description.strip() + if description.startswith("```"): + first_nl = description.find("\n") + if first_nl != -1: + description = description[first_nl + 1:] + if description.endswith("```"): + description = description[:-3].rstrip() return description name = entity.get("filename", "").replace("-", " ").replace(".md", "").title() @@ -435,6 +449,31 @@ async def _extract_one_source( content = _build_entity_content(e, domain) entity_files.append({"filename": filename, "domain": domain, "content": content}) + # 6.5. Pre-filter near-duplicates BEFORE post-extract validation + # Uses same SequenceMatcher threshold as tier0. Catches duplicates cheaply ($0) + # before they create PRs and burn eval cycles. + if claim_files and existing_claims: + from difflib import SequenceMatcher as _SM + _DEDUP_THRESHOLD = 0.85 + filtered = [] + for cf in claim_files: + title_lower = Path(cf["filename"]).stem.replace("-", " ").lower() + title_words = set(title_lower.split()[:6]) + is_dup = False + for existing in existing_claims: + existing_lower = existing.replace("-", " ").lower() + if len(title_words & set(existing_lower.split()[:6])) < 2: + continue + if _SM(None, title_lower, existing_lower).ratio() >= _DEDUP_THRESHOLD: + logger.info("Extract-dedup: skipping near-duplicate '%s' (matches '%s')", cf["filename"], existing) + is_dup = True + break + if not is_dup: + filtered.append(cf) + if len(filtered) < len(claim_files): + logger.info("Extract-dedup: filtered %d/%d near-duplicates", len(claim_files) - len(filtered), len(claim_files)) + claim_files = filtered + # 7. Post-extraction validation if claim_files: kept_claims, rejected_claims, stats = validate_and_fix_claims( diff --git a/lib/extraction_prompt.py b/lib/extraction_prompt.py index e48bb5e..29f846c 100644 --- a/lib/extraction_prompt.py +++ b/lib/extraction_prompt.py @@ -119,6 +119,7 @@ Set `contributor_thesis_extractable: true` if you extracted the contributor's th "These existing claims are topically related to this source. For each NEW claim you extract,", "check this list and specify connections in the `connections` array.\n", ] + high_sim = [] for i, pa in enumerate(prior_art[:10], 1): title = pa.get("claim_title", "untitled") path = pa.get("claim_path", "") @@ -128,7 +129,16 @@ Set `contributor_thesis_extractable: true` if you extracted the contributor's th pa_lines.append(f"{i}. **{title}** (`{filename}`, similarity: {score:.2f})") if desc: pa_lines.append(f" {desc}") + if score >= 0.75: + high_sim.append(title) pa_lines.append("") + if high_sim: + pa_lines.append("**WARNING — HIGH SIMILARITY MATCHES (score >= 0.75):**") + pa_lines.append("The following existing claims are very similar to themes in this source.") + pa_lines.append("Do NOT extract new claims that restate these — use ENRICHMENT instead:") + for hs in high_sim: + pa_lines.append(f" - {hs}") + pa_lines.append("") connection_candidates = "\n".join(pa_lines) else: connection_candidates = "" diff --git a/lib/validate.py b/lib/validate.py index f064fb4..ef3be9b 100644 --- a/lib/validate.py +++ b/lib/validate.py @@ -140,7 +140,12 @@ def validate_schema(fm: dict) -> list[str]: valid_conf = schema.get("valid_confidence") confidence = fm.get("confidence") if valid_conf and confidence and confidence not in valid_conf: - violations.append(f"invalid_confidence:{confidence}") + # Common LLM aliases — normalize before failing + _CONFIDENCE_ALIASES = {"high": "likely", "medium": "experimental", "low": "speculative", "very high": "proven", "moderate": "experimental"} + if isinstance(confidence, str) and confidence.lower().strip() in _CONFIDENCE_ALIASES: + pass # Fixable by post-extract or fixer — don't gate on this + else: + violations.append(f"invalid_confidence:{confidence}") desc = fm.get("description") if isinstance(desc, str) and len(desc.strip()) < 10: @@ -550,6 +555,16 @@ def tier05_mechanical_check(diff: str, existing_claims: set[str] | None = None) is_new = filepath in new_files if is_new: + # Strip code fences — LLM agents sometimes wrap content in ```markdown or ```yaml + stripped = content.strip() + if stripped.startswith("```"): + first_nl = stripped.find("\n") + if first_nl != -1: + stripped = stripped[first_nl + 1:] + if stripped.endswith("```"): + stripped = stripped[:-3].strip() + content = stripped + fm, body = parse_frontmatter(content) if fm is None: issues.append("frontmatter_schema")