diff --git a/lib/extract.py b/lib/extract.py
index b1d06f2..b54c522 100644
--- a/lib/extract.py
+++ b/lib/extract.py
@@ -235,7 +235,13 @@ def _build_claim_content(claim: dict, agent: str, source_format: str | None = No
     domain = claim.get("domain", "")
     title = claim.get("title", claim.get("filename", "").replace("-", " ").replace(".md", ""))
     description = claim.get("description", "")
-    confidence = claim.get("confidence", "experimental")
+    raw_confidence = claim.get("confidence", "experimental")
+    _CONFIDENCE_MAP = {
+        "proven": "proven", "likely": "likely", "experimental": "experimental",
+        "speculative": "speculative", "high": "likely", "medium": "experimental",
+        "low": "speculative", "very high": "proven", "moderate": "experimental",
+    }
+    confidence = _CONFIDENCE_MAP.get(raw_confidence.lower().strip(), "experimental") if isinstance(raw_confidence, str) else "experimental"
     source_ref = claim.get("source", "")
     body = claim.get("body", "")
     scope = claim.get("scope", "")
@@ -252,8 +258,8 @@ def _build_claim_content(claim: dict, agent: str, source_format: str | None = No
             if target not in edge_fields[rel]:
                 edge_fields[rel].append(target)
     for r in related_claims[:5]:
-        r_clean = r.replace(".md", "")
-        if r_clean not in edge_fields["related"]:
+        r_clean = r.replace(".md", "").strip("[]").strip()
+        if r_clean and r_clean not in edge_fields["related"]:
             edge_fields["related"].append(r_clean)
 
     edge_lines = []
@@ -301,6 +307,14 @@ def _build_entity_content(entity: dict, domain: str) -> str:
     description = entity.get("content", "")
 
     if description:
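+        # Strip code fences the LLM may have wrapped the content in: drop a leading "```lang" line, then a trailing "```" (the trailing check runs even when no leading fence was found)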
+        description = description.strip()
+        if description.startswith("```"):
+            first_nl = description.find("\n")
+            if first_nl != -1:
+                description = description[first_nl + 1:]
+        if description.endswith("```"):
+            description = description[:-3].rstrip()
         return description
 
     name = entity.get("filename", "").replace("-", " ").replace(".md", "").title()
@@ -435,6 +449,31 @@ async def _extract_one_source(
             content = _build_entity_content(e, domain)
             entity_files.append({"filename": filename, "domain": domain, "content": content})
 
+    # 6.5. Pre-filter near-duplicates BEFORE post-extract validation
+    # Uses same SequenceMatcher threshold as tier0. Catches duplicates cheaply ($0)
+    # before they create PRs and burn eval cycles.
+    if claim_files and existing_claims:
+        from difflib import SequenceMatcher as _SM
+        _DEDUP_THRESHOLD = 0.85
+        filtered = []
+        for cf in claim_files:
+            title_lower = Path(cf["filename"]).stem.replace("-", " ").lower()
+            title_words = set(title_lower.split()[:6])
+            is_dup = False
+            for existing in existing_claims:
+                existing_lower = existing.replace("-", " ").lower()
+                if len(title_words & set(existing_lower.split()[:6])) < 2:
+                    continue
+                if _SM(None, title_lower, existing_lower).ratio() >= _DEDUP_THRESHOLD:
+                    logger.info("Extract-dedup: skipping near-duplicate '%s' (matches '%s')", cf["filename"], existing)
+                    is_dup = True
+                    break
+            if not is_dup:
+                filtered.append(cf)
+        if len(filtered) < len(claim_files):
+            logger.info("Extract-dedup: filtered %d/%d near-duplicates", len(claim_files) - len(filtered), len(claim_files))
+        claim_files = filtered
+
     # 7. Post-extraction validation
     if claim_files:
         kept_claims, rejected_claims, stats = validate_and_fix_claims(
diff --git a/lib/extraction_prompt.py b/lib/extraction_prompt.py
index e48bb5e..29f846c 100644
--- a/lib/extraction_prompt.py
+++ b/lib/extraction_prompt.py
@@ -119,6 +119,7 @@ Set `contributor_thesis_extractable: true` if you extracted the contributor's th
             "These existing claims are topically related to this source. For each NEW claim you extract,",
             "check this list and specify connections in the `connections` array.\n",
         ]
+        high_sim = []
         for i, pa in enumerate(prior_art[:10], 1):
             title = pa.get("claim_title", "untitled")
             path = pa.get("claim_path", "")
@@ -128,7 +129,16 @@ Set `contributor_thesis_extractable: true` if you extracted the contributor's th
             pa_lines.append(f"{i}. **{title}** (`{filename}`, similarity: {score:.2f})")
             if desc:
                 pa_lines.append(f"   {desc}")
+            if score >= 0.75:
+                high_sim.append(title)
         pa_lines.append("")
+        if high_sim:
+            pa_lines.append("**WARNING — HIGH SIMILARITY MATCHES (score >= 0.75):**")
+            pa_lines.append("The following existing claims are very similar to themes in this source.")
+            pa_lines.append("Do NOT extract new claims that restate these — use ENRICHMENT instead:")
+            for hs in high_sim:
+                pa_lines.append(f"  - {hs}")
+            pa_lines.append("")
         connection_candidates = "\n".join(pa_lines)
     else:
         connection_candidates = ""
diff --git a/lib/validate.py b/lib/validate.py
index f064fb4..ef3be9b 100644
--- a/lib/validate.py
+++ b/lib/validate.py
@@ -140,7 +140,12 @@ def validate_schema(fm: dict) -> list[str]:
     valid_conf = schema.get("valid_confidence")
     confidence = fm.get("confidence")
     if valid_conf and confidence and confidence not in valid_conf:
-        violations.append(f"invalid_confidence:{confidence}")
+        # Common LLM aliases — tolerated here (no violation recorded); this check does not rewrite the value itself
+        _CONFIDENCE_ALIASES = {"high": "likely", "medium": "experimental", "low": "speculative", "very high": "proven", "moderate": "experimental"}
+        if isinstance(confidence, str) and confidence.lower().strip() in _CONFIDENCE_ALIASES:
+            pass  # Fixable by post-extract or fixer — don't gate on this
+        else:
+            violations.append(f"invalid_confidence:{confidence}")
 
     desc = fm.get("description")
     if isinstance(desc, str) and len(desc.strip()) < 10:
@@ -550,6 +555,16 @@ def tier05_mechanical_check(diff: str, existing_claims: set[str] | None = None)
         is_new = filepath in new_files
 
         if is_new:
+            # Strip code fences — LLM agents sometimes wrap content in ```markdown or ```yaml
+            stripped = content.strip()
+            if stripped.startswith("```"):
+                first_nl = stripped.find("\n")
+                if first_nl != -1:
+                    stripped = stripped[first_nl + 1:]
+                if stripped.endswith("```"):
+                    stripped = stripped[:-3].strip()
+                content = stripped
+
             fm, body = parse_frontmatter(content)
             if fm is None:
                 issues.append("frontmatter_schema")