From eee2b5c78bbbd6cc1ae8c1320d778cdabdd11e37 Mon Sep 17 00:00:00 2001
From: m3taversal <m3taversal@gmail.com>
Date: Wed, 1 Apr 2026 14:57:38 +0100
Subject: [PATCH] fix: 3 extraction bugs causing 0% approval rate
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

1. Remove hardcoded [[_map]] dead link from reconstruct_claim_content
2. Add source status check + archive move after extraction (prevents re-extraction loops)
3. Fix wiki link slug→space normalization (resolves hyphens to spaces before stripping)

Pentagon-Agent: Epimetheus <0144398e-4ed3-4fe2-95a3-3d72e1abf887>
---
 batch-extract-50.sh      | 29 +++++++++++++++++++++++++++++
 lib/post_extract.py      | 24 +++++++++++++++++++-----
 openrouter-extract-v2.py |  2 +-
 3 files changed, 49 insertions(+), 6 deletions(-)

diff --git a/batch-extract-50.sh b/batch-extract-50.sh
index 924403c..cb31402 100755
--- a/batch-extract-50.sh
+++ b/batch-extract-50.sh
@@ -100,6 +100,13 @@ for SOURCE in $SOURCES; do
     BASENAME=$(basename "$SOURCE" .md)
     BRANCH="extract/$BASENAME"
 
+    # Skip already-processed sources (status set by extraction script, merged to main); -E keeps the alternation portable
+    if grep -qE "^status:[[:space:]]*(processed|enrichment|null-result)" "$SOURCE" 2>/dev/null; then
+        echo "[$(date)] [$COUNT/$MAX] SKIP $BASENAME (already processed — status in frontmatter)" >> $LOG
+        SKIPPED=$((SKIPPED + 1))
+        continue
+    fi
+
     # Skip conversation archives — valuable content enters through standalone sources,
     # inline tags (SOURCE:/CLAIM:), and transcript review. Raw conversations produce
     # low-quality claims with schema failures. (Epimetheus session 4)
@@ -244,6 +251,28 @@ Pentagon-Agent: Epimetheus <3D35839A-7722-4740-B93D-51157F7D5E70>" >> $LOG 2>&1
     SUCCESS=$((SUCCESS + 1))
     echo "  -> SUCCESS ($CHANGED files)" >> $LOG
 
+    # Move source from queue/ to archive/<domain>/ on main worktree (prevents re-extraction); domain is trimmed of trailing whitespace/CR
+    DOMAIN=$(grep -m1 "^domain:" "$MAIN_REPO/inbox/queue/$BASENAME.md" 2>/dev/null | sed -E 's/^domain:[[:space:]]*//; s/[[:space:]]+$//')
+    ARCHIVE_DIR="$MAIN_REPO/inbox/archive/${DOMAIN:-uncategorized}"
+    if [ -f "$MAIN_REPO/inbox/queue/$BASENAME.md" ]; then
+        mkdir -p "$ARCHIVE_DIR"
+        mv "$MAIN_REPO/inbox/queue/$BASENAME.md" "$ARCHIVE_DIR/$BASENAME.md"
+        cd "$MAIN_REPO"
+        git add "inbox/queue/$BASENAME.md" "inbox/archive/${DOMAIN:-uncategorized}/$BASENAME.md" 2>/dev/null
+        git commit -m "pipeline: archive $BASENAME after extraction
+
+Pentagon-Agent: Epimetheus <3D35839A-7722-4740-B93D-51157F7D5E70>" >> $LOG 2>&1
+        for attempt in 1 2 3; do
+            git pull --rebase origin main >> $LOG 2>&1
+            git push origin main >> $LOG 2>&1 && break
+            sleep 1
+        done
+        cd "$REPO"
+        git fetch origin main >> $LOG 2>&1
+        git checkout -f -B main origin/main >> $LOG 2>&1  # move main itself to origin/main, not whichever branch is checked out
+        echo "  -> Archived $BASENAME to $ARCHIVE_DIR" >> $LOG
+    fi
+
     # Back to main
     git checkout -f main >> $LOG 2>&1
 
diff --git a/lib/post_extract.py b/lib/post_extract.py
index 7d033cb..7ce3aef 100644
--- a/lib/post_extract.py
+++ b/lib/post_extract.py
@@ -163,15 +163,29 @@ def fix_frontmatter(content: str, domain: str, agent: str) -> tuple[str, list[st
 
 
 def fix_wiki_links(content: str, existing_claims: set[str]) -> tuple[str, list[str]]:
-    """Strip brackets from broken wiki links, keeping the text. Returns (fixed_content, fixes)."""
+    """Repair or strip wiki links not found in existing_claims; return (fixed_content, fixes).
+
+    Exact matches stay untouched. Otherwise the link is lowercased with hyphens→spaces (the LLM often
+    emits slugs while KB filenames use spaces) and looked up; on a miss the brackets are dropped.
+    """
     fixes = []
+    # Normalized (lowercased, hyphens→spaces) → original stem; sorted + setdefault so ties resolve the same every run
+    _normalized_lookup: dict[str, str] = {}
+    for stem in sorted(existing_claims):
+        _normalized_lookup.setdefault(stem.lower().replace("-", " "), stem)
 
     def replace_broken(match):
         link = match.group(1).strip()
-        if link not in existing_claims:
-            fixes.append(f"stripped_wiki_link:{link[:60]}")
-            return link  # Keep text, remove brackets
-        return match.group(0)
+        if link in existing_claims:
+            return match.group(0)  # Exact match — keep as-is
+        # Try normalizing slug to spaces
+        normalized = link.lower().replace("-", " ")
+        resolved = _normalized_lookup.get(normalized)
+        if resolved is not None:
+            fixes.append(f"resolved_wiki_link:{link[:40]}->{resolved[:40]}")
+            return f"[[{resolved}]]"
+        fixes.append(f"stripped_wiki_link:{link[:60]}")
+        return link  # Keep text, remove brackets
 
     fixed = WIKI_LINK_RE.sub(replace_broken, content)
     return fixed, fixes
diff --git a/openrouter-extract-v2.py b/openrouter-extract-v2.py
index a7e7b24..6e50f24 100644
--- a/openrouter-extract-v2.py
+++ b/openrouter-extract-v2.py
@@ -262,7 +262,7 @@ def reconstruct_claim_content(claim, domain, agent):
     ]
     for r in related[:5]:
         lines.append(f"- [[{r}]]")
-    lines.extend(["", "Topics:", "- [[_map]]", ""])
+    lines.extend(["", "Topics:", ""])
     return "\n".join(lines)