From eee2b5c78bbbd6cc1ae8c1320d778cdabdd11e37 Mon Sep 17 00:00:00 2001 From: m3taversal Date: Wed, 1 Apr 2026 14:57:38 +0100 Subject: [PATCH] fix: 3 extraction bugs causing 0% approval rate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. Remove hardcoded [[_map]] dead link from reconstruct_claim_content 2. Add source status check + archive move after extraction (prevents re-extraction loops) 3. Fix wiki link slug→space normalization (resolves hyphens to spaces before stripping) Pentagon-Agent: Epimetheus <0144398e-4ed3-4fe2-95a3-3d72e1abf887> --- batch-extract-50.sh | 29 +++++++++++++++++++++++++++++ lib/post_extract.py | 24 +++++++++++++++++++----- openrouter-extract-v2.py | 2 +- 3 files changed, 49 insertions(+), 6 deletions(-) diff --git a/batch-extract-50.sh b/batch-extract-50.sh index 924403c..cb31402 100755 --- a/batch-extract-50.sh +++ b/batch-extract-50.sh @@ -100,6 +100,13 @@ for SOURCE in $SOURCES; do BASENAME=$(basename "$SOURCE" .md) BRANCH="extract/$BASENAME" + # Skip already-processed sources (status set by extraction script, merged to main) + if grep -q "^status: processed\|^status: enrichment\|^status: null-result" "$SOURCE" 2>/dev/null; then + echo "[$(date)] [$COUNT/$MAX] SKIP $BASENAME (already processed — status in frontmatter)" >> $LOG + SKIPPED=$((SKIPPED + 1)) + continue + fi + # Skip conversation archives — valuable content enters through standalone sources, # inline tags (SOURCE:/CLAIM:), and transcript review. Raw conversations produce # low-quality claims with schema failures. (Epimetheus session 4) @@ -244,6 +251,28 @@ Pentagon-Agent: Epimetheus <3D35839A-7722-4740-B93D-51157F7D5E70>" >> $LOG 2>&1 SUCCESS=$((SUCCESS + 1)) echo " -> SUCCESS ($CHANGED files)" >> $LOG + # Move source from queue/ to archive/ on main worktree (prevents re-extraction) + DOMAIN=$(grep -m1 "^domain:" "$MAIN_REPO/inbox/queue/$BASENAME.md" 2>/dev/null | sed 's/^domain:\s*//') + ARCHIVE_DIR="$MAIN_REPO/inbox/archive/${DOMAIN:-uncategorized}" + mkdir -p "$ARCHIVE_DIR" + if [ -f "$MAIN_REPO/inbox/queue/$BASENAME.md" ]; then + mv "$MAIN_REPO/inbox/queue/$BASENAME.md" "$ARCHIVE_DIR/$BASENAME.md" + cd $MAIN_REPO + git add "inbox/queue/$BASENAME.md" "inbox/archive/" 2>/dev/null + git commit -m "pipeline: archive $BASENAME after extraction + +Pentagon-Agent: Epimetheus <3D35839A-7722-4740-B93D-51157F7D5E70>" >> $LOG 2>&1 + for attempt in 1 2 3; do + git pull --rebase origin main >> $LOG 2>&1 + git push origin main >> $LOG 2>&1 && break + sleep 1 + done + cd $REPO + git fetch origin main >> $LOG 2>&1 + git reset --hard origin/main >> $LOG 2>&1 + echo " -> Archived $BASENAME to $ARCHIVE_DIR" >> $LOG + fi + # Back to main git checkout -f main >> $LOG 2>&1 diff --git a/lib/post_extract.py b/lib/post_extract.py index 7d033cb..7ce3aef 100644 --- a/lib/post_extract.py +++ b/lib/post_extract.py @@ -163,15 +163,29 @@ def fix_frontmatter(content: str, domain: str, agent: str) -> tuple[str, list[st def fix_wiki_links(content: str, existing_claims: set[str]) -> tuple[str, list[str]]: - """Strip brackets from broken wiki links, keeping the text. Returns (fixed_content, fixes).""" + """Fix or strip broken wiki links. Resolves slug→space mismatches before stripping. + + The LLM often generates wiki links as slugs (hyphens) but KB filenames use spaces. + Try normalizing hyphens→spaces before giving up and stripping brackets. + """ fixes = [] + # Build a lookup: normalized (lowercased, hyphens→spaces) → original stem + _normalized_lookup: dict[str, str] = {} + for stem in existing_claims: + _normalized_lookup[stem.lower().replace("-", " ")] = stem def replace_broken(match): link = match.group(1).strip() - if link not in existing_claims: - fixes.append(f"stripped_wiki_link:{link[:60]}") - return link # Keep text, remove brackets - return match.group(0) + if link in existing_claims: + return match.group(0) # Exact match — keep as-is + # Try normalizing slug to spaces + normalized = link.lower().replace("-", " ") + if normalized in _normalized_lookup: + resolved = _normalized_lookup[normalized] + fixes.append(f"resolved_wiki_link:{link[:40]}->{resolved[:40]}") + return f"[[{resolved}]]" + fixes.append(f"stripped_wiki_link:{link[:60]}") + return link # Keep text, remove brackets fixed = WIKI_LINK_RE.sub(replace_broken, content) return fixed, fixes diff --git a/openrouter-extract-v2.py b/openrouter-extract-v2.py index a7e7b24..6e50f24 100644 --- a/openrouter-extract-v2.py +++ b/openrouter-extract-v2.py @@ -262,7 +262,7 @@ def reconstruct_claim_content(claim, domain, agent): ] for r in related[:5]: lines.append(f"- [[{r}]]") - lines.extend(["", "Topics:", "- [[_map]]", ""]) + lines.extend(["", "Topics:", ""]) return "\n".join(lines)