"""Shared Qdrant vector search library for the Teleo knowledge base.

Provides embed + search + graph expansion as a reusable library.
Any consumer (Argus dashboard, Telegram bot, agent research) imports from here.

Layer 1: Qdrant vector search (semantic similarity)
Layer 2: Graph expansion (1-hop via frontmatter edges)
Layer 3: Left to the caller (agent context, domain filtering)

Owner: Epimetheus
"""

import glob
import json
import logging
import os
import re
import urllib.request
from pathlib import Path

from . import config

logger = logging.getLogger("pipeline.search")

# --- Config (all from environment or config.py defaults) ---

QDRANT_URL = os.environ.get("QDRANT_URL", "http://localhost:6333")
QDRANT_COLLECTION = os.environ.get("QDRANT_COLLECTION", "teleo-claims")
EMBEDDING_MODEL = "text-embedding-3-small"

_OPENROUTER_KEY: str | None = None

WIKI_LINK_RE = re.compile(r"\[\[([^\]]+)\]\]")

# Structural files that should never be included in graph expansion results.
# These are indexes/MOCs, not claims — expanding them pulls entire domains.
STRUCTURAL_FILES = {"_map.md", "_overview.md"}

# Frontmatter fields that carry explicit relationship edges.
_EXPLICIT_EDGE_FIELDS = ("supports", "challenges", "depends_on", "related")


def _get_api_key() -> str | None:
    """Load the OpenRouter API key (cached after the first successful read).

    The key file ``config.SECRETS_DIR / "openrouter-key"`` is tried first; if
    it is missing, unreadable or blank, the ``OPENROUTER_API_KEY`` environment
    variable is used instead.

    Returns:
        The key string, or None when no source provides one.
    """
    global _OPENROUTER_KEY
    if _OPENROUTER_KEY:
        return _OPENROUTER_KEY

    key_file = config.SECRETS_DIR / "openrouter-key"
    file_key = ""
    try:
        if key_file.exists():
            file_key = key_file.read_text().strip()
    except (OSError, UnicodeError) as e:
        # An unreadable key file must not take down the caller; fall back to env.
        logger.warning("Could not read OpenRouter key file %s: %s", key_file, e)

    # A blank key file must not shadow a valid environment variable.
    _OPENROUTER_KEY = file_key or os.environ.get("OPENROUTER_API_KEY")
    return _OPENROUTER_KEY


# --- Layer 1: Vector search ---


def embed_query(text: str) -> list[float] | None:
    """Embed a query string via OpenRouter (OpenAI-compatible endpoint).

    Only the first 8000 characters of ``text`` are sent.

    Returns:
        The embedding vector (1536-dim for the configured model), or None on
        any failure (missing API key, network error, unexpected response).
    """
    api_key = _get_api_key()
    if not api_key:
        logger.error("No OpenRouter API key available for embedding")
        return None

    payload = json.dumps({
        "model": f"openai/{EMBEDDING_MODEL}",
        "input": text[:8000],
    }).encode()
    req = urllib.request.Request(
        "https://openrouter.ai/api/v1/embeddings",
        data=payload,
        headers={
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json",
        },
    )
    try:
        with urllib.request.urlopen(req, timeout=15) as resp:
            data = json.loads(resp.read())
        return data["data"][0]["embedding"]
    except Exception as e:
        # Library boundary: callers get None instead of a raised exception.
        logger.error("Embedding failed: %s", e)
        return None


def search_qdrant(vector: list[float], limit: int = 10,
                  domain: str | None = None, confidence: str | None = None,
                  exclude: list[str] | None = None,
                  score_threshold: float = 0.3,
                  offset: int = 0) -> list[dict]:
    """Search the Qdrant collection for the nearest claims.

    Args:
        vector: Query embedding.
        limit: Maximum number of hits to return.
        domain: If set, only match points whose ``domain`` payload equals it.
        confidence: If set, only match points with this ``confidence`` payload.
        exclude: Claim paths to exclude (matched against ``claim_path``).
        score_threshold: Minimum similarity score passed to Qdrant.
        offset: Skip first N results (Qdrant native offset for pagination).

    Returns:
        List of hits: [{id, score, payload: {claim_path, claim_title, ...}}],
        or an empty list when the request fails.
    """
    must_filters = []
    if domain:
        must_filters.append({"key": "domain", "match": {"value": domain}})
    if confidence:
        must_filters.append({"key": "confidence", "match": {"value": confidence}})

    must_not_filters = [
        {"key": "claim_path", "match": {"value": path}}
        for path in (exclude or [])
    ]

    body = {
        "vector": vector,
        "limit": limit,
        "with_payload": True,
        "score_threshold": score_threshold,
    }
    if offset > 0:
        body["offset"] = offset
    if must_filters or must_not_filters:
        body["filter"] = {}
        if must_filters:
            body["filter"]["must"] = must_filters
        if must_not_filters:
            body["filter"]["must_not"] = must_not_filters

    req = urllib.request.Request(
        f"{QDRANT_URL}/collections/{QDRANT_COLLECTION}/points/search",
        data=json.dumps(body).encode(),
        headers={"Content-Type": "application/json"},
    )
    try:
        with urllib.request.urlopen(req, timeout=10) as resp:
            data = json.loads(resp.read())
        return data.get("result", [])
    except Exception as e:
        # Library boundary: a failed search degrades to "no hits".
        logger.error("Qdrant search failed: %s", e)
        return []


# --- Layer 2: Graph expansion ---


def _split_frontmatter(text: str) -> tuple[str, str] | None:
    """Split a document into (frontmatter_text, body).

    Returns None when the text does not start with ``---`` or has no closing
    ``---`` line.
    """
    if not text.startswith("---"):
        return None
    end = text.find("\n---", 3)
    if end == -1:
        return None
    # Skip the 4 characters of the closing "\n---" marker.
    return text[3:end], text[end + 4:]


def _parse_frontmatter_edges(path: Path) -> dict:
    """Extract relationship edges from a claim's frontmatter.

    Handles both YAML formats:
        depends_on: ["item1", "item2"]   (inline list)
        depends_on:                      (multi-line list)
          - item1
          - item2

    Returns {supports: [...], challenges: [...], depends_on: [...],
    related: [...], wiki_links: [...]}. wiki_links are separated from
    explicit related edges for differential weighting. All lists are empty
    when the file cannot be read or has no frontmatter.
    """
    edges = {field: [] for field in _EXPLICIT_EDGE_FIELDS}
    edges["wiki_links"] = []

    try:
        text = path.read_text(errors="replace")
    except (OSError, ValueError) as e:
        logger.debug("Could not read %s for edge parsing: %s", path, e)
        return edges

    parts = _split_frontmatter(text)
    if parts is None:
        return edges
    fm_text, body = parts

    # Use YAML parser for reliable edge extraction. Parsing is best-effort:
    # a malformed frontmatter (or missing yaml package) still leaves wiki
    # links from the body available.
    fm = None
    try:
        import yaml
        fm = yaml.safe_load(fm_text)
    except Exception as e:
        logger.debug("Frontmatter parse failed for %s: %s", path, e)

    if isinstance(fm, dict):
        for field in _EXPLICIT_EDGE_FIELDS:
            val = fm.get(field)
            if isinstance(val, list):
                edges[field] = [str(v).strip() for v in val if v]
            elif isinstance(val, str) and val.strip():
                edges[field] = [val.strip()]

    # Extract wiki links from body as separate edge type (lower weight),
    # skipping anything already covered by an explicit edge.
    explicit = {target for field in _EXPLICIT_EDGE_FIELDS for target in edges[field]}
    seen_links: set[str] = set()
    for raw_link in WIKI_LINK_RE.findall(body):
        link = raw_link.strip()
        if link and link not in explicit and link not in seen_links:
            seen_links.add(link)
            edges["wiki_links"].append(link)

    return edges


def _resolve_claim_path(name: str, repo_root: Path) -> Path | None:
    """Resolve a claim name (from frontmatter edge or wiki link) to a file path.

    Handles both naming conventions:
    - "GLP-1 receptor agonists are..." → "GLP-1 receptor agonists are....md" (spaces)
    - "glp-1-persistence-drops..." → "glp-1-persistence-drops....md" (slugified)

    Checks domains/, core/, foundations/, decisions/ subdirectories
    (recursively) and returns the first match, or None.
    """
    # Try exact name first (spaces in filename), then slugified
    candidates = [name]
    slug = name.lower().replace(" ", "-").replace("_", "-")
    if slug != name:
        candidates.append(slug)

    for subdir in ("domains", "core", "foundations", "decisions"):
        base = repo_root / subdir
        if not base.is_dir():
            continue
        for candidate_name in candidates:
            # Claim titles may contain "?", "*" or "[" — escape them so they
            # are matched literally instead of acting as glob wildcards.
            pattern = f"{glob.escape(candidate_name)}.md"
            try:
                match = next(iter(base.rglob(pattern)), None)
            except (ValueError, NotImplementedError):
                # pathlib rejects some patterns (e.g. absolute ones);
                # treat an unresolvable name as "not found".
                match = None
            if match is not None:
                return match
    return None


def _read_claim_title(path: Path, fallback: str) -> str:
    """Return the ``name`` (or ``title``) from a claim's frontmatter.

    Falls back to ``fallback`` when the file is unreadable, has no
    frontmatter, or defines neither field with a non-empty value.
    """
    try:
        parts = _split_frontmatter(path.read_text(errors="replace"))
        if parts is None:
            return fallback
        import yaml
        fm = yaml.safe_load(parts[0])
    except Exception as e:
        # Title lookup is cosmetic; never let it abort graph expansion.
        logger.debug("Could not read title from %s: %s", path, e)
        return fallback
    if isinstance(fm, dict):
        return str(fm.get("name") or fm.get("title") or fallback)
    return fallback


def graph_expand(seed_paths: list[str], repo_root: Path | None = None,
                 max_expanded: int = 30,
                 challenge_weight: float = 1.5,
                 seen: set[str] | None = None) -> list[dict]:
    """Layer 2: Expand seed claims 1-hop through knowledge graph edges.

    Traverses supports/challenges/depends_on/related/wiki_links edges in frontmatter.
    Edge weights: challenges ``challenge_weight`` (default 1.5x), depends_on 1.25x,
    supports/related 1.0x, wiki_links 0.5x.
    Results sorted by weight descending so cap cuts low-value edges first.

    Args:
        seed_paths: Claim paths (relative to the repo root) to expand from.
        repo_root: Knowledge-base root; defaults to ``config.MAIN_WORKTREE``.
        max_expanded: Maximum number of expanded claims to return.
        challenge_weight: Weight applied to challenge edges.
        seen: Optional set of paths already matched (e.g. from keyword search) to exclude.

    Returns list of {claim_path, claim_title, edge_type, edge_weight, from_claim}.
    Excludes claims already in seed_paths or seen set.
    """
    # NOTE(review): "challenged_by" has a weight here, but
    # _parse_frontmatter_edges never emits that edge type — confirm intent.
    edge_weights = {
        "challenges": challenge_weight,
        "challenged_by": challenge_weight,
        "depends_on": 1.25,
        "supports": 1.0,
        "related": 1.0,
        "wiki_links": 0.5,
    }

    root = repo_root or config.MAIN_WORKTREE
    all_expanded = []
    visited = set(seed_paths)
    if seen:
        visited.update(seen)

    for seed_path in seed_paths:
        full_path = root / seed_path
        if not full_path.exists():
            continue

        edges = _parse_frontmatter_edges(full_path)
        for edge_type, targets in edges.items():
            weight = edge_weights.get(edge_type, 1.0)
            for target_name in targets:
                target_path = _resolve_claim_path(target_name, root)
                if target_path is None:
                    continue

                rel_path = str(target_path.relative_to(root))
                if rel_path in visited:
                    continue
                # Skip structural files (MOCs/indexes) — they pull entire domains
                if target_path.name in STRUCTURAL_FILES:
                    continue
                visited.add(rel_path)

                all_expanded.append({
                    "claim_path": rel_path,
                    "claim_title": _read_claim_title(target_path, target_name),
                    "edge_type": edge_type,
                    "edge_weight": weight,
                    "from_claim": seed_path,
                })

    # Sort by weight descending so cap cuts lowest-value edges first
    all_expanded.sort(key=lambda x: x["edge_weight"], reverse=True)
    # Clamp so a negative cap yields nothing rather than slicing from the end.
    return all_expanded[:max(max_expanded, 0)]


# --- Combined search (Layer 1 + Layer 2) ---

# Default thresholds — lowered Apr 5 after production audit showed 0 vector hits.
# text-embedding-3-small scores 0.50-0.60 on conceptual matches (e.g. "risks in
# investing" vs specific claims). 0.70 rejected every result. 0.50/0.40 lets
# relevant claims through while still filtering noise.
PASS1_LIMIT = 5 PASS1_THRESHOLD = 0.50 PASS2_LIMIT = 5 PASS2_THRESHOLD = 0.40 HARD_CAP = 10 def _dedup_hits(hits: list[dict], seen: set[str]) -> list[dict]: """Filter Qdrant hits: dedup by claim_path, exclude structural files.""" results = [] for hit in hits: payload = hit.get("payload", {}) claim_path = payload.get("claim_path", "") if claim_path in seen: continue if claim_path.split("/")[-1] in STRUCTURAL_FILES: continue seen.add(claim_path) results.append({ "claim_title": payload.get("claim_title", ""), "claim_path": claim_path, "score": round(hit.get("score", 0), 4), "domain": payload.get("domain", ""), "confidence": payload.get("confidence", ""), "snippet": payload.get("snippet", "")[:200], "type": payload.get("type", "claim"), }) return results def _sort_results(direct: list[dict], expanded: list[dict]) -> list[dict]: """Sort combined results: similarity desc → challenged_by → other expansion. Sort order is load-bearing: LLMs have primacy bias, so best claims first. """ # Direct results already sorted by Qdrant (cosine desc) sorted_direct = sorted(direct, key=lambda x: x.get("score", 0), reverse=True) # Expansion: challenged_by first (counterpoints), then rest by weight challenged = [e for e in expanded if e.get("edge_type") == "challenges"] other_expanded = [e for e in expanded if e.get("edge_type") != "challenges"] challenged.sort(key=lambda x: x.get("edge_weight", 0), reverse=True) other_expanded.sort(key=lambda x: x.get("edge_weight", 0), reverse=True) return sorted_direct + challenged + other_expanded def search(query: str, expand: bool = False, domain: str | None = None, confidence: str | None = None, exclude: list[str] | None = None) -> dict: """Two-pass semantic search: embed query, search Qdrant, optionally expand. Pass 1 (expand=False, default): Top 5 claims from Qdrant, score >= 0.70. Sufficient for ~80% of queries. Fast and focused. 
Pass 2 (expand=True): Next 5 claims (offset=5, score >= 0.60) plus graph-expanded claims (challenged_by, related edges). Hard cap 10 total. Agent calls this only when pass 1 didn't answer the question. Returns { "query": str, "direct_results": [...], # Layer 1 Qdrant hits (sorted by score desc) "expanded_results": [...], # Layer 2 graph expansion (challenges first) "total": int, } """ vector = embed_query(query) if vector is None: return {"query": query, "direct_results": [], "expanded_results": [], "total": 0, "error": "embedding_failed"} # --- Pass 1: Top 5, high threshold --- hits = search_qdrant(vector, limit=PASS1_LIMIT, domain=domain, confidence=confidence, exclude=exclude, score_threshold=PASS1_THRESHOLD) seen_paths: set[str] = set() if exclude: seen_paths.update(exclude) direct = _dedup_hits(hits, seen_paths) expanded = [] if expand: # --- Pass 2: Next 5 from Qdrant (lower threshold, offset) --- pass2_hits = search_qdrant(vector, limit=PASS2_LIMIT, domain=domain, confidence=confidence, exclude=exclude, score_threshold=PASS2_THRESHOLD, offset=PASS1_LIMIT) pass2_direct = _dedup_hits(pass2_hits, seen_paths) direct.extend(pass2_direct) # Graph expansion on all direct results (pass 1 + pass 2 seeds) seed_paths = [r["claim_path"] for r in direct] remaining_cap = HARD_CAP - len(direct) if remaining_cap > 0: expanded = graph_expand(seed_paths, max_expanded=remaining_cap, seen=seen_paths) # Enforce hard cap across all results all_sorted = _sort_results(direct, expanded)[:HARD_CAP] # Split back into direct vs expanded for backward compat direct_paths = {r["claim_path"] for r in direct} final_direct = [r for r in all_sorted if r.get("claim_path") in direct_paths] final_expanded = [r for r in all_sorted if r.get("claim_path") not in direct_paths] return { "query": query, "direct_results": final_direct, "expanded_results": final_expanded, "total": len(all_sorted), } # --- Duplicate detection --- def check_duplicate(text: str, threshold: float = 0.85, domain: str | None 
= None) -> dict: """Check if a claim/text is a near-duplicate of existing KB content. Embeds the text, searches Qdrant, returns top-3 matches with scores. Thresholds: >=0.85 likely duplicate, 0.70-0.85 check manually, <0.70 novel. Args: text: The claim text to check. threshold: Minimum score to flag as potential duplicate (default 0.85). domain: Optional domain filter. Returns: { "query": str, "is_duplicate": bool, # True if any match >= threshold "highest_score": float, # Best match score "verdict": str, # "duplicate" | "check_manually" | "novel" "matches": [ # Top 3 matches {"score": float, "claim_path": str, "claim_title": str, "domain": str} ] } """ vector = embed_query(text) if vector is None: return {"query": text[:100], "is_duplicate": False, "highest_score": 0, "verdict": "error", "matches": [], "error": "embedding_failed"} hits = search_qdrant(vector, limit=3, domain=domain, score_threshold=0.3) matches = [] for hit in hits: payload = hit.get("payload", {}) matches.append({ "score": round(hit.get("score", 0), 4), "claim_path": payload.get("claim_path", ""), "claim_title": payload.get("claim_title", ""), "domain": payload.get("domain", ""), }) highest = matches[0]["score"] if matches else 0.0 if highest >= threshold: verdict = "duplicate" elif highest >= 0.70: verdict = "check_manually" else: verdict = "novel" return { "query": text[:100], "is_duplicate": highest >= threshold, "highest_score": highest, "verdict": verdict, "matches": matches, }