#!/bin/bash
# Reconcile source archive status: mark sources as processed if claims
# already exist.
#
# Usage: ./reconcile-source-status.sh [--apply]
#   Default:  dry-run (preview only)
#   --apply:  actually modify files
#
# Environment:
#   CODEX_DIR  root of the codex checkout (defaults to the original
#              hard-coded location below).
#
# For every *.md file under $ARCHIVE_DIR whose first `status:` line is
# `unprocessed`:
#   - a title that looks like a test/spam entry  -> status: null-result
#   - URL or title text found under $DOMAINS_DIR -> status: processed
#   - anything else is left untouched and counted as still unprocessed.

CODEX_DIR="${CODEX_DIR:-/Users/coryabdalla/Pentagon/teleo-codex}"
ARCHIVE_DIR="$CODEX_DIR/inbox/archive"
DOMAINS_DIR="$CODEX_DIR/domains"

#######################################
# Print the value of the first "<key>:" line of a file.
# Arguments: $1 - file, $2 - key name (e.g. status)
# Outputs:   the text after "<key>:" and any spaces; nothing if absent
#######################################
read_field() {
  local file=$1 key=$2
  # 2>/dev/null is deliberate: an unreadable file is treated as "no field".
  sed -n "/^${key}:/{s/^${key}: *//p;q;}" "$file" 2>/dev/null
}

#######################################
# Strip any run of leading/trailing double quotes from stdin.
#######################################
strip_quotes() {
  sed 's/^"*//;s/"*$//'
}

#######################################
# Succeeds when the title looks like a test/spam submission:
# an optional "Futardio: " prefix, then "test" followed by space or dash.
# Arguments: $1 - title
#######################################
is_test_title() {
  grep -qiE '^(Futardio: )?test[ -]' <<<"$1"
}

#######################################
# Succeeds when a literal string occurs anywhere under $DOMAINS_DIR.
# The needle is matched as a fixed string (-F) so regex metacharacters in
# URLs/titles cannot break or distort the search, and `--` protects
# needles that start with a dash.
# Globals:   DOMAINS_DIR (read)
# Arguments: $1 - needle, $2 - optional extra grep flag (e.g. -i)
#######################################
claims_contain() {
  local needle=$1 extra=${2:-}
  # 2>/dev/null is deliberate: unreadable files in the tree are ignored.
  grep -rqF ${extra:+"$extra"} -- "$needle" "$DOMAINS_DIR" 2>/dev/null
}

#######################################
# Succeeds when a distinctive part of the URL is cited in the claims.
# Tries, in order: the last run of 20+ alphanumerics (the unique
# hash/slug), then the last path segment if longer than 10 characters.
# Arguments: $1 - url (may be empty)
#######################################
url_in_claims() {
  local url=$1 key seg
  [[ -n "$url" ]] || return 1

  key=$(grep -oE '[A-Za-z0-9]{20,}' <<<"$url" | tail -n 1)
  if [[ -n "$key" ]] && claims_contain "$key"; then
    return 0
  fi

  # Text after the final "/"; empty when the URL ends with a slash.
  seg=${url##*/}
  if ((${#seg} > 10)) && claims_contain "$seg"; then
    return 0
  fi
  return 1
}

#######################################
# Succeeds when the title is cited in the claims (case-insensitive).
# The search key is the first 30 characters after removing a leading
# "Futardio: "; keys of 8 characters or fewer are too generic to trust.
# Arguments: $1 - title (may be empty)
#######################################
title_in_claims() {
  local key=${1#Futardio: }
  key=${key:0:30}
  ((${#key} > 8)) && claims_contain "$key" -i
}

#######################################
# Rewrite the first "status: unprocessed" line of a file to a new status
# and, unless the file already has a processed_by: line, add provenance
# lines directly after it.
# The edit is written to a temp file next to the target and moved into
# place, so it does not depend on BSD- or GNU-specific `sed -i`.
# Arguments: $1 - file, $2 - new status, $3 - text for the notes: line
# Returns:   0 on success, non-zero if the file could not be rewritten
#######################################
mark_source() {
  local file=$1 new_status=$2 note=$3
  local add_meta=1 today tmp

  if grep -q '^processed_by:' "$file"; then
    add_meta=0
  fi
  today=$(date +%Y-%m-%d)

  tmp=$(mktemp "${file}.XXXXXX") || return 1
  # cp -p seeds the temp file with the original's mode so the final mv
  # does not change the file's permissions.
  if cp -p "$file" "$tmp" \
    && awk -v st="$new_status" -v note="$note" \
      -v today="$today" -v add_meta="$add_meta" '
        !patched && /^status: *unprocessed$/ {
          print "status: " st
          if (add_meta) {
            print "processed_by: epimetheus-reconcile"
            print "processed_date: " today
            print "notes: \"" note "\""
          }
          patched = 1
          next
        }
        { print }
      ' "$file" >"$tmp" \
    && mv -f "$tmp" "$file"; then
    return 0
  fi
  rm -f "$tmp"
  return 1
}

#######################################
# Entry point: scan the archive, report, and (with --apply) update files.
# Globals:   ARCHIVE_DIR, DOMAINS_DIR (read)
# Arguments: $1 - optional "--apply"
# Returns:   0 on success, 1 if a required directory is missing
#######################################
main() {
  local mode="dry-run"
  if [[ "${1:-}" == "--apply" ]]; then
    mode="apply"
  fi

  local dir
  for dir in "$ARCHIVE_DIR" "$DOMAINS_DIR"; do
    if [[ ! -d "$dir" ]]; then
      printf 'error: directory not found: %s\n' "$dir" >&2
      return 1
    fi
  done

  echo "=== Source Status Reconciliation ==="
  echo "Mode: $mode"
  echo ""

  local matched=0 null_result=0 skipped=0 already_ok=0 total=0
  local src status url title fname match_type

  while IFS= read -r -d '' src; do
    total=$((total + 1))

    # Only process unprocessed sources.
    status=$(read_field "$src" status)
    if [[ "$status" != "unprocessed" ]]; then
      already_ok=$((already_ok + 1))
      continue
    fi

    url=$(read_field "$src" url | strip_quotes)
    title=$(read_field "$src" title | strip_quotes)
    fname=${src##*/}

    # Test/spam wins over any claim match, so check it first and skip the
    # recursive searches entirely for those files.
    if is_test_title "$title"; then
      printf ' NULL-RESULT (test/spam): %s\n' "$fname"
      null_result=$((null_result + 1))
      if [[ "$mode" == "apply" ]]; then
        mark_source "$src" "null-result" \
          "auto-reconciled: test/spam source" \
          || printf 'warning: could not update %s\n' "$src" >&2
      fi
      continue
    fi

    match_type=""
    if url_in_claims "$url"; then
      match_type="url"
    fi
    if title_in_claims "$title"; then
      match_type="${match_type:+$match_type+}title"
    fi

    if [[ -z "$match_type" ]]; then
      skipped=$((skipped + 1))
      continue
    fi

    printf ' PROCESSED (%s): %s\n' "$match_type" "$fname"
    matched=$((matched + 1))
    if [[ "$mode" == "apply" ]]; then
      mark_source "$src" "processed" \
        "auto-reconciled: claims found matching this source" \
        || printf 'warning: could not update %s\n' "$src" >&2
    fi
  done < <(find "$ARCHIVE_DIR" -name '*.md' -type f -print0)

  echo ""
  echo "=== Summary ==="
  echo "Already correct status: $already_ok"
  echo "Matched → processed: $matched"
  echo "Test/spam → null-result: $null_result"
  echo "Still unprocessed: $skipped"
  echo "Total archive files: $total"
}

main "$@"