tendril/kb/scripts/generate-index.sh

#!/bin/bash

# KB Index Generation Script
# Generates kb/_index.md with searchable metadata from all KB files

set -e

# Get the script directory and KB root directory
# (the KB root is the parent of the directory holding this script).
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
KB_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
INDEX_FILE="$KB_ROOT/_index.md"

echo "Generating KB index..."

# Create temporary files for indexing.
# The second mktemp form is a fallback for implementations that require a
# template argument.
TMP_DIR=$(mktemp -d 2>/dev/null || mktemp -d -t 'kb-index')
# Single-quoted on purpose: $TMP_DIR is expanded when the trap fires, so the
# path is never re-parsed as shell code (a path containing a quote character
# would otherwise break the cleanup command).
trap 'rm -rf -- "$TMP_DIR"' EXIT

# Intermediate pipe-delimited index files, one record per line.
TOPICS_FILE="$TMP_DIR/topics.txt"
TAGS_FILE="$TMP_DIR/tags.txt"
PHASES_FILE="$TMP_DIR/phases.txt"
FILES_FILE="$TMP_DIR/files.txt"

touch "$TOPICS_FILE" "$TAGS_FILE" "$PHASES_FILE" "$FILES_FILE"

# Categories to scan (top-level directories under the KB root)
CATEGORIES=("01_projects" "02_systems" "03_research" "04_design" "05_decisions" "06_glossary" "07_playbooks" "08_archive")

# Print the YAML frontmatter of a file.
# Arguments: $1 - path to the file
# Outputs:   the lines strictly between the first and the second "---" line
#            (the delimiter lines themselves are not printed). If there is no
#            second "---", everything after the first one is printed. Nothing
#            is printed when the file has no "---" line at all.
#            A trailing carriage return is removed from every line so that
#            CRLF-encoded files are handled like LF-encoded ones.
# Returns:   1 if $1 is not a regular file, 0 otherwise
extract_frontmatter() {
    local file="$1"
    if [[ ! -f "$file" ]]; then
        return 1
    fi

    # "next" on the opening delimiter keeps the "---" line itself out of the
    # output; the closing delimiter ends processing before it is printed.
    awk '
        { sub(/\r$/, "") }
        /^---$/ { if (++count == 2) exit; next }
        count == 1
    ' "$file" 2>/dev/null || echo ""
}

# Print the value of a simple (single-line scalar) YAML field.
# Arguments: $1 - frontmatter text
#            $2 - field name, matched literally at the start of a line
# Outputs:   the value of the first "<field>:" line, with surrounding
#            whitespace removed and one pair of matching surrounding quotes
#            ("..." or '...') stripped; nothing if the field is absent
extract_yaml_simple() {
    local frontmatter="$1"
    local field="$2"

    # index() compares the key as a plain string, so regex metacharacters in
    # the field name cannot change what is matched. \047 is a single quote.
    printf '%s\n' "$frontmatter" | awk -v key="${field}:" '
        index($0, key) == 1 {
            value = substr($0, length(key) + 1)
            gsub(/^[[:space:]]+|[[:space:]]+$/, "", value)
            n = length(value)
            if (n >= 2) {
                first = substr(value, 1, 1)
                last = substr(value, n, 1)
                # Only unwrap when both ends carry the same quote character,
                # so a value that merely ends in a quote is left intact.
                if (first == last && (first == "\"" || first == "\047")) {
                    value = substr(value, 2, n - 2)
                }
            }
            print value
            exit
        }
    '
}

# Print the items of a YAML sequence field, one per line.
# Arguments: $1 - frontmatter text
#            $2 - field name, matched literally at the start of a line
# Outputs:   one item per line with surrounding whitespace and one pair of
#            matching surrounding quotes removed; empty items are skipped.
# Supported forms:
#   field: [a, "b c", 'd']        single-line flow sequence
#   field:                        block sequence; items may be indented
#     - a
#     - "b c"
# A block sequence ends at the next line that starts in column 0 and is not a
# list item. A scalar value on the "field:" line itself is ignored.
extract_yaml_array() {
    local frontmatter="$1"
    local field="$2"

    # Items are parsed in awk rather than with a grep character class: inside
    # a POSIX bracket expression a backslash is literal, which makes a class
    # that excludes "[" and "]" easy to get wrong. \047 is a single quote.
    printf '%s\n' "$frontmatter" | awk -v key="${field}:" '
        # Trim, strip one pair of matching surrounding quotes, print if non-empty.
        function emit(item,    n, q) {
            gsub(/^[[:space:]]+|[[:space:]]+$/, "", item)
            n = length(item)
            if (n >= 2) {
                q = substr(item, 1, 1)
                if ((q == "\"" || q == "\047") && substr(item, n, 1) == q) {
                    item = substr(item, 2, n - 2)
                    gsub(/^[[:space:]]+|[[:space:]]+$/, "", item)
                }
            }
            if (item != "") print item
        }

        # Split the inside of a flow sequence on commas that are not quoted.
        # A quote only opens a quoted run when it is the first visible
        # character of an item, so an apostrophe inside a bare word is data.
        function emit_flow(body,    i, c, quote, item, len) {
            quote = ""
            item = ""
            len = length(body)
            for (i = 1; i <= len; i++) {
                c = substr(body, i, 1)
                if (quote != "") {
                    if (c == quote) quote = ""
                } else if ((c == "\"" || c == "\047") && item ~ /^[[:space:]]*$/) {
                    quote = c
                } else if (c == ",") {
                    emit(item)
                    item = ""
                    continue
                }
                item = item c
            }
            emit(item)
        }

        # Phase 1: look for the "field:" line.
        !in_list {
            if (index($0, key) != 1) next
            rest = substr($0, length(key) + 1)
            if (match(rest, /\[.*\]/)) {
                emit_flow(substr(rest, RSTART + 1, RLENGTH - 2))
                exit
            }
            in_list = 1
            next
        }

        # Phase 2: collect block sequence items.
        /^---/ { exit }
        /^[[:space:]]*-/ {
            item = $0
            sub(/^[[:space:]]*-[[:space:]]*/, "", item)
            emit(item)
            next
        }
        /^[^[:space:]]/ { exit }
    '
}

# Append one "<value>|<relative_path>" record to an index file for every value
# of a frontmatter array field.
# Arguments: $1 - frontmatter text
#            $2 - array field name (e.g. "topics")
#            $3 - path of the KB file relative to the KB root
#            $4 - index file to append to
_kb_index_array_field() {
    local frontmatter="$1"
    local field="$2"
    local relative_path="$3"
    local index_file="$4"
    local values value
    local -a words

    # A non-zero status from the extractor (for example when nothing was
    # found) only means "no values"; it must not abort the run under set -e.
    values=$(extract_yaml_array "$frontmatter" "$field") || true
    if [[ -z "$values" ]]; then
        return 0
    fi

    while IFS= read -r value; do
        # Trim and collapse whitespace in pure bash. xargs is not used for
        # this because it fails on values containing an unmatched quote or
        # apostrophe, which would abort the whole scan.
        read -r -a words <<< "$value"
        value="${words[*]}"
        if [[ -n "$value" ]]; then
            echo "$value|$relative_path" >> "$index_file"
        fi
    done <<< "$values"
}

# Index a single KB file.
# Globals:   KB_ROOT, CATEGORIES (read);
#            FILES_FILE, TOPICS_FILE, TAGS_FILE, PHASES_FILE (appended to)
# Arguments: $1 - absolute path of the KB file
# Outputs:   a warning on stderr when the file has no frontmatter
# Returns:   0, also when the file is skipped
# Appends "category|path|title|date|type|summary" to FILES_FILE and one
# "value|path" record per topic, tag and phase to the matching index file.
# NOTE(review): records are pipe-delimited, so a "|" inside a title, summary
# or array value would shift the fields for readers of these files.
process_kb_file() {
    local file="$1"
    # Quote KB_ROOT inside the expansion so it is stripped as a literal
    # prefix rather than interpreted as a glob pattern.
    local relative_path="${file#"$KB_ROOT"/}"
    local category=""
    local candidate

    # Determine category from path
    for candidate in "${CATEGORIES[@]}"; do
        if [[ "$relative_path" == "$candidate"/* ]]; then
            category="$candidate"
            break
        fi
    done

    if [[ -z "$category" ]]; then
        return 0  # Skip files not in known categories
    fi

    # Extract frontmatter; a failure is treated like missing frontmatter.
    local frontmatter
    frontmatter=$(extract_frontmatter "$file") || frontmatter=""
    if [[ -z "$frontmatter" ]]; then
        echo "Warning: No frontmatter found in $relative_path" >&2
        return 0
    fi

    # Extract metadata (absent fields simply yield empty strings)
    local title date type summary
    title=$(extract_yaml_simple "$frontmatter" "title") || true
    date=$(extract_yaml_simple "$frontmatter" "date") || true
    type=$(extract_yaml_simple "$frontmatter" "type") || true
    summary=$(extract_yaml_simple "$frontmatter" "summary") || true

    # Store file info
    echo "$category|$relative_path|$title|$date|$type|$summary" >> "$FILES_FILE"

    # Extract and index topics, tags and phase relevance
    _kb_index_array_field "$frontmatter" "topics" "$relative_path" "$TOPICS_FILE"
    _kb_index_array_field "$frontmatter" "tags" "$relative_path" "$TAGS_FILE"
    _kb_index_array_field "$frontmatter" "phase_relevance" "$relative_path" "$PHASES_FILE"
}

# Scan all KB files
echo "Scanning KB files..."

# Accepted file names: YYYY-MM-DD--<slug>--<type>[--p<N>].md
# Kept in a variable so the regex is used unquoted on the right of =~.
kb_filename_re='^[0-9]{4}-[0-9]{2}-[0-9]{2}--[a-z0-9-]+--(idea|note|spec|decision|howto|retro|meeting)(--p[0-9]+)?\.md$'

for category in "${CATEGORIES[@]}"; do
    category_dir="$KB_ROOT/$category"
    if [[ ! -d "$category_dir" ]]; then
        continue
    fi

    # Find all .md files in category. Paths are passed NUL-delimited and read
    # with an empty IFS so that whitespace or newlines in directory names
    # reach process_kb_file unchanged.
    find "$category_dir" -type f -name "*.md" -print0 | while IFS= read -r -d '' file; do
        # Skip if in a special subdirectory
        case "$file" in
            */_guides/*|*/_templates/*|*/_inbox/*|*/_review_queue/*)
                continue
                ;;
        esac

        # Check if filename matches KB pattern
        filename="${file##*/}"
        if [[ "$filename" =~ $kb_filename_re ]]; then
            process_kb_file "$file"
        fi
    done
done

# Print one "## <heading>" section built from a "<key>|<path>" index file:
# a "### <key>" sub-heading per distinct key, followed by the de-duplicated,
# sorted list of paths recorded for exactly that key, then a blank line.
# Arguments: $1 - section heading text
#            $2 - index file with "<key>|<path>" records
# Outputs:   the Markdown section on stdout; nothing if the file is empty
emit_index_section() {
    local heading="$1"
    local index_file="$2"
    local key path

    if [[ ! -s "$index_file" ]]; then
        return 0
    fi

    echo "## $heading"
    echo ""
    cut -d'|' -f1 "$index_file" | sort -u | while IFS= read -r key; do
        echo "### $key"
        # The key is compared as a literal string (handed over through the
        # environment and forced to string context), so characters such as
        # "." or "+" in a key cannot match records of a different key.
        KB_INDEX_KEY="$key" awk -F'|' '($1 "") == ENVIRON["KB_INDEX_KEY"] { print $2 }' "$index_file" \
            | sort -u \
            | while IFS= read -r path; do
                echo "- [\`$path\`]($path)"
            done
        echo ""
    done
}

# Print the number of distinct keys (first field) in a "<key>|<path>" file.
# Arguments: $1 - index file
# Outputs:   the count without surrounding blanks ("0" for an empty or
#            unreadable file)
count_unique_keys() {
    local count
    count=$(cut -d'|' -f1 "$1" 2>/dev/null | sort -u | wc -l)
    # Some wc implementations pad the number with blanks.
    echo "${count//[[:space:]]/}"
}

# Count files
FILE_COUNT=$(wc -l < "$FILES_FILE" 2>/dev/null || echo "0")
# Strip the blank padding some wc implementations add around the number.
FILE_COUNT="${FILE_COUNT//[[:space:]]/}"

# Summary counts (also reported on the console after the index is written)
TOPIC_COUNT=$(count_unique_keys "$TOPICS_FILE")
TAG_COUNT=$(count_unique_keys "$TAGS_FILE")
PHASE_COUNT=$(count_unique_keys "$PHASES_FILE")

# Generate index file
echo "Generating index file..."

{
    cat << EOF
# KB Index

_Last updated: $(date +%Y-%m-%d)_

This index is automatically generated from KB file metadata. It provides searchable access to all KB content organized by category, topic, tag, and phase relevance.

---

## File Listing by Category

EOF

    # Output files by category. Records are sorted so the listing does not
    # depend on the order in which the files were discovered.
    for category in "${CATEGORIES[@]}"; do
        category_files=$(grep "^$category|" "$FILES_FILE" 2>/dev/null | sort || true)
        if [[ -n "$category_files" ]]; then
            echo "### $category"
            echo ""
            # Record layout: category|path|title|date|type|summary
            while IFS='|' read -r _ entry_path entry_title entry_date entry_type _; do
                echo "- [\`$entry_path\`]($entry_path) - $entry_title ($entry_date, $entry_type)"
            done <<< "$category_files"
            echo ""
        fi
    done

    # Topics, tags and phase relevance indexes
    emit_index_section "Topics Index" "$TOPICS_FILE"
    emit_index_section "Tags Index" "$TAGS_FILE"
    emit_index_section "Phase Relevance Index" "$PHASES_FILE"

    # Summary
    echo "---"
    echo ""
    echo "## Summary"
    echo ""
    echo "- **Total KB Files**: $FILE_COUNT"
    echo "- **Unique Topics**: $TOPIC_COUNT"
    echo "- **Unique Tags**: $TAG_COUNT"
    echo "- **Phases Referenced**: $PHASE_COUNT"
    echo ""
    echo "_Index generated on $(date +%Y-%m-%d\ %H:%M:%S)_"

} > "$INDEX_FILE"

echo "Index generated successfully: $INDEX_FILE"
echo "  - Files indexed: $FILE_COUNT"
echo "  - Topics: $TOPIC_COUNT"
echo "  - Tags: $TAG_COUNT"
echo "  - Phases: $PHASE_COUNT"