#!/usr/bin/env python3
"""
QA Test Runner for Research Development Framework

Comprehensive test suite covering:
- Database connectivity and schema
- Pipeline components (ingest, chunk, embed, classify, cluster)
- API endpoints
- Document import folders
- Configuration loading
- Advanced RAG features: Re-ranking, GraphRAG, Semantic Chunking, Research Agent
- CLI: rdf CLI, fetch, diff, assess, outline, essay, write, status
- Writing workflows: Book workflow, Essay workflow, Validation, Polish

Usage:
    python run_tests.py                    # Run all tests
    python run_tests.py --module database  # Run specific module
    python run_tests.py --module rag       # Run Advanced RAG tests only
    python run_tests.py --module v4        # Run CLI tests
    python run_tests.py --module workflow  # Run Book workflow tests
    python run_tests.py --quick            # Quick smoke tests only
    python run_tests.py --report           # Generate HTML report
"""

import os
import sys
import json
import time
import argparse
import traceback
from pathlib import Path
from datetime import datetime
from typing import Dict, List, Any, Tuple

# Locate this script's directory and its parent (used as the project directory),
# then make the project's 'pipeline' and 'web' sub-directories and the project
# directory itself importable for the test functions below.
SCRIPT_DIR = Path(__file__).parent
PROJECT_DIR = SCRIPT_DIR.parent
# Every insert targets index 0, so the resulting search order is:
# PROJECT_DIR, PROJECT_DIR/web, PROJECT_DIR/pipeline.
sys.path.insert(0, str(PROJECT_DIR / 'pipeline'))
sys.path.insert(0, str(PROJECT_DIR / 'web'))
sys.path.insert(0, str(PROJECT_DIR))

# Test result tracking
class TestResults:
    """Collect pass/fail/skip outcomes for a run and echo each one to stdout.

    Counters (``passed``, ``failed``, ``skipped``) are incremented by the
    ``add_*`` methods. ``details`` keeps every outcome as a
    ``(status, name, message)`` tuple in the order reported, and ``errors``
    keeps ``(name, message)`` for failures only. ``start_time`` and
    ``end_time`` are left as ``None`` here and are expected to be set by the
    caller; ``summary`` subtracts them and formats the result as seconds.
    """

    # The class name starts with "Test", so pytest would try to collect it;
    # this flag tells pytest it is a helper, not a test case.
    __test__ = False

    def __init__(self):
        self.passed = 0
        self.failed = 0
        self.skipped = 0
        self.errors = []
        self.details = []
        self.start_time = None
        self.end_time = None

    def add_pass(self, name: str, message: str = ""):
        """Record a passing check and print a ``[PASS]`` line."""
        self.passed += 1
        self.details.append(('PASS', name, message))
        print(f"  [PASS] {name}")

    def add_fail(self, name: str, message: str):
        """Record a failing check (also stored in ``errors``) and print it."""
        self.failed += 1
        self.details.append(('FAIL', name, message))
        self.errors.append((name, message))
        print(f"  [FAIL] {name}: {message}")

    def add_skip(self, name: str, reason: str):
        """Record a skipped check and print a ``[SKIP]`` line with the reason."""
        self.skipped += 1
        self.details.append(('SKIP', name, reason))
        print(f"  [SKIP] {name}: {reason}")

    def summary(self) -> str:
        """Return a multi-line text summary of counts, pass rate and duration.

        The pass rate is reported as 0.0% when nothing has been recorded, and
        the duration as 0 when either timestamp has not been set.
        """
        total = self.passed + self.failed + self.skipped
        if self.start_time is not None and self.end_time is not None:
            duration = self.end_time - self.start_time
        else:
            duration = 0
        # Guard the division explicitly: an f-string replacement field is
        # always evaluated, so the check cannot live in the literal text.
        pass_rate = (100 * self.passed / total) if total > 0 else 0.0
        rule = '=' * 60
        return f"""
{rule}
TEST SUMMARY
{rule}
Total:   {total}
Passed:  {self.passed} ({pass_rate:.1f}%)
Failed:  {self.failed}
Skipped: {self.skipped}
Duration: {duration:.2f}s
{rule}
"""


def print_header(title: str):
    """Print a section banner: a blank line, a 60-char rule, the title, a rule."""
    rule = '=' * 60
    print(f"\n{rule}\n {title}\n{rule}")


# =============================================================================
# TEST MODULE: Configuration
# =============================================================================
def test_configuration(results: TestResults):
    """Verify the project configuration files and the ``config`` module.

    Checks, in order: ``config/project.yaml`` exists, the ``config`` module
    imports, the expected keys are present in ``PATHS``, the ``new_docs``
    folder exists, ``INTELLIGENCE_MODE`` is one of the known modes, and the
    ``.env`` / ``.env.db`` files exist. Stops after the import check if the
    ``config`` module cannot be imported.

    Args:
        results: Collector that receives one entry per check.
    """
    print_header("CONFIGURATION TESTS")

    # Check 1: the YAML configuration file is on disk.
    try:
        yaml_path = PROJECT_DIR / 'config' / 'project.yaml'
        if not yaml_path.exists():
            results.add_fail("Config file exists", f"Not found: {yaml_path}")
        else:
            results.add_pass("Config file exists", str(yaml_path))
    except Exception as exc:
        results.add_fail("Config file exists", str(exc))

    # Check 2: the config module and the names used below can be imported.
    try:
        from config import get_config, PATHS, INTELLIGENCE_MODE
    except Exception as exc:
        results.add_fail("Config module imports", str(exc))
        return  # Nothing below can run without PATHS / INTELLIGENCE_MODE
    else:
        results.add_pass("Config module imports")

    # Check 3: every expected key is present in PATHS.
    try:
        for key in ('base', 'library', 'new_docs', 'incoming', 'organized'):
            label = f"PATHS['{key}'] defined"
            if key not in PATHS:
                results.add_fail(label, "Missing from PATHS dict")
            else:
                results.add_pass(label)
    except Exception as exc:
        results.add_fail("PATHS validation", str(exc))

    # Check 4: the folder PATHS['new_docs'] points at exists.
    try:
        docs_dir = PATHS.get('new_docs')
        if not docs_dir or not Path(docs_dir).exists():
            results.add_fail("NEW_DOCS folder exists", f"Path: {docs_dir}")
        else:
            results.add_pass("NEW_DOCS folder exists", str(docs_dir))
    except Exception as exc:
        results.add_fail("NEW_DOCS folder exists", str(exc))

    # Check 5: the configured intelligence mode is a recognised value.
    try:
        known_modes = ('auto', 'cloud', 'local', 'statistical')
        if INTELLIGENCE_MODE not in known_modes:
            results.add_fail("Intelligence mode valid", f"Invalid: {INTELLIGENCE_MODE}")
        else:
            results.add_pass("Intelligence mode valid", f"Mode: {INTELLIGENCE_MODE}")
    except Exception as exc:
        results.add_fail("Intelligence mode valid", str(exc))

    # Check 6: both environment files are present in the project directory.
    try:
        for filename in ('.env', '.env.db'):
            if (PROJECT_DIR / filename).exists():
                results.add_pass(f"{filename} file exists")
            else:
                results.add_fail(f"{filename} file exists", f"Missing {filename} file")
    except Exception as exc:
        results.add_fail("Environment files", str(exc))


# =============================================================================
# TEST MODULE: Database
# =============================================================================
def test_database(results: TestResults):
    """Check database connectivity and the presence of expected schema objects.

    Verifies, in order: a connection can be opened, the core tables exist,
    the v2.0 tables exist (reported as skipped when absent), the ``vector``
    extension is installed, and a COUNT query on ``documents`` returns a row.
    Returns early when the connection check fails.

    Args:
        results: Collector that receives one entry per check.
    """
    print_header("DATABASE TESTS")

    # Check 1: open a connection; get_db_connection is used as a context manager.
    try:
        from db_utils import get_db_connection, execute_query
        with get_db_connection() as connection:
            if not connection:
                results.add_fail("Database connection", "Connection returned None")
                return
            results.add_pass("Database connection")
    except Exception as exc:
        results.add_fail("Database connection", str(exc))
        return

    table_exists_sql = "SELECT EXISTS (SELECT FROM information_schema.tables WHERE table_name = %s)"

    def table_present(table_name):
        # True when the EXISTS query returns a row whose 'exists' value is truthy.
        row = execute_query(table_exists_sql, (table_name,), fetch='one')
        return bool(row and row.get('exists'))

    # Check 2: core tables; a missing one is a failure.
    try:
        for table in ('documents', 'chunks', 'concepts', 'topics', 'processing_queue'):
            label = f"Table '{table}' exists"
            if table_present(table):
                results.add_pass(label)
            else:
                results.add_fail(label, "Table not found")
    except Exception as exc:
        results.add_fail("Core tables check", str(exc))

    # Check 3: v2.0 tables; a missing one is only a skip with a hint.
    try:
        for table in ('document_clusters', 'document_cluster_membership',
                      'chat_sessions', 'chat_messages'):
            label = f"v2.0 Table '{table}' exists"
            if table_present(table):
                results.add_pass(label)
            else:
                results.add_skip(label, "Run schema_updates_v2.sql")
    except Exception as exc:
        results.add_fail("v2.0 tables check", str(exc))

    # Check 4: the pgvector extension is registered in pg_extension.
    try:
        ext_row = execute_query(
            "SELECT EXISTS (SELECT FROM pg_extension WHERE extname = 'vector')",
            fetch='one'
        )
        if not (ext_row and ext_row.get('exists')):
            results.add_fail("pgvector extension installed", "Extension not found")
        else:
            results.add_pass("pgvector extension installed")
    except Exception as exc:
        results.add_fail("pgvector extension", str(exc))

    # Check 5: a trivial aggregate query returns a row.
    try:
        count_row = execute_query("SELECT COUNT(*) as count FROM documents", fetch='one')
        if count_row is None:
            results.add_fail("Query execution", "Query returned None")
        else:
            results.add_pass("Query execution", f"Documents: {count_row.get('count', 0)}")
    except Exception as exc:
        results.add_fail("Query execution", str(exc))


# =============================================================================
# TEST MODULE: Libraries
# =============================================================================
def test_libraries(results: TestResults):
    """Check that the libraries used by each tier can be imported.

    Core and statistical libraries are reported as failures when missing;
    cloud libraries are reported as skipped. Finally checks that the NLTK
    ``punkt`` tokenizer data can be located.

    Args:
        results: Collector that receives one entry per library.
    """
    print_header("LIBRARY TESTS")

    # (tier label, optional?, ((module to import, display name), ...))
    tiers = (
        ("Core", False, (
            ('flask', 'Flask'),
            ('psycopg2', 'psycopg2'),
            ('yaml', 'PyYAML'),
            ('pypdf', 'pypdf'),
            ('docx', 'python-docx'),
        )),
        ("Statistical", False, (
            ('sklearn', 'scikit-learn'),
            ('multi_rake', 'multi-rake'),
            ('yake', 'yake'),
            ('sumy', 'sumy'),
            ('nltk', 'nltk'),
        )),
        ("Cloud", True, (
            ('openai', 'openai'),
            ('tiktoken', 'tiktoken'),
        )),
    )

    for tier, optional, libraries in tiers:
        for module_name, display_name in libraries:
            label = f"{tier}: {display_name}"
            try:
                __import__(module_name)
            except ImportError:
                if optional:
                    results.add_skip(label, "Optional for cloud tier")
                else:
                    results.add_fail(label, "Not installed")
            else:
                results.add_pass(label)

    # NLTK data check: any problem importing nltk itself is only a skip.
    try:
        import nltk
        try:
            nltk.data.find('tokenizers/punkt')
        except LookupError:
            results.add_fail("NLTK punkt data", "Run: nltk.download('punkt')")
        else:
            results.add_pass("NLTK punkt data")
    except Exception as exc:
        results.add_skip("NLTK data check", str(exc))


# =============================================================================
# TEST MODULE: Pipeline Components
# =============================================================================
def test_pipeline(results: TestResults):
    """Check that the pipeline components import and construct.

    Each check is independent: a failure is recorded and the next check still
    runs. Covers importing and constructing ``Taxonomist`` and
    ``SemanticClusterer`` in statistical mode, importing ``DocumentProcessor``
    and ``TextChunker``, and importing three ``db_utils`` functions.

    Args:
        results: Collector that receives one entry per check.
    """
    print_header("PIPELINE TESTS")

    # Check 1: Taxonomist can be imported.
    try:
        from taxonomist import Taxonomist
    except Exception as exc:
        results.add_fail("Taxonomist import", str(exc))
    else:
        results.add_pass("Taxonomist import")

    # Check 2: Taxonomist can be constructed in statistical mode.
    try:
        from taxonomist import Taxonomist
        taxonomist_obj = Taxonomist(mode='statistical')
    except Exception as exc:
        results.add_fail("Taxonomist init (statistical)", str(exc))
    else:
        results.add_pass("Taxonomist init (statistical)")

    # Check 3: SemanticClusterer can be imported.
    try:
        from cluster_documents import SemanticClusterer
    except Exception as exc:
        results.add_fail("SemanticClusterer import", str(exc))
    else:
        results.add_pass("SemanticClusterer import")

    # Check 4: SemanticClusterer can be constructed in statistical mode.
    try:
        from cluster_documents import SemanticClusterer
        clusterer_obj = SemanticClusterer(mode='statistical')
    except Exception as exc:
        results.add_fail("SemanticClusterer init (statistical)", str(exc))
    else:
        results.add_pass("SemanticClusterer init (statistical)")

    # Check 5: DocumentProcessor can be imported.
    try:
        from ingest_documents import DocumentProcessor
    except Exception as exc:
        results.add_fail("DocumentProcessor import", str(exc))
    else:
        results.add_pass("DocumentProcessor import")

    # Check 6: TextChunker can be imported.
    try:
        from chunk_documents import TextChunker
    except Exception as exc:
        results.add_fail("TextChunker import", str(exc))
    else:
        results.add_pass("TextChunker import")

    # Check 7: the db_utils search/insert helpers can be imported.
    try:
        from db_utils import semantic_search, keyword_search, insert_document
    except Exception as exc:
        results.add_fail("db_utils functions import", str(exc))
    else:
        results.add_pass("db_utils functions import")


# =============================================================================
# TEST MODULE: Statistical Classification
# =============================================================================
def test_statistical_classification(results: TestResults):
    """Run the statistical-mode ``Taxonomist`` on a sample text and check its output.

    Checks that ``classify_document`` returns a non-empty dict, that the dict
    contains the expected fields, and that at least one keyword was produced
    in ``key_concepts`` or ``specific_topics``. Stops early if the taxonomist
    cannot be created, if classification raises, or if it returns something
    other than a non-empty dict (the later checks all read from that dict).

    Args:
        results: Collector that receives one entry per check.
    """
    print_header("STATISTICAL CLASSIFICATION TESTS")

    try:
        from taxonomist import Taxonomist
        tax = Taxonomist(mode='statistical')
    except Exception as e:
        results.add_fail("Statistical Taxonomist init", str(e))
        return

    # Test document
    test_text = """
    The Philosophy of Freedom by Rudolf Steiner explores the nature of human consciousness
    and the relationship between thinking and freedom. Steiner argues that true freedom
    is achieved through the activity of pure thinking, where the individual rises above
    instinctual drives and social conditioning to act from self-determined ethical principles.
    This philosophical work examines epistemology, ethics, and the nature of human cognition.
    """

    # Test 1: Classification returns result
    try:
        result = tax.classify_document(test_text)
    except Exception as e:
        results.add_fail("Statistical classification runs", str(e))
        return

    if not (result and isinstance(result, dict)):
        results.add_fail("Statistical classification runs", "No result returned")
        # Without a dict the remaining checks could only fail with unrelated
        # TypeError/AttributeError messages, so report once and stop.
        return
    results.add_pass("Statistical classification runs")

    # Test 2: Has required fields
    try:
        required_fields = ['primary_category', 'specific_topics', 'key_concepts', 'confidence']
        for field in required_fields:
            if field in result:
                results.add_pass(f"Classification has '{field}'", str(result.get(field, ''))[:50])
            else:
                results.add_fail(f"Classification has '{field}'", "Field missing")
    except Exception as e:
        results.add_fail("Classification fields", str(e))

    # Test 3: Keywords extracted
    try:
        # ``or []`` tolerates a field that is present but None, and list()
        # lets a list and a tuple be combined without a TypeError.
        keywords = list(result.get('key_concepts') or []) + list(result.get('specific_topics') or [])
        if keywords:
            results.add_pass("Keywords extracted", f"Count: {len(keywords)}")
        else:
            results.add_fail("Keywords extracted", "No keywords found")
    except Exception as e:
        results.add_fail("Keywords extracted", str(e))


# =============================================================================
# TEST MODULE: Statistical Clustering
# =============================================================================
def test_statistical_clustering(results: TestResults):
    """Check the statistical-mode ``SemanticClusterer`` and its TF-IDF loading.

    Verifies that ``_init_tfidf`` leaves ``clusterer.tfidf`` set, and, when the
    ``documents`` table has rows, that ``load_document_tfidf`` returns at
    least one document id. Stops early if the clusterer cannot be created.

    Args:
        results: Collector that receives one entry per check.
    """
    print_header("STATISTICAL CLUSTERING TESTS")

    try:
        from cluster_documents import SemanticClusterer
        clusterer = SemanticClusterer(mode='statistical')
    except Exception as exc:
        results.add_fail("Statistical Clusterer init", str(exc))
        return

    # Check 1: the TF-IDF vectorizer attribute is populated after init.
    try:
        clusterer._init_tfidf()
        if clusterer.tfidf is None:
            results.add_fail("TF-IDF vectorizer initialized", "tfidf is None")
        else:
            results.add_pass("TF-IDF vectorizer initialized")
    except Exception as exc:
        results.add_fail("TF-IDF vectorizer initialized", str(exc))

    # Check 2: TF-IDF vectors load, but only when there are documents to load.
    try:
        from db_utils import execute_query
        count_row = execute_query("SELECT COUNT(*) as count FROM documents", fetch='one')
        has_documents = bool(count_row) and count_row.get('count', 0) > 0
        if not has_documents:
            results.add_skip("TF-IDF vectors loaded", "No documents in database")
        else:
            ids, _vectors, feature_names = clusterer.load_document_tfidf()
            if len(ids) > 0:
                results.add_pass("TF-IDF vectors loaded", f"Docs: {len(ids)}, Features: {len(feature_names)}")
            else:
                results.add_skip("TF-IDF vectors loaded", "No documents in database")
    except Exception as exc:
        results.add_fail("TF-IDF vectors loaded", str(exc))


# =============================================================================
# TEST MODULE: Cloud Classification (requires API key)
# =============================================================================
def test_cloud_classification(results: TestResults, api_key: str = None):
    """Run the cloud-mode ``Taxonomist`` on a sample text using ``api_key``.

    Skips when no key is given. Otherwise exports the key as
    ``OPENAI_API_KEY``, forces a fresh import of the taxonomist/config
    modules, constructs ``Taxonomist(mode='cloud')`` and classifies a sample
    text. The classification check passes when the result reports an
    ``llm`` classification source, or when it at least carries a
    ``primary_category``.

    Args:
        results: Collector that receives one entry per check.
        api_key: OpenAI API key; a falsy value skips the whole module.
    """
    print_header("CLOUD CLASSIFICATION TESTS")

    if not api_key:
        results.add_skip("Cloud classification", "No API key provided")
        return

    # NOTE(review): the key is written to the process environment and is not
    # restored afterwards, so it stays set for anything that runs later.
    os.environ['OPENAI_API_KEY'] = api_key

    # Need to reimport/reload config to pick up the key
    try:
        # Drop every cached module whose name contains 'taxonomist' or
        # 'config' so the import below re-executes it with the new env.
        # NOTE(review): the substring match is broad and can also evict
        # unrelated modules that merely have 'config' in their name.
        stale_modules = [name for name in sys.modules if 'taxonomist' in name or 'config' in name]
        for name in stale_modules:
            sys.modules.pop(name, None)

        from taxonomist import Taxonomist
        tax = Taxonomist(mode='cloud')

        if tax.client is not None:
            results.add_pass("Cloud Taxonomist init", f"Client connected, model: {tax.model}")
        else:
            results.add_fail("Cloud Taxonomist init", "Client is None - API key not recognized")
            return
    except Exception as e:
        results.add_fail("Cloud Taxonomist init", str(e))
        return

    test_text = """
    Rudolf Steiner's lectures on education emphasize the importance of understanding
    child development through the lens of spiritual science. The curriculum should
    evolve with the child's consciousness, introducing subjects at the appropriate
    stage of cognitive and emotional development. This comprehensive approach integrates
    intellectual, artistic, and practical elements to nurture the whole child.
    """

    try:
        result = tax.classify_document(test_text)
        if not result:
            # Report an empty/None result directly instead of letting
            # ``result.get`` raise an unrelated AttributeError below.
            results.add_fail("Cloud classification works", "No result returned")
        elif 'llm' in str(result.get('classification_source', '')):
            results.add_pass("Cloud classification works", f"Category: {result.get('primary_category')}")
        elif result.get('primary_category'):
            # Even if it used statistical fallback, check if result is valid
            results.add_pass("Cloud classification works", f"Category: {result.get('primary_category')} (via {result.get('classification_source', 'unknown')})")
        else:
            results.add_fail("Cloud classification works", f"Source: {result.get('classification_source', 'unknown')}")
    except Exception as e:
        results.add_fail("Cloud classification works", str(e))


# =============================================================================
# TEST MODULE: API Endpoints
# =============================================================================
def test_api_endpoints(results: TestResults):
    """Exercise the Flask API through the app's test client (no running server).

    Imports ``app`` from the ``app`` module, then requests six endpoints and
    records a pass when the response status is 200. For ``/api/health`` and
    ``/api/chat`` the reported mode is included in the pass message; a 200
    response without a JSON object body is reported with mode ``unknown``.
    Stops early if the Flask app cannot be imported.

    Endpoints exercised: /api/health, /api/stats, /api/search, /api/chat,
    /api/clusters, /api/search/faceted. (/api/documents, /api/concepts and
    /api/topics are not exercised here.)

    Args:
        results: Collector that receives one entry per endpoint.
    """
    print_header("API ENDPOINT TESTS")

    try:
        # Inserted at index 0 so this directory is searched first when
        # resolving ``app``.
        sys.path.insert(0, str(PROJECT_DIR / 'web'))
        from app import app
        results.add_pass("Flask app imports")
    except Exception as e:
        results.add_fail("Flask app imports", str(e))
        return

    def json_dict(response):
        # Return the response body as a dict, or {} when the body is not a
        # JSON object (get_json yields None for non-JSON responses).
        payload = response.get_json(silent=True)
        return payload if isinstance(payload, dict) else {}

    def health_detail(data):
        intelligence = data.get('intelligence')
        mode = intelligence.get('mode', 'unknown') if isinstance(intelligence, dict) else 'unknown'
        return f"Mode: {mode}"

    def chat_detail(data):
        return f"Mode: {data.get('mode', 'unknown')}"

    # (pass/fail label, label used when the request raises,
    #  request callable taking the client, optional pass-message builder)
    checks = (
        ("/api/health returns 200", "/api/health",
         lambda c: c.get('/api/health'),
         health_detail),
        ("/api/stats returns 200", "/api/stats",
         lambda c: c.get('/api/stats'),
         None),
        ("/api/search (keyword)", "/api/search",
         lambda c: c.post('/api/search',
                          json={'query': 'test', 'search_type': 'keyword', 'limit': 5}),
         None),
        ("/api/chat returns 200", "/api/chat",
         lambda c: c.post('/api/chat',
                          json={'question': 'What is philosophy?', 'max_sources': 3}),
         chat_detail),
        ("/api/clusters returns 200", "/api/clusters",
         lambda c: c.get('/api/clusters'),
         None),
        ("/api/search/faceted returns 200", "/api/search/faceted",
         lambda c: c.post('/api/search/faceted',
                          json={'query': 'test', 'filters': {}, 'limit': 5}),
         None),
    )

    with app.test_client() as client:
        for label, error_label, send, describe in checks:
            try:
                response = send(client)
                if response.status_code != 200:
                    results.add_fail(label, f"Status: {response.status_code}")
                elif describe is None:
                    results.add_pass(label)
                else:
                    results.add_pass(label, describe(json_dict(response)))
            except Exception as e:
                results.add_fail(error_label, str(e))


# =============================================================================
# TEST MODULE: v3.0 Schema Features
# =============================================================================
def test_v3_schema(results: TestResults):
    """Check for the v3.0 schema objects; absent ones are reported as skipped.

    Looks for the knowledge-graph tables, the ``document_images`` table, the
    ``page_start``/``page_end`` columns on ``chunks``, the ``bibtex_key``
    column on ``documents`` and the ``format_citation`` function. A query
    that raises is recorded as a failure. Stops early if ``db_utils`` cannot
    be imported.

    Args:
        results: Collector that receives one entry per check.
    """
    print_header("V3.0 SCHEMA TESTS")

    try:
        from db_utils import execute_query
    except Exception as exc:
        results.add_fail("v3.0 Schema import", str(exc))
        return

    migration_hint = "Run schema_updates_v3.sql"

    # Check 1: knowledge graph tables, each queried in its own try block so
    # one failing query does not stop the remaining tables.
    table_exists_sql = "SELECT EXISTS (SELECT FROM information_schema.tables WHERE table_name = %s)"
    for table in ('entities', 'entity_types', 'entity_relationships',
                  'relationship_types', 'entity_mentions'):
        try:
            row = execute_query(table_exists_sql, (table,), fetch='one')
            if row and row.get('exists'):
                results.add_pass(f"KG Table '{table}' exists")
            else:
                results.add_skip(f"KG Table '{table}' exists", migration_hint)
        except Exception as exc:
            results.add_fail(f"KG Table '{table}'", str(exc))

    def exists_flag(row):
        # Truthy when an EXISTS query returned a row with a truthy 'exists'.
        return row and row.get('exists')

    def both_columns(rows):
        # Truthy when exactly the two requested columns were found.
        return rows and len(rows) == 2

    def any_row(row):
        return row

    # Checks 2-5: (query, fetch mode, success predicate, pass label,
    # label used for skip and for failure).
    single_query_checks = (
        ("SELECT EXISTS (SELECT FROM information_schema.tables WHERE table_name = 'document_images')",
         'one', exists_flag,
         "Document images table exists", "Document images table"),
        ("SELECT column_name FROM information_schema.columns WHERE table_name = 'chunks' AND column_name IN ('page_start', 'page_end')",
         'all', both_columns,
         "Chunks have page columns", "Chunks page columns"),
        ("SELECT column_name FROM information_schema.columns WHERE table_name = 'documents' AND column_name = 'bibtex_key'",
         'one', any_row,
         "Documents have bibtex_key column", "Documents bibtex_key"),
        ("SELECT EXISTS (SELECT FROM pg_proc WHERE proname = 'format_citation')",
         'one', exists_flag,
         "format_citation function exists", "format_citation function"),
    )

    for sql, fetch_mode, is_present, pass_label, other_label in single_query_checks:
        try:
            outcome = execute_query(sql, fetch=fetch_mode)
            if is_present(outcome):
                results.add_pass(pass_label)
            else:
                results.add_skip(other_label, migration_hint)
        except Exception as exc:
            results.add_fail(other_label, str(exc))


# =============================================================================
# TEST MODULE: Knowledge Graph
# =============================================================================
def test_knowledge_graph(results: TestResults):
    """Check that ``KnowledgeGraphExtractor`` imports, constructs and extracts.

    Constructs the extractor in statistical mode and runs ``extract`` on a
    short sample text; the extraction check passes when the returned entity
    collection is non-empty. Stops early if the import or construction fails.

    Args:
        results: Collector that receives one entry per check.
    """
    print_header("KNOWLEDGE GRAPH TESTS")

    # Check 1: the extractor class can be imported.
    try:
        from knowledge_graph import KnowledgeGraphExtractor
    except Exception as exc:
        results.add_fail("KnowledgeGraphExtractor import", str(exc))
        return
    else:
        results.add_pass("KnowledgeGraphExtractor import")

    # Check 2: it can be constructed in statistical mode.
    try:
        extractor = KnowledgeGraphExtractor(mode='statistical')
    except Exception as exc:
        results.add_fail("KnowledgeGraphExtractor init", str(exc))
        return
    else:
        results.add_pass("KnowledgeGraphExtractor init (statistical)")

    # Check 3: extraction on a sample text yields at least one entity.
    test_text = """
    Rudolf Steiner developed anthroposophy as a spiritual science.
    His work was influenced by Johann Wolfgang von Goethe's scientific method.
    The Philosophy of Freedom explores the nature of human consciousness.
    """

    try:
        found_entities, _found_relationships = extractor.extract(test_text)
        if not found_entities:
            results.add_fail("Entity extraction", "No entities found")
        else:
            results.add_pass("Entity extraction", f"Found {len(found_entities)} entities")
    except Exception as exc:
        results.add_fail("Entity extraction", str(exc))


# =============================================================================
# TEST MODULE: V3 Features - Citation Management
# =============================================================================
def test_citation_management(results: TestResults):
    """Check citation key generation and the citation-related columns.

    Generates a key for a fully specified work and expects it to contain
    ``Steiner`` and ``1894``, generates a key without an author and expects
    any non-empty key, then looks for the ``bibtex_key`` /
    ``bibtex_key_frozen`` columns on ``documents``. Stops early if the
    ``citation_manager`` functions cannot be imported.

    Args:
        results: Collector that receives one entry per check.
    """
    print_header("CITATION MANAGEMENT TESTS (v3.0)")

    # Check 1: the citation manager functions can be imported.
    try:
        from citation_manager import (
            generate_citation_key,
            freeze_citation_key,
            unfreeze_citation_key
        )
    except Exception as exc:
        results.add_fail("Citation manager import", str(exc))
        return
    else:
        results.add_pass("Citation manager import")

    # Check 2: a key built from author/year/title mentions the surname and year.
    try:
        full_key = generate_citation_key(
            author="Rudolf Steiner",
            year=1894,
            title="The Philosophy of Freedom"
        )
        key_ok = bool(full_key) and all(part in full_key for part in ("Steiner", "1894"))
        if key_ok:
            results.add_pass("Citation key generation", f"Key: {full_key}")
        else:
            results.add_fail("Citation key generation", f"Invalid key: {full_key}")
    except Exception as exc:
        results.add_fail("Citation key generation", str(exc))

    # Check 3: a key is still produced when no author is supplied.
    try:
        anonymous_key = generate_citation_key(
            title="Unknown Author Book",
            year=2000
        )
        if not anonymous_key:
            results.add_fail("Citation key (no author)", "No key generated")
        else:
            results.add_pass("Citation key (no author)", f"Key: {anonymous_key}")
    except Exception as exc:
        results.add_fail("Citation key (no author)", str(exc))

    # Check 4: citation columns on the documents table.
    # NOTE(review): this passes when only one of the two columns is present,
    # so it does not prove 'bibtex_key_frozen' exists - confirm this is intended.
    try:
        from db_utils import execute_query
        column_rows = execute_query(
            "SELECT column_name FROM information_schema.columns "
            "WHERE table_name = 'documents' AND column_name IN ('bibtex_key', 'bibtex_key_frozen')",
            fetch='all'
        )
        if not (column_rows and len(column_rows) >= 1):
            results.add_skip("Citation freezing columns", "Run migration 004_v3_features.sql")
        else:
            results.add_pass("Citation freezing columns exist", f"Found {len(column_rows)} columns")
    except Exception as exc:
        results.add_fail("Citation freezing columns", str(exc))


# =============================================================================
# TEST MODULE: V3 Features - Document Pinning
# =============================================================================
def test_document_pinning(results: TestResults):
    """Check the document pinning module and its database columns.

    Looks for the ``is_pinned`` / ``pin_priority`` columns on ``documents``,
    calls ``get_pinned_documents`` and runs ``boost_pinned_results`` on two
    mock search hits. For the last two checks, an error whose text mentions a
    column that "does not exist" is reported as a skip rather than a failure.
    Stops early if the ``document_pinning`` functions cannot be imported.

    Args:
        results: Collector that receives one entry per check.
    """
    print_header("DOCUMENT PINNING TESTS (v3.0)")

    migration_hint = "Run migration 004_v3_features.sql"

    def report_error(label, error):
        # A "column ... does not exist" error is treated as a missing
        # migration (skip); any other error is a failure.
        lowered = str(error).lower()
        if "column" in lowered and "does not exist" in lowered:
            results.add_skip(label, migration_hint)
        else:
            results.add_fail(label, str(error))

    # Check 1: the pinning functions can be imported.
    try:
        from document_pinning import (
            pin_document,
            unpin_document,
            get_pinned_documents,
            set_priority,
            boost_pinned_results
        )
    except Exception as exc:
        results.add_fail("Document pinning import", str(exc))
        return
    else:
        results.add_pass("Document pinning import")

    # Check 2: both pinning columns are present on the documents table.
    try:
        from db_utils import execute_query
        column_rows = execute_query(
            "SELECT column_name FROM information_schema.columns "
            "WHERE table_name = 'documents' AND column_name IN ('is_pinned', 'pin_priority')",
            fetch='all'
        )
        if not (column_rows and len(column_rows) >= 2):
            results.add_skip("Document pinning columns", migration_hint)
        else:
            results.add_pass("Document pinning columns exist")
    except Exception as exc:
        results.add_fail("Document pinning columns", str(exc))

    # Check 3: listing pinned documents works (an empty list still passes).
    try:
        pinned_docs = get_pinned_documents()
        results.add_pass("Get pinned documents", f"Found {len(pinned_docs)} pinned")
    except Exception as exc:
        report_error("Get pinned documents", exc)

    # Check 4: boosting runs on a small set of mock search hits.
    try:
        sample_hits = [
            {'document_id': 'DOC_001', 'score': 0.8},
            {'document_id': 'DOC_002', 'score': 0.7},
        ]
        boosted_hits = boost_pinned_results(sample_hits, boost_factor=1.5)
        results.add_pass("Boost pinned results", f"Processed {len(boosted_hits)} results")
    except Exception as exc:
        report_error("Boost pinned results", exc)


# =============================================================================
# TEST MODULE: V3 Features - Gap Pinning
# =============================================================================
def test_gap_pinning(results: TestResults):
    """Check the research gap pinning module and its backing table.

    Outcomes are recorded on ``results``. The function returns early when the
    ``gap_pinning`` import fails, or when the ``research_gaps`` table check is
    skipped or fails; the listing and statistics checks only run after both
    of those succeed.
    """
    print_header("GAP PINNING TESTS (v3.0)")

    # Test 1: Gap pinning import
    try:
        from gap_pinning import add_gap, pin_gap, ignore_gap, fill_gap, list_gaps, get_gap_stats
        results.add_pass("Gap pinning import")
    except Exception as exc:
        results.add_fail("Gap pinning import", str(exc))
        return

    # Test 2: Check research_gaps table
    try:
        from db_utils import execute_query
        table_check_sql = (
            "SELECT EXISTS (SELECT FROM information_schema.tables WHERE table_name = 'research_gaps')"
        )
        row = execute_query(table_check_sql, fetch='one')
        table_present = row and row.get('exists')
        if not table_present:
            # Without the table the remaining checks cannot succeed.
            results.add_skip("research_gaps table", "Run migration 004_v3_features.sql")
            return
        results.add_pass("research_gaps table exists")
    except Exception as exc:
        results.add_fail("research_gaps table", str(exc))
        return

    # Test 3: List gaps (should work even if empty)
    try:
        all_gaps = list_gaps()
        gap_total = len(all_gaps)
        results.add_pass("List gaps", f"Found {gap_total} gaps")
    except Exception as exc:
        results.add_fail("List gaps", str(exc))

    # Test 4: Get gap statistics
    try:
        gap_stats = get_gap_stats()
        if not isinstance(gap_stats, dict):
            results.add_fail("Gap statistics", "Invalid return type")
        else:
            results.add_pass("Gap statistics", f"Keys: {list(gap_stats.keys())}")
    except Exception as exc:
        results.add_fail("Gap statistics", str(exc))


# =============================================================================
# TEST MODULE: V3 Features - Smart Context Selection
# =============================================================================
def test_context_selection(results: TestResults):
    """Check token estimation and ``SmartContextSelector`` on mock chunks.

    Outcomes are recorded on ``results``. Returns early if the
    ``context_selector`` import or the selector construction fails.
    """
    print_header("CONTEXT SELECTION TESTS (v3.0)")

    # Test 1: Context selector import
    try:
        from context_selector import SmartContextSelector, estimate_tokens
        results.add_pass("Context selector import")
    except Exception as exc:
        results.add_fail("Context selector import", str(exc))
        return

    # Test 2: Token estimation must yield a positive integer
    try:
        sample_sentence = "This is a test sentence for token estimation."
        token_count = estimate_tokens(sample_sentence)
        if not isinstance(token_count, int) or token_count <= 0:
            results.add_fail("Token estimation", f"Invalid result: {token_count}")
        else:
            results.add_pass(
                "Token estimation",
                f"'{sample_sentence[:20]}...' = {token_count} tokens",
            )
    except Exception as exc:
        results.add_fail("Token estimation", str(exc))

    # Test 3: SmartContextSelector initialization
    try:
        selector = SmartContextSelector(max_tokens=4000)
        results.add_pass("SmartContextSelector initialization")
    except Exception as exc:
        results.add_fail("SmartContextSelector initialization", str(exc))
        return

    # Test 4: Context selection with mock chunks
    try:
        chunk_specs = (
            ('C1', 'First chunk about philosophy.', 'DOC_1'),
            ('C2', 'Second chunk about science.', 'DOC_2'),
            ('C3', 'Third chunk about art.', 'DOC_3'),
        )
        mock_chunks = [
            {'chunk_id': cid, 'chunk_text': text, 'document_id': doc}
            for cid, text, doc in chunk_specs
        ]
        selection = selector.select_context(mock_chunks, pinned_doc_ids=['DOC_1'])
        # Any return shape counts as a pass; only the detail message differs.
        if hasattr(selection, 'chunks'):
            detail = f"Selected {len(selection.chunks)} chunks"
        elif hasattr(selection, '__len__'):
            detail = f"Selected {len(selection)} chunks"
        else:
            detail = f"Result type: {type(selection).__name__}"
        results.add_pass("Context selection", detail)
    except Exception as exc:
        results.add_fail("Context selection", str(exc))


# =============================================================================
# TEST MODULE: V3 Features - Semantic Graph
# =============================================================================
def test_semantic_graph(results: TestResults):
    """Check the semantic graph module, its types and its schema column.

    Outcomes are recorded on ``results``. Returns early if the
    ``semantic_graph`` import or the ``SemanticGraph`` construction fails.
    """
    print_header("SEMANTIC GRAPH TESTS (v3.0)")

    # Test 1: Semantic graph import
    try:
        from semantic_graph import SemanticGraph, ConceptNode, ConceptEdge, GraphPath, RELATIONSHIP_TYPES
        results.add_pass("Semantic graph import")
    except Exception as exc:
        results.add_fail("Semantic graph import", str(exc))
        return

    # Test 2: Relationship types defined (at least three of the expected five)
    try:
        present = []
        for rel_type in ('related', 'supports', 'contradicts', 'influences', 'derived_from'):
            if rel_type in RELATIONSHIP_TYPES:
                present.append(rel_type)
        if len(present) < 3:
            results.add_fail("Relationship types defined", f"Only found: {present}")
        else:
            results.add_pass("Relationship types defined", f"Found: {', '.join(present)}")
    except Exception as exc:
        results.add_fail("Relationship types defined", str(exc))

    # Test 3: SemanticGraph initialization
    try:
        graph = SemanticGraph()
        results.add_pass("SemanticGraph initialization")
    except Exception as exc:
        results.add_fail("SemanticGraph initialization", str(exc))
        return

    # Test 4: Check concept_relationships table
    try:
        from db_utils import execute_query
        column_sql = (
            "SELECT column_name FROM information_schema.columns "
            "WHERE table_name = 'concept_relationships' AND column_name = 'relationship_type'"
        )
        column_row = execute_query(column_sql, fetch='one')
        if not column_row:
            results.add_skip("Concept relationships", "Run migration 004_v3_features.sql")
        else:
            results.add_pass("Concept relationships enhanced")
    except Exception as exc:
        results.add_fail("Concept relationships", str(exc))

    # Test 5: ConceptNode creation
    try:
        concept = ConceptNode(concept_id=1, name="Test Concept", category="philosophy")
        if concept.name != "Test Concept":
            results.add_fail("ConceptNode creation", "Invalid node")
        else:
            results.add_pass("ConceptNode creation")
    except Exception as exc:
        results.add_fail("ConceptNode creation", str(exc))


# =============================================================================
# TEST MODULE: V3 Features - Research Sessions
# =============================================================================
def test_research_sessions(results: TestResults):
    """Check the interactive research session module and its table.

    Outcomes are recorded on ``results``. Returns early only if the
    ``research_session`` import fails; the other checks are independent.
    """
    print_header("RESEARCH SESSIONS TESTS (v3.0)")

    # Test 1: Research session import
    try:
        from research_session import InteractiveResearchSession, SessionState, CheckpointType
        results.add_pass("Research session import")
    except Exception as exc:
        results.add_fail("Research session import", str(exc))
        return

    # Test 2: Session states defined (attribute access fails if one is missing)
    try:
        expected_states = [
            SessionState.PLANNING,
            SessionState.SEARCHING,
            SessionState.COMPLETE,
            SessionState.CANCELLED,
            SessionState.ERROR
        ]
        state_values = [state.value for state in expected_states]
        results.add_pass("SessionState enum", f"States: {state_values}")
    except Exception as exc:
        results.add_fail("SessionState enum", str(exc))

    # Test 3: Checkpoint types defined
    try:
        expected_checkpoints = [
            CheckpointType.PLAN_READY,
            CheckpointType.SOURCES_FOUND,
            CheckpointType.GAPS_IDENTIFIED
        ]
        checkpoint_values = [kind.value for kind in expected_checkpoints]
        results.add_pass("CheckpointType enum", f"Types: {checkpoint_values}")
    except Exception as exc:
        results.add_fail("CheckpointType enum", str(exc))

    # Test 4: Check research_sessions table
    try:
        from db_utils import execute_query
        table_check_sql = (
            "SELECT EXISTS (SELECT FROM information_schema.tables WHERE table_name = 'research_sessions')"
        )
        row = execute_query(table_check_sql, fetch='one')
        if not (row and row.get('exists')):
            results.add_skip("research_sessions table", "Run migration 004_v3_features.sql")
        else:
            results.add_pass("research_sessions table exists")
    except Exception as exc:
        results.add_fail("research_sessions table", str(exc))

    # Test 5: Session initialization
    try:
        new_session = InteractiveResearchSession(
            question="Test question about philosophy?",
            interactive=False,
        )
        initial_state = new_session.state.value
        results.add_pass("Session initialization", f"State: {initial_state}")
    except Exception as exc:
        results.add_fail("Session initialization", str(exc))


# =============================================================================
# TEST MODULE: V3 Features - Knowledge Rules
# =============================================================================
def test_knowledge_rules(results: TestResults):
    """Test knowledge rules (living glossaries).

    Outcomes are recorded on ``results``. Returns early if the
    ``knowledge_rules`` import fails. The ``to_dict()`` check depends on the
    rule built by the creation check, so it is skipped (not failed) when that
    rule could not be created.
    """
    print_header("KNOWLEDGE RULES TESTS (v3.0)")

    # Test 1: Knowledge rules import
    try:
        from knowledge_rules import (
            KnowledgeRule,
            KnowledgeRuleEngine
        )
        results.add_pass("Knowledge rules import")
    except Exception as e:
        results.add_fail("Knowledge rules import", str(e))
        return

    # Test 2: KnowledgeRule data class
    # Bound up front so Test 3 can tell "creation failed" apart from a real
    # to_dict() problem instead of tripping over an unbound name.
    rule = None
    try:
        rule = KnowledgeRule(
            rule_type='alias',
            rule_definition={'primary': 'Rudolf Steiner', 'aliases': ['R. Steiner']},
            rule_name='Steiner alias',
            source='user'
        )
        if rule.rule_type == 'alias':
            results.add_pass("KnowledgeRule creation")
        else:
            results.add_fail("KnowledgeRule creation", "Invalid rule")
    except Exception as e:
        results.add_fail("KnowledgeRule creation", str(e))

    # Test 3: Rule to_dict
    if rule is None:
        # Creation already recorded the failure; don't count it twice.
        results.add_skip("KnowledgeRule.to_dict()", "KnowledgeRule creation failed")
    else:
        try:
            rule_dict = rule.to_dict()
            if 'rule_type' in rule_dict and 'rule_definition' in rule_dict:
                results.add_pass("KnowledgeRule.to_dict()")
            else:
                results.add_fail("KnowledgeRule.to_dict()", "Missing fields")
        except Exception as e:
            results.add_fail("KnowledgeRule.to_dict()", str(e))

    # Test 4: KnowledgeRuleEngine initialization (may fail without database)
    try:
        KnowledgeRuleEngine(auto_load=False)
        results.add_pass("KnowledgeRuleEngine initialization")
    except Exception as e:
        # Treated as a skip: the constructor may need a live database.
        results.add_skip("KnowledgeRuleEngine initialization", f"Database required: {str(e)[:50]}")

    # Test 5: Check knowledge_rules table
    try:
        from db_utils import execute_query
        result = execute_query(
            "SELECT EXISTS (SELECT FROM information_schema.tables WHERE table_name = 'knowledge_rules')",
            fetch='one'
        )
        if result and result.get('exists'):
            results.add_pass("knowledge_rules table exists")
        else:
            results.add_skip("knowledge_rules table", "Table not created yet")
    except Exception as e:
        results.add_fail("knowledge_rules table", str(e))


# =============================================================================
# TEST MODULE: V3 Features - Book Research Workflow
# =============================================================================
def test_book_workflow(results: TestResults):
    """Test book research workflow components.

    Outcomes are recorded on ``results``. Returns early if the workflow
    models cannot be imported, since the object-creation checks all use them.
    The other import checks (parsers, phase executors, output generators) are
    independent and do not stop the run. The checkpoint check needs the
    ``BookProject`` built earlier and is skipped (not failed) when that
    project could not be created.
    """
    print_header("BOOK RESEARCH WORKFLOW TESTS (v3.0)")

    # Test 1: Workflow models import
    try:
        from book_workflow_models import (
            BookProject,
            ChapterProject,
            SubjectResearch,
            WorkflowCheckpoint,
            generate_project_id
        )
        results.add_pass("Workflow models import")
    except Exception as e:
        results.add_fail("Workflow models import", str(e))
        return

    # Test 2: Input parsers import
    try:
        from book_workflow_input import (
            parse_yaml_outline,
            parse_subjects_file,
            parse_subjects_list
        )
        results.add_pass("Workflow input parsers import")
    except Exception as e:
        results.add_fail("Workflow input parsers import", str(e))

    # Test 3: Phase executors import
    try:
        from book_workflow_phases import (
            Phase1Executor,
            Phase2Executor,
            Phase3Executor
        )
        results.add_pass("Workflow phase executors import")
    except Exception as e:
        results.add_fail("Workflow phase executors import", str(e))

    # Test 4: Output generators import
    try:
        from book_workflow_output import (
            generate_gaps_markdown,
            generate_research_summary,
            generate_sources_json
        )
        results.add_pass("Workflow output generators import")
    except Exception as e:
        results.add_fail("Workflow output generators import", str(e))

    # Test 5: Generate project ID
    try:
        project_id = generate_project_id()
        if project_id.startswith("BOOK_"):
            results.add_pass("Project ID generation", f"ID: {project_id}")
        else:
            results.add_fail("Project ID generation", "Invalid ID format")
    except Exception as e:
        results.add_fail("Project ID generation", str(e))

    # Test 6: BookProject creation
    # Bound up front so Test 9 can detect a failed creation instead of
    # raising NameError on an unbound local.
    project = None
    try:
        project = BookProject(
            project_id=generate_project_id(),
            title="Test Book",
            author="Test Author"
        )
        results.add_pass("BookProject creation", f"Title: {project.title}")
    except Exception as e:
        results.add_fail("BookProject creation", str(e))

    # Test 7: ChapterProject creation
    try:
        chapter = ChapterProject(
            chapter_id="CH_01_test",
            chapter_number=1,
            title="Test Chapter",
            subjects=["Subject 1", "Subject 2"]
        )
        if chapter.chapter_number == 1 and len(chapter.subjects) == 2:
            results.add_pass("ChapterProject creation", f"Chapter: {chapter.title}")
        else:
            results.add_fail("ChapterProject creation", "Invalid chapter")
    except Exception as e:
        results.add_fail("ChapterProject creation", str(e))

    # Test 8: SubjectResearch creation
    try:
        research = SubjectResearch(
            subject="Test Subject",
            query_used="test query"
        )
        if research.status == "pending":
            results.add_pass("SubjectResearch creation")
        else:
            results.add_fail("SubjectResearch creation", f"Wrong status: {research.status}")
    except Exception as e:
        results.add_fail("SubjectResearch creation", str(e))

    # Test 9: WorkflowCheckpoint creation
    if project is None:
        # The BookProject failure is already recorded; don't count it twice.
        results.add_skip("WorkflowCheckpoint creation", "BookProject creation failed")
    else:
        try:
            checkpoint = WorkflowCheckpoint(
                book_project_id=project.project_id,
                phase=1,
                chapter_index=0,
                subject_index=0
            )
            results.add_pass("WorkflowCheckpoint creation", f"Phase: {checkpoint.phase}")
        except Exception as e:
            results.add_fail("WorkflowCheckpoint creation", str(e))

    # Test 10: Workflow config exists
    try:
        from config import BOOK_WORKFLOW_CONFIG
        if 'projects_dir' in BOOK_WORKFLOW_CONFIG:
            results.add_pass("BOOK_WORKFLOW_CONFIG", f"Dir: {BOOK_WORKFLOW_CONFIG['projects_dir']}")
        else:
            results.add_fail("BOOK_WORKFLOW_CONFIG", "Missing projects_dir")
    except Exception as e:
        results.add_fail("BOOK_WORKFLOW_CONFIG", str(e))


# =============================================================================
# TEST MODULE: V3 Features - All V3 Tests Combined
# =============================================================================
def test_all_v3_features(results: TestResults):
    """Run every V3 feature test module, in order, against ``results``."""
    print_header("V3 FEATURES TEST SUITE")
    print("Running all V3 feature tests...")

    # Order matches the sequence the modules are reported in.
    v3_suites = (
        test_citation_management,
        test_document_pinning,
        test_gap_pinning,
        test_context_selection,
        test_semantic_graph,
        test_research_sessions,
        test_knowledge_rules,
        test_book_workflow,
    )
    for run_suite in v3_suites:
        run_suite(results)


# =============================================================================
# TEST MODULE: Advanced RAG - Cross-Encoder Re-ranking (v2.1)
# =============================================================================
def test_reranking(results: TestResults):
    """Check cross-encoder re-ranking on mock data and on the database.

    Outcomes are recorded on ``results``. Returns early when
    ``sentence_transformers`` is not importable (skip), when the re-ranking
    helpers cannot be imported from ``db_utils``, or when the cross-encoder
    model is unavailable or fails to load.
    """
    print_header("CROSS-ENCODER RE-RANKING TESTS (v2.1)")

    # Test 1: sentence-transformers library available
    try:
        from sentence_transformers import CrossEncoder
        results.add_pass("sentence-transformers import")
    except ImportError:
        results.add_skip("sentence-transformers import", "Not installed - pip install sentence-transformers")
        return

    # Test 2: Re-ranking functions import
    try:
        from db_utils import rerank_results, hybrid_search_with_rerank, _get_cross_encoder
        results.add_pass("Re-ranking functions import")
    except Exception as exc:
        results.add_fail("Re-ranking functions import", str(exc))
        return

    # Test 3: Cross-encoder model loads
    try:
        cross_encoder = _get_cross_encoder()
        if cross_encoder is None:
            results.add_skip("Cross-encoder model loaded", "Model unavailable")
            return
        results.add_pass("Cross-encoder model loaded")
    except Exception as exc:
        results.add_fail("Cross-encoder model loaded", str(exc))
        return

    # Test 4: Re-ranking with mock data
    try:
        candidates = [
            {'chunk_id': 'C1', 'chunk_text': 'The etheric body relates to life forces.', 'title': 'Doc A'},
            {'chunk_id': 'C2', 'chunk_text': 'Mathematics is about numbers.', 'title': 'Doc B'},
            {'chunk_id': 'C3', 'chunk_text': 'The etheric body and memory are connected.', 'title': 'Doc C'},
        ]
        reranked = rerank_results(
            query="How does the etheric body relate to memory?",
            results=candidates,
            top_k=3,
        )

        if len(reranked) == 0 or 'rerank_score' not in reranked[0]:
            results.add_fail("Re-ranking execution", "No scores returned")
        else:
            # C3 (on-topic) is expected to outscore C2 (off-topic).
            score_by_chunk = {}
            for item in reranked:
                score_by_chunk[item['chunk_id']] = item['rerank_score']
            if score_by_chunk.get('C3', 0) > score_by_chunk.get('C2', 0):
                results.add_pass(
                    "Re-ranking accuracy",
                    f"C3 ({score_by_chunk['C3']:.3f}) > C2 ({score_by_chunk['C2']:.3f})",
                )
            else:
                results.add_fail("Re-ranking accuracy", f"Scores: {score_by_chunk}")
    except Exception as exc:
        results.add_fail("Re-ranking execution", str(exc))

    # Test 5: Hybrid search with re-ranking (if database has data)
    try:
        from db_utils import execute_query
        count_row = execute_query("SELECT COUNT(*) as count FROM chunks", fetch='one')
        if not (count_row and count_row.get('count', 0) > 0):
            results.add_skip("Hybrid search with re-ranking", "No chunks in database")
        else:
            hits = hybrid_search_with_rerank(
                query_text="philosophy of freedom",
                limit=5,
                initial_fetch=20,
                use_rerank=True,
            )
            # An empty result set still counts as a pass.
            detail = f"Found {len(hits)} results" if hits else "No results (empty index)"
            results.add_pass("Hybrid search with re-ranking", detail)
    except Exception as exc:
        results.add_fail("Hybrid search with re-ranking", str(exc))


# =============================================================================
# TEST MODULE: Advanced RAG - GraphRAG (v2.1)
# =============================================================================
def test_graphrag(results: TestResults):
    """Check the GraphRAG helpers in ``db_utils`` against the database.

    Outcomes are recorded on ``results``. Returns early only if the GraphRAG
    functions cannot be imported. Checks that need concepts or chunks are
    skipped when the corresponding query returns nothing.
    """
    print_header("GRAPHRAG TESTS (v2.1)")

    # Test 1: GraphRAG functions import
    try:
        from db_utils import (
            graphrag_search,
            find_concept_by_name,
            get_related_concepts,
            get_chunks_for_concepts,
            get_chunk_with_context,
        )
        results.add_pass("GraphRAG functions import")
    except Exception as exc:
        results.add_fail("GraphRAG functions import", str(exc))
        return

    # Test 2: Check if concepts exist in database
    try:
        from db_utils import execute_query
        count_row = execute_query("SELECT COUNT(*) as count FROM concepts", fetch='one')
        if not (count_row and count_row.get('count', 0) > 0):
            # Not fatal: the later checks still run and skip individually.
            results.add_skip("Concepts table populated", "No concepts - run extract_concepts.py first")
        else:
            results.add_pass("Concepts table populated", f"Count: {count_row['count']}")
    except Exception as exc:
        results.add_fail("Concepts table check", str(exc))

    # Test 3: Concept lookup (fuzzy matching)
    try:
        from db_utils import execute_query
        # Use any existing concept name as the lookup target.
        name_row = execute_query("SELECT name FROM concepts LIMIT 1", fetch='one')
        if not name_row:
            results.add_skip("Fuzzy concept lookup", "No concepts in database")
        else:
            match = find_concept_by_name(name_row['name'], fuzzy=True)
            if match and match.get('name'):
                results.add_pass("Fuzzy concept lookup", f"Found: {match['name']}")
            else:
                results.add_fail("Fuzzy concept lookup", "Concept not found")
    except Exception as exc:
        results.add_fail("Fuzzy concept lookup", str(exc))

    # Test 4: Related concepts traversal
    try:
        from db_utils import execute_query
        id_row = execute_query("SELECT concept_id FROM concepts LIMIT 1", fetch='one')
        if not id_row:
            results.add_skip("Related concepts traversal", "No concepts in database")
        else:
            neighbours = get_related_concepts(
                concept_id=id_row['concept_id'],
                depth=2,
                min_cooccurrence=1,
            )
            results.add_pass("Related concepts traversal", f"Found {len(neighbours)} related concepts")
    except Exception as exc:
        results.add_fail("Related concepts traversal", str(exc))

    # Test 5: Full GraphRAG search
    try:
        from db_utils import execute_query
        count_row = execute_query("SELECT COUNT(*) as count FROM concepts", fetch='one')
        if not (count_row and count_row.get('count', 0) > 0):
            results.add_skip("GraphRAG search", "No concepts in database")
        else:
            response = graphrag_search(
                query="philosophy freedom thinking",
                limit=5,
                hop_depth=2,
                include_direct_search=True,
            )
            if response and 'results' in response:
                n_concepts = len(response.get('concepts', []))
                n_paths = len(response.get('graph_paths', []))
                n_results = len(response.get('results', []))
                results.add_pass(
                    "GraphRAG search",
                    f"Concepts: {n_concepts}, Paths: {n_paths}, Results: {n_results}",
                )
            else:
                results.add_fail("GraphRAG search", "Invalid response structure")
    except Exception as exc:
        results.add_fail("GraphRAG search", str(exc))

    # Test 6: Context retrieval
    try:
        from db_utils import execute_query
        chunk_row = execute_query("SELECT chunk_id FROM chunks LIMIT 1", fetch='one')
        if not chunk_row:
            results.add_skip("Chunk context retrieval", "No chunks in database")
        else:
            chunk_context = get_chunk_with_context(
                chunk_id=chunk_row['chunk_id'],
                context_chunks=1,
                use_parent=True,
            )
            if chunk_context and 'chunk' in chunk_context:
                results.add_pass("Chunk context retrieval", "Context returned")
            else:
                results.add_fail("Chunk context retrieval", "No context returned")
    except Exception as exc:
        results.add_fail("Chunk context retrieval", str(exc))


# =============================================================================
# TEST MODULE: Advanced RAG - Semantic Chunking (v2.1)
# =============================================================================
def test_semantic_chunking(results: TestResults):
    """Check ``SemanticChunker`` boundary detection and chunk output.

    Outcomes are recorded on ``results``. Returns early when
    ``sentence_transformers`` is not importable (skip), when ``SemanticChunker``
    cannot be imported or constructed, or when its embedding model is
    unavailable or fails to load.
    """
    print_header("SEMANTIC CHUNKING TESTS (v2.1)")

    # Test 1: sentence-transformers for embeddings
    try:
        from sentence_transformers import SentenceTransformer
        results.add_pass("SentenceTransformer import")
    except ImportError:
        results.add_skip("SentenceTransformer import", "Not installed - pip install sentence-transformers")
        return

    # Test 2: SemanticChunker import
    try:
        from chunk_documents import SemanticChunker
        results.add_pass("SemanticChunker import")
    except Exception as exc:
        results.add_fail("SemanticChunker import", str(exc))
        return

    # Test 3: SemanticChunker initialization
    try:
        chunker_options = {
            'min_tokens': 50,
            'max_tokens': 500,
            'target_tokens': 200,
            'similarity_threshold': 0.5,
            'window_size': 2,
        }
        chunker = SemanticChunker(**chunker_options)
        results.add_pass("SemanticChunker initialization")
    except Exception as exc:
        results.add_fail("SemanticChunker initialization", str(exc))
        return

    # Test 4: Embedding model loads
    try:
        embedding_model = chunker._get_embedding_model()
        if embedding_model is None:
            results.add_skip("Embedding model loaded", "Model unavailable")
            return
        results.add_pass("Embedding model loaded", "all-MiniLM-L6-v2")
    except Exception as exc:
        results.add_fail("Embedding model loaded", str(exc))
        return

    # Sample text with two topic changes (philosophy -> cooking -> philosophy),
    # shared by Tests 5 and 6.
    sample_text = """
    The philosophy of freedom explores human consciousness and thinking.
    Pure thinking leads to genuine self-determined action.
    This is central to Steiner's epistemology.

    Now we discuss a completely different topic about cooking.
    Recipes require precise measurements and timing.
    Baking bread needs patience and proper temperature.

    Returning to philosophy, ethics is concerned with moral principles.
    Right action stems from individual moral intuition.
    Freedom and responsibility are interconnected concepts.
    """

    # Test 5: Semantic boundary detection
    try:
        sentence_list = chunker.split_into_sentences(sample_text)
        boundary_list = chunker.find_semantic_boundaries(sentence_list)
        # An empty boundary list is accepted; only the message differs.
        if len(boundary_list) > 0:
            detail = f"Found {len(boundary_list)} boundaries"
        else:
            detail = "No boundaries (text may be too uniform)"
        results.add_pass("Semantic boundary detection", detail)
    except Exception as exc:
        results.add_fail("Semantic boundary detection", str(exc))

    # Test 6: Full semantic chunking
    try:
        produced = chunker.chunk_semantic(sample_text, "TEST_DOC_001")
        if len(produced) == 0:
            results.add_fail("Semantic chunking", "No chunks created")
        else:
            results.add_pass("Semantic chunking", f"Created {len(produced)} chunks")
            # Verify the first chunk is tagged with the semantic method.
            first_chunk = produced[0]
            if 'chunk_method' in first_chunk and first_chunk['chunk_method'] == 'semantic':
                results.add_pass("Semantic chunk metadata", "chunk_method='semantic'")
            else:
                results.add_fail("Semantic chunk metadata", f"Got: {first_chunk.get('chunk_method')}")
    except Exception as exc:
        results.add_fail("Semantic chunking", str(exc))


# =============================================================================
# TEST MODULE: Advanced RAG - Research Agent (v2.1)
# =============================================================================
def test_research_agent(results: TestResults, api_key: str = None):
    """Check the autonomous research agent, from import to a full session.

    Outcomes are recorded on ``results``. Returns early if the
    ``research_agent`` import or the ``ResearchAgent`` construction fails.

    Args:
        results: Tracker that receives pass/fail/skip outcomes.
        api_key: When truthy, the full research session is attempted even if
            the agent's own LLM client is ``None`` (database content is still
            required).
    """
    print_header("RESEARCH AGENT TESTS (v2.1)")

    # Test 1: Research agent import
    try:
        from research_agent import ResearchAgent, ResearchSession, format_markdown_report
        results.add_pass("Research agent import")
    except Exception as exc:
        results.add_fail("Research agent import", str(exc))
        return

    # Test 2: LLM interface initialization
    try:
        from research_agent import LLMInterface
        llm = LLMInterface()
        # A missing client is still a pass; it is reported as fallback mode.
        if llm.client is None:
            results.add_pass("LLM interface initialized", "Fallback mode (no LLM)")
        else:
            results.add_pass("LLM interface initialized", f"Mode: {llm.mode}, Model: {llm.model}")
    except Exception as exc:
        results.add_fail("LLM interface initialized", str(exc))

    # Test 3: Research agent initialization
    try:
        agent = ResearchAgent(
            max_iterations=2,
            min_results_per_query=1,
            use_graphrag=True,
            use_rerank=True,
        )
        results.add_pass("Research agent initialization")
    except Exception as exc:
        results.add_fail("Research agent initialization", str(exc))
        return

    # Test 4: Query planning
    try:
        planned = agent.plan_research("Compare Steiner and Jung on dreams")
        if not isinstance(planned, list) or len(planned) == 0:
            results.add_fail("Query planning", f"Invalid result: {planned}")
        else:
            results.add_pass("Query planning", f"Generated {len(planned)} sub-queries")
    except Exception as exc:
        results.add_fail("Query planning", str(exc))

    # Test 5: Search execution (requires database content)
    try:
        from db_utils import execute_query
        count_row = execute_query("SELECT COUNT(*) as count FROM chunks", fetch='one')
        if not (count_row and count_row.get('count', 0) > 0):
            results.add_skip("Agent search execution", "No chunks in database")
        else:
            hits = agent.execute_search("philosophy of freedom")
            results.add_pass("Agent search execution", f"Found {len(hits)} results")
    except Exception as exc:
        results.add_fail("Agent search execution", str(exc))

    # Test 6: Full research session (only if LLM available and database has content)
    try:
        from db_utils import execute_query
        count_row = execute_query("SELECT COUNT(*) as count FROM chunks", fetch='one')
        enough_data = count_row and count_row.get('count', 0) > 5

        if not (enough_data and (api_key or agent.llm.client is not None)):
            if api_key or agent.llm.client:
                skip_reason = "Not enough data"
            else:
                skip_reason = "No LLM available"
            results.add_skip("Full research session", skip_reason)
        else:
            # Run a short research session
            short_agent = ResearchAgent(max_iterations=2, use_graphrag=False, use_rerank=False)
            session = short_agent.research("What is consciousness?")

            if not isinstance(session, ResearchSession):
                results.add_fail("Full research session", "Invalid session object")
            else:
                results.add_pass(
                    "Full research session",
                    f"Iterations: {session.iterations}, Chunks: {session.total_chunks}",
                )

                # Test report generation
                markdown = format_markdown_report(session)
                if '# Research Report' not in markdown:
                    results.add_fail("Report generation", "Invalid report format")
                else:
                    results.add_pass("Report generation", f"Length: {len(markdown)} chars")
    except Exception as exc:
        results.add_fail("Full research session", str(exc))


# =============================================================================
# TEST MODULE: Advanced RAG - Comparison Tests (v2.1)
# =============================================================================
def test_rag_comparison(results: TestResults):
    """Exercise keyword, GraphRAG and hybrid/re-ranked search side by side.

    One entry per check is recorded on ``results``. The function returns
    early after a skip when the chunk-count query yields nothing or fewer
    than five chunks, and after a failure when the keyword baseline raises.
    """
    print_header("RAG COMPARISON TESTS (v2.1)")

    # Test 1: keyword search baseline (also gates the remaining checks)
    try:
        from db_utils import keyword_search, execute_query

        count_row = execute_query("SELECT COUNT(*) as count FROM chunks", fetch='one')
        if not count_row or count_row.get('count', 0) < 5:
            results.add_skip("RAG comparison", "Not enough data in database")
            return

        keyword_hits = keyword_search(query_text="etheric body memory", limit=10)
        results.add_pass("Keyword search baseline", f"Found {len(keyword_hits)} results")
    except Exception as exc:
        results.add_fail("Keyword search baseline", str(exc))
        return

    # Test 2: GraphRAG retrieval (multi-hop), graph-based results only
    try:
        from db_utils import graphrag_search

        graph_response = graphrag_search(
            query="etheric body memory",
            limit=10,
            hop_depth=2,
            include_direct_search=False,
        )
        chunk_total = len(graph_response.get('results', []))
        path_total = len(graph_response.get('graph_paths', []))

        # Both outcomes count as a pass; only the detail message differs.
        if path_total > 0:
            detail = f"Chunks: {chunk_total}, Graph paths: {path_total}"
        else:
            detail = f"Chunks: {chunk_total} (no paths found)"
        results.add_pass("GraphRAG multi-hop", detail)
    except Exception as exc:
        results.add_fail("GraphRAG multi-hop", str(exc))

    # Test 3: hybrid search with and without re-ranking
    try:
        from db_utils import hybrid_search_with_rerank

        plain_hits = hybrid_search_with_rerank(
            query_text="etheric body memory",
            limit=5,
            use_rerank=False,
        )
        reranked_hits = hybrid_search_with_rerank(
            query_text="etheric body memory",
            limit=5,
            initial_fetch=20,
            use_rerank=True,
        )

        if not (reranked_hits and plain_hits):
            results.add_pass("Hybrid search comparison", "Results returned")
        else:
            # Compare the chunk ids of the first three hits of each run.
            plain_top = [hit.get('chunk_id') for hit in plain_hits[:3]]
            reranked_top = [hit.get('chunk_id') for hit in reranked_hits[:3]]

            if plain_top == reranked_top:
                results.add_pass("Re-ranking stable", "Top-3 results unchanged")
            else:
                results.add_pass("Re-ranking changes order", "Top-3 results differ")
    except Exception as exc:
        results.add_fail("Hybrid search comparison", str(exc))

    # Test 4: precision test - a specific factual question with re-ranking
    try:
        from db_utils import hybrid_search_with_rerank

        precise_hits = hybrid_search_with_rerank(
            query_text="What is the exact date of the Christmas Conference?",
            limit=10,
            initial_fetch=50,
            use_rerank=True,
        )

        if precise_hits and len(precise_hits) > 0:
            best_hit = precise_hits[0]
            # Prefer the re-rank score, then the RRF score, then a placeholder.
            fallback_score = best_hit.get('rrf_score', 'N/A')
            shown_score = best_hit.get('rerank_score', fallback_score)
            results.add_pass("Precision query test", f"Top result score: {shown_score}")
        else:
            results.add_pass("Precision query test", "No results (may not be in corpus)")
    except Exception as exc:
        results.add_fail("Precision query test", str(exc))


# =============================================================================
# TEST MODULE: Embeddings (requires API key)
# =============================================================================
def test_embeddings(results: TestResults, api_key: str = None):
    """Request one OpenAI embedding and check that it has 1536 dimensions.

    Args:
        results: Collector that receives the pass/fail/skip entry.
        api_key: OpenAI API key. When falsy the check is recorded as skipped
            and nothing else happens; otherwise it is also exported as the
            ``OPENAI_API_KEY`` environment variable.
    """
    print_header("EMBEDDING TESTS")

    if not api_key:
        results.add_skip("Embedding generation", "No API key provided")
        return

    os.environ['OPENAI_API_KEY'] = api_key

    try:
        from openai import OpenAI

        openai_client = OpenAI(api_key=api_key)
        reply = openai_client.embeddings.create(
            model="text-embedding-3-small",
            input="Test embedding generation",
        )

        # Short-circuits so an empty data list is never indexed.
        has_expected_shape = bool(reply.data) and len(reply.data[0].embedding) == 1536
        if not has_expected_shape:
            results.add_fail("OpenAI embedding generation", "Unexpected response")
        else:
            results.add_pass("OpenAI embedding generation", "1536 dimensions")
    except Exception as exc:
        results.add_fail("OpenAI embedding generation", str(exc))


# =============================================================================
# TEST MODULE: Document Processing
# =============================================================================
def test_document_processing(results: TestResults):
    """Test document processing capabilities.

    Writes a sample text document under ``SCRIPT_DIR/test_documents``, checks
    that it can be read back, runs ``TextChunker`` over the same content, and
    removes the file afterwards. One entry per check is recorded on
    ``results``. If the document cannot be created the remaining checks are
    not run.
    """
    print_header("DOCUMENT PROCESSING TESTS")

    # Create a test document with enough content for chunking
    test_doc_path = SCRIPT_DIR / 'test_documents' / 'test_document.txt'
    test_content = """
    # Test Document: Philosophy and Knowledge

    This is a comprehensive test document for the QA testing suite.
    It contains multiple paragraphs to thoroughly test the chunking algorithm.
    The document explores various topics related to philosophy, knowledge, and understanding.

    ## Section One: The Nature of Knowledge

    The first section discusses the fundamental nature of knowledge and how we come to understand the world around us. Epistemology is the branch of philosophy concerned with the theory of knowledge. It asks questions like: What is knowledge? How is knowledge acquired? What do people know? How do we know what we know?

    Knowledge has been defined in various ways throughout history. Plato famously defined knowledge as justified true belief. This means that for something to count as knowledge, it must be believed, it must be true, and there must be adequate justification for the belief. However, this definition has been challenged by various counterexamples.

    The scientific method represents one approach to acquiring knowledge. Through observation, hypothesis formation, experimentation, and analysis, scientists build up our understanding of the natural world. This process is iterative and self-correcting, with new evidence leading to revised theories.

    ## Section Two: Understanding and Comprehension

    The second section explores the nature of understanding and how it relates to knowledge. Understanding is often considered a deeper form of knowing than mere factual knowledge. To understand something is to grasp how its parts relate to each other and to the whole.

    Comprehension involves making connections between new information and existing knowledge. When we truly understand something, we can explain it to others, apply it in new situations, and see its implications. This goes beyond simple memorization or recall.

    Educational research has shown that deep understanding requires active engagement with material. Passive reading or listening is less effective than activities that require learners to process and transform information. Testing, discussion, and application all promote deeper learning.

    ## Section Three: Testing and Quality Assurance

    Testing is a crucial component of ensuring software quality and reliability. Quality assurance helps identify bugs and issues early in the development process, when they are less costly to fix. A comprehensive testing strategy includes unit tests, integration tests, and end-to-end tests.

    Automated testing allows for rapid feedback during development. Continuous integration systems can run tests automatically whenever code changes are committed. This helps catch regressions quickly and maintains confidence in the codebase.

    Manual testing remains important for aspects that are difficult to automate. User experience testing, exploratory testing, and acceptance testing often require human judgment. The best testing strategies combine automated and manual approaches.

    ## Conclusion

    This document has explored various aspects of knowledge, understanding, and testing. These concepts are interconnected: testing requires knowledge of what to test, understanding helps in designing effective tests, and the testing process itself generates new knowledge about the system under test.

    Quality assurance is not just about finding bugs; it is about building confidence that the system meets its requirements and serves its users well. This requires both technical skills and domain understanding.
    """

    try:
        test_doc_path.parent.mkdir(parents=True, exist_ok=True)
        test_doc_path.write_text(test_content, encoding='utf-8')
        results.add_pass("Test document created")
    except Exception as e:
        results.add_fail("Test document created", str(e))
        return

    # From here on the file exists, so removal is guaranteed via ``finally``
    # even if a check is interrupted (e.g. by Ctrl-C).
    try:
        # Test text extraction
        try:
            from ingest_documents import DocumentProcessor
            # Instantiated only as a smoke check; the instance is not used.
            DocumentProcessor(dry_run=True)

            # Test file type detection
            if test_doc_path.suffix in ['.txt', '.md']:
                results.add_pass("File type detection")

            # Test text reading
            text = test_doc_path.read_text(encoding='utf-8')
            if len(text) > 100:
                results.add_pass("Text extraction", f"Length: {len(text)}")
            else:
                results.add_fail("Text extraction", "Text too short")
        except Exception as e:
            results.add_fail("Text extraction", str(e))

        # Test chunking
        try:
            from chunk_documents import TextChunker
            chunker = TextChunker()
            # chunk_text is consumed eagerly so the chunks can be counted
            chunks = list(chunker.chunk_text(test_content, "test_doc"))
            if chunks:
                results.add_pass("Text chunking", f"Chunks: {len(chunks)}")
            else:
                results.add_fail("Text chunking", "No chunks created")
        except Exception as e:
            results.add_fail("Text chunking", str(e))
    finally:
        # Best-effort cleanup: a file that is already gone or cannot be
        # removed must not affect the recorded results.
        try:
            test_doc_path.unlink()
        except OSError:
            pass


# =============================================================================
# TEST MODULE: RDF CLI
# =============================================================================
def test_rdf_cli(results: TestResults):
    """Test rdf CLI commands.

    Runs the ``rdf`` script in ``PROJECT_DIR`` through ``python3`` as a
    subprocess and records one entry per command on ``results``. If the
    script file does not exist, a failure is recorded and the remaining
    checks are not run.
    """
    print_header("RDF CLI TESTS")

    import subprocess

    rdf_path = PROJECT_DIR / 'rdf'

    def run_rdf(*cli_args, timeout=30):
        """Run the rdf script with ``cli_args`` and capture its text output."""
        return subprocess.run(
            ['python3', str(rdf_path), *cli_args],
            capture_output=True, text=True, timeout=timeout
        )

    def check_json_status(name, command, timeout, non_json_reason):
        """Run ``rdf <command> --format json`` and record the outcome.

        A non-zero exit code or non-JSON output is recorded as a skip; an
        exception (including a timeout) is recorded as a failure.
        """
        try:
            proc = run_rdf(command, '--format', 'json', timeout=timeout)
            if proc.returncode != 0:
                results.add_skip(name, f"Exit code: {proc.returncode}")
                return
            try:
                payload = json.loads(proc.stdout)
            except json.JSONDecodeError:
                results.add_skip(name, non_json_reason)
                return
            if 'status' in payload:
                results.add_pass(name, f"Status: {payload.get('status')}")
            else:
                results.add_pass(name, "JSON returned")
        except Exception as e:
            results.add_fail(name, str(e))

    # Test 1: rdf script exists (only existence is checked, not the exec bit)
    try:
        if rdf_path.exists():
            results.add_pass("rdf script exists", str(rdf_path))
        else:
            results.add_fail("rdf script exists", f"Not found: {rdf_path}")
            return
    except Exception as e:
        results.add_fail("rdf script exists", str(e))
        return

    # Test 2: rdf --help works
    try:
        proc = run_rdf('--help')
        if proc.returncode == 0 and 'usage' in proc.stdout.lower():
            results.add_pass("rdf --help", "Help output displayed")
        else:
            results.add_fail("rdf --help", f"Exit code: {proc.returncode}")
    except Exception as e:
        results.add_fail("rdf --help", str(e))

    # Test 3: rdf commands returns JSON
    try:
        proc = run_rdf('commands')
        if proc.returncode == 0:
            try:
                commands = json.loads(proc.stdout)
                if isinstance(commands, dict) and 'commands' in commands:
                    cmd_count = len(commands['commands'])
                    results.add_pass("rdf commands", f"Found {cmd_count} commands")
                else:
                    results.add_pass("rdf commands", "JSON returned")
            except json.JSONDecodeError:
                results.add_fail("rdf commands", "Invalid JSON output")
        else:
            results.add_fail("rdf commands", f"Exit code: {proc.returncode}")
    except Exception as e:
        results.add_fail("rdf commands", str(e))

    # Test 4: rdf status works
    check_json_status("rdf status", 'status', 30, "Non-JSON output (may be text mode)")

    # Test 5: rdf health works (given a longer timeout than the other commands)
    check_json_status("rdf health", 'health', 60, "Non-JSON output")

    # Test 6: rdf diff works with test files
    try:
        # Create two temp files
        file_a = PROJECT_DIR / 'QA_TEST' / 'temp_a.txt'
        file_b = PROJECT_DIR / 'QA_TEST' / 'temp_b.txt'
        try:
            file_a.write_text("Line 1\nLine 2\nLine 3\n")
            file_b.write_text("Line 1\nLine 2 modified\nLine 3\n")

            proc = run_rdf('diff', str(file_a), str(file_b), '--format', 'json')
        finally:
            # Cleanup runs even when the subprocess times out or the second
            # write fails, so no temp files are left behind.
            for temp_file in (file_a, file_b):
                if temp_file.exists():
                    temp_file.unlink()

        if proc.returncode == 0:
            results.add_pass("rdf diff", "Diff comparison works")
        else:
            results.add_skip("rdf diff", f"Exit code: {proc.returncode}")
    except Exception as e:
        results.add_fail("rdf diff", str(e))


# =============================================================================
# MAIN TEST RUNNER
# =============================================================================
def run_all_tests(api_key: str = None, quick: bool = False, module: str = None) -> TestResults:
    """Run the test suite, or a single named module, and print a summary.

    Args:
        api_key: OpenAI API key. Passed to the tests that accept one; the
            cloud classification and embedding tests only run when it is set.
        quick: When true (and no ``module`` is given) only the configuration,
            library, database and pipeline tests run.
        module: Name of a single test module to run instead of the full
            suite. An unknown name is recorded as a failed check so the
            caller does not mistake "nothing ran" for success.

    Returns:
        The populated ``TestResults`` with ``start_time`` and ``end_time`` set.
    """
    results = TestResults()
    results.start_time = time.time()

    print("\n" + "="*60)
    print(" RESEARCH DEVELOPMENT FRAMEWORK - QA TEST SUITE")
    print("="*60)
    print(f" Started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f" API Key: {'Provided' if api_key else 'Not provided'}")
    if module:
        print(f" Module:  {module}")
    print("="*60)

    # If specific module requested
    if module:
        def run_v3_feature_tests(r):
            """Run every V3 feature test, in the same order as the full suite."""
            for v3_test in (
                test_citation_management,
                test_document_pinning,
                test_gap_pinning,
                test_context_selection,
                test_semantic_graph,
                test_research_sessions,
                test_knowledge_rules,
                test_book_workflow,
            ):
                v3_test(r)

        module_map = {
            'config': test_configuration,
            'database': test_database,
            'libraries': test_libraries,
            'pipeline': test_pipeline,
            'classification': test_statistical_classification,
            'clustering': test_statistical_clustering,
            'processing': test_document_processing,
            'api': test_api_endpoints,
            'v3_schema': test_v3_schema,
            'knowledge_graph': test_knowledge_graph,
            'reranking': test_reranking,
            'graphrag': test_graphrag,
            'semantic_chunking': test_semantic_chunking,
            'research_agent': lambda r: test_research_agent(r, api_key),
            'rag_comparison': test_rag_comparison,
            'rag': lambda r: run_advanced_rag_tests(r, api_key),  # All RAG tests
            # V3 Feature modules
            'citation': test_citation_management,
            'pinning': test_document_pinning,
            'gaps': test_gap_pinning,
            'context': test_context_selection,
            'semantic_graph': test_semantic_graph,
            'sessions': test_research_sessions,
            'rules': test_knowledge_rules,
            'workflow': test_book_workflow,
            'v3': run_v3_feature_tests,  # All V3 feature tests
            'v4': test_rdf_cli,  # CLI tests
            'rdf_cli': test_rdf_cli,  # Alias for v4
        }

        test_func = module_map.get(module)
        if test_func is not None:
            test_func(results)
        else:
            # Count a mistyped module name as a failure: otherwise zero tests
            # run and the run still looks successful.
            results.add_fail("Module selection", f"Unknown module: {module}")
            print(f"Available modules: {', '.join(module_map.keys())}")

        results.end_time = time.time()
        print(results.summary())
        return results

    # Always run these tests
    test_configuration(results)
    test_libraries(results)
    test_database(results)
    test_pipeline(results)

    if not quick:
        test_statistical_classification(results)
        test_statistical_clustering(results)
        test_document_processing(results)
        test_api_endpoints(results)

        # v3.0 Schema tests
        test_v3_schema(results)
        test_knowledge_graph(results)

        # v2.1 Advanced RAG tests
        test_reranking(results)
        test_graphrag(results)
        test_semantic_chunking(results)
        test_research_agent(results, api_key)
        test_rag_comparison(results)

        # v3.0 Feature tests
        test_citation_management(results)
        test_document_pinning(results)
        test_gap_pinning(results)
        test_context_selection(results)
        test_semantic_graph(results)
        test_research_sessions(results)
        test_knowledge_rules(results)
        test_book_workflow(results)

        # Cloud tests (require API key)
        if api_key:
            test_cloud_classification(results, api_key)
            test_embeddings(results, api_key)

    results.end_time = time.time()
    print(results.summary())

    if results.errors:
        print("\nFAILED TESTS:")
        print("-"*60)
        for name, message in results.errors:
            print(f"  {name}: {message}")

    return results


def run_advanced_rag_tests(results: TestResults, api_key: str = None):
    """Run every Advanced RAG (v2.1) test module in sequence.

    Args:
        results: Collector shared by all of the test modules.
        api_key: Forwarded to the research agent tests only.
    """
    print_header("ADVANCED RAG TEST SUITE (v2.1)")
    print("Running all Advanced RAG feature tests...")

    # The research agent test takes the API key, so it is wrapped to give
    # every entry the same one-argument call shape.
    rag_suite = (
        test_reranking,
        test_graphrag,
        test_semantic_chunking,
        lambda collector: test_research_agent(collector, api_key),
        test_rag_comparison,
    )
    for run_module in rag_suite:
        run_module(results)


def generate_report(results: "TestResults", output_path: Path):
    """Generate an HTML test report and write it to ``output_path``.

    Args:
        results: Object exposing ``passed``, ``failed``, ``skipped``,
            ``start_time``, ``end_time`` and ``details`` (an iterable of
            ``(status, name, message)`` tuples).
        output_path: Destination file; it is overwritten if it exists. The
            parent directory must already exist.

    Test names and messages are HTML-escaped because they frequently contain
    exception text with ``<``, ``>`` or ``&``. The file is written as UTF-8
    and declares that charset.
    """
    from html import escape

    total = results.passed + results.failed + results.skipped

    # A run that never recorded its timestamps is reported as zero duration
    # rather than raising on the subtraction.
    if results.start_time is not None and results.end_time is not None:
        duration = results.end_time - results.start_time
    else:
        duration = 0.0

    # The lower-cased status doubles as the CSS class of the row.
    rows = ''.join(
        f'<tr class="{escape(status.lower())}"><td>{escape(status)}</td>'
        f'<td>{escape(str(name))}</td><td>{escape(str(msg))}</td></tr>'
        for status, name, msg in results.details
    )

    page = f"""
<!DOCTYPE html>
<html>
<head>
    <meta charset="utf-8">
    <title>QA Test Report - Research Development Framework</title>
    <style>
        body {{ font-family: Arial, sans-serif; margin: 40px; }}
        h1 {{ color: #333; }}
        .summary {{ background: #f5f5f5; padding: 20px; border-radius: 8px; margin: 20px 0; }}
        .pass {{ color: #28a745; }}
        .fail {{ color: #dc3545; }}
        .skip {{ color: #ffc107; }}
        table {{ border-collapse: collapse; width: 100%; margin-top: 20px; }}
        th, td {{ border: 1px solid #ddd; padding: 12px; text-align: left; }}
        th {{ background: #333; color: white; }}
        tr:nth-child(even) {{ background: #f9f9f9; }}
    </style>
</head>
<body>
    <h1>QA Test Report</h1>
    <p>Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</p>

    <div class="summary">
        <h2>Summary</h2>
        <p><strong>Total:</strong> {total}</p>
        <p class="pass"><strong>Passed:</strong> {results.passed}</p>
        <p class="fail"><strong>Failed:</strong> {results.failed}</p>
        <p class="skip"><strong>Skipped:</strong> {results.skipped}</p>
        <p><strong>Duration:</strong> {duration:.2f}s</p>
    </div>

    <h2>Test Details</h2>
    <table>
        <tr><th>Status</th><th>Test Name</th><th>Message</th></tr>
        {rows}
    </table>
</body>
</html>
"""
    output_path.write_text(page, encoding='utf-8')
    print(f"\nReport saved to: {output_path}")


def main():
    """Parse command-line options, run the tests, and exit with a status code.

    Exits with status 0 when no test failed and 1 otherwise. With ``--report``
    an HTML report is written under ``SCRIPT_DIR/reports`` before exiting.
    """
    parser = argparse.ArgumentParser(
        description='QA Test Runner for Research Development Framework v3.0',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Available test modules:
  config            Configuration loading
  database          Database connectivity
  libraries         Required libraries
  pipeline          Pipeline components
  classification    Statistical classification
  clustering        Statistical clustering
  processing        Document processing
  api               API endpoints
  v3_schema         v3.0 schema features
  knowledge_graph   Knowledge graph extraction

  Advanced RAG (v2.1):
  reranking         Cross-encoder re-ranking
  graphrag          GraphRAG knowledge graph retrieval
  semantic_chunking Semantic chunking
  research_agent    Autonomous research agent
  rag_comparison    RAG comparison tests
  rag               All Advanced RAG tests

  V3 Features:
  citation          Citation key management and freezing
  pinning           Document pinning
  gaps              Research gap pinning
  context           Smart context selection
  semantic_graph    Semantic graph traversal
  sessions          Interactive research sessions
  rules             Knowledge rules (living glossaries)
  workflow          Book research workflow
  v3                All V3 feature tests

  CLI:
  v4                rdf CLI tests
  rdf_cli           Alias for v4

Examples:
  python run_tests.py                    # Run all tests
  python run_tests.py --module rag       # Run Advanced RAG tests only
  python run_tests.py --module v3        # Run V3 feature tests only
  python run_tests.py --module v4        # Run CLI tests
  python run_tests.py --module workflow  # Run book workflow tests
  python run_tests.py --quick            # Quick smoke tests only
  python run_tests.py --report           # Generate HTML report
        """
    )
    parser.add_argument('--module', type=str, help='Run specific test module')
    parser.add_argument('--quick', action='store_true', help='Quick smoke tests only')
    parser.add_argument('--report', action='store_true', help='Generate HTML report')
    parser.add_argument('--api-key', type=str, help='OpenAI API key for cloud tests')
    parser.add_argument('--use-steiner-key', action='store_true', help='Use API key from STEINER project')

    args = parser.parse_args()

    api_key = args.api_key

    # Load API key from STEINER if requested; a key found there replaces
    # any value given via --api-key.
    if args.use_steiner_key:
        steiner_env = Path('/var/www/html/research/STEINER/.env')
        key_loaded = False
        if steiner_env.exists():
            for line in steiner_env.read_text().splitlines():
                # Skip the entry when it is present but has no value.
                if line.startswith('OPENAI_API_KEY=') and not line.endswith('='):
                    api_key = line.split('=', 1)[1].strip()
                    print("Loaded API key from STEINER project")
                    key_loaded = True
                    break
        if not key_loaded:
            # Say so explicitly instead of silently ignoring the flag.
            print(f"Warning: no OPENAI_API_KEY found in {steiner_env}")

    results = run_all_tests(api_key=api_key, quick=args.quick, module=args.module)

    if args.report:
        report_path = SCRIPT_DIR / 'reports' / f'test_report_{datetime.now().strftime("%Y%m%d_%H%M%S")}.html'
        report_path.parent.mkdir(parents=True, exist_ok=True)
        generate_report(results, report_path)

    # Return exit code based on results
    sys.exit(0 if results.failed == 0 else 1)


# Run the test runner only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
