-- =============================================================================
-- RESEARCH DEVELOPMENT FRAMEWORK - PostgreSQL Database Schema
-- =============================================================================
-- A comprehensive schema for document management, semantic search, and research
-- Based on proven patterns from STEINER and FREEMASON projects
--
-- Requirements:
--   - PostgreSQL 16+
--   - pgvector extension (for semantic search)
--   - pg_trgm extension (for fuzzy text matching)
-- =============================================================================

-- Enable required extensions
CREATE EXTENSION IF NOT EXISTS vector;
CREATE EXTENSION IF NOT EXISTS pg_trgm;

-- =============================================================================
-- CORE TABLES
-- =============================================================================

-- -----------------------------------------------------------------------------
-- Authors Table
-- Stores information about document authors.
-- Table and indexes are all created with IF NOT EXISTS so the script can be
-- re-run against a database that already contains them.
-- -----------------------------------------------------------------------------
CREATE TABLE IF NOT EXISTS authors (
    author_id SERIAL PRIMARY KEY,
    name VARCHAR(255) NOT NULL,
    name_normalized VARCHAR(255),           -- Lowercase, trimmed for matching
    birth_year INTEGER,
    death_year INTEGER,
    biography TEXT,
    nationality VARCHAR(100),
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);

-- B-tree indexes for exact lookups on the display name and the normalized name
CREATE INDEX IF NOT EXISTS idx_authors_name ON authors(name);
CREATE INDEX IF NOT EXISTS idx_authors_name_normalized ON authors(name_normalized);
-- Trigram GIN index for fuzzy name matching (operator class comes from pg_trgm)
CREATE INDEX IF NOT EXISTS idx_authors_name_trgm ON authors USING gin(name gin_trgm_ops);

-- -----------------------------------------------------------------------------
-- Documents Table
-- Main document metadata storage.
-- Table and indexes are all created with IF NOT EXISTS so the script can be
-- re-run against a database that already contains them.
-- -----------------------------------------------------------------------------
CREATE TABLE IF NOT EXISTS documents (
    document_id VARCHAR(100) PRIMARY KEY,   -- Unique identifier (e.g., PROJECT_001_EN)
    title VARCHAR(500) NOT NULL,
    subtitle VARCHAR(500),
    author_id INTEGER REFERENCES authors(author_id),  -- Optional; no ON DELETE action declared
    publication_year INTEGER,
    language_code VARCHAR(10) DEFAULT 'en', -- ISO 639-1 code
    edition_type VARCHAR(50),               -- original, translation, revised, etc.
    publisher VARCHAR(255),
    source_file VARCHAR(500),               -- Original filename
    file_path VARCHAR(1000),                -- Path to markdown file
    content_hash VARCHAR(64),               -- SHA-256 for duplicate detection
    word_count INTEGER DEFAULT 0,
    page_count INTEGER DEFAULT 0,
    chapter_count INTEGER DEFAULT 0,
    has_index BOOLEAN DEFAULT FALSE,
    has_bibliography BOOLEAN DEFAULT FALSE,
    processing_status VARCHAR(50) DEFAULT 'pending',  -- pending, processing, completed, failed
    pipeline_version VARCHAR(20) DEFAULT '1.0.0',
    ai_generated BOOLEAN DEFAULT FALSE,
    ai_model VARCHAR(100),
    ai_prompt TEXT,
    notes TEXT,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);

-- Lookup indexes for the common filter columns
CREATE INDEX IF NOT EXISTS idx_documents_title ON documents(title);
CREATE INDEX IF NOT EXISTS idx_documents_author ON documents(author_id);
CREATE INDEX IF NOT EXISTS idx_documents_year ON documents(publication_year);
CREATE INDEX IF NOT EXISTS idx_documents_language ON documents(language_code);
CREATE INDEX IF NOT EXISTS idx_documents_status ON documents(processing_status);
-- Non-unique: the hash index speeds up duplicate detection but does not prevent duplicates
CREATE INDEX IF NOT EXISTS idx_documents_hash ON documents(content_hash);

-- -----------------------------------------------------------------------------
-- Files Table
-- Tracks all file versions and locations.
-- Table and indexes are all created with IF NOT EXISTS so the script can be
-- re-run against a database that already contains them.
-- -----------------------------------------------------------------------------
CREATE TABLE IF NOT EXISTS files (
    file_id SERIAL PRIMARY KEY,
    -- No ON DELETE action: a document cannot be deleted while file rows still reference it
    document_id VARCHAR(100) REFERENCES documents(document_id),
    filename VARCHAR(500) NOT NULL,
    file_path VARCHAR(1000) NOT NULL,
    file_type VARCHAR(20),                  -- pdf, docx, md, txt, epub
    file_size_bytes BIGINT,
    content_hash VARCHAR(64),
    is_primary BOOLEAN DEFAULT FALSE,       -- Primary file for this document
    version INTEGER DEFAULT 1,
    upload_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    last_modified TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    needs_reprocessing BOOLEAN DEFAULT FALSE
);

CREATE INDEX IF NOT EXISTS idx_files_document ON files(document_id);
CREATE INDEX IF NOT EXISTS idx_files_hash ON files(content_hash);
CREATE INDEX IF NOT EXISTS idx_files_type ON files(file_type);

-- -----------------------------------------------------------------------------
-- Categories Table
-- Document categorization/classification.
-- parent_category_id is a self-reference, which allows categories to nest.
-- Table and index are created with IF NOT EXISTS so the script can be re-run.
-- -----------------------------------------------------------------------------
CREATE TABLE IF NOT EXISTS categories (
    category_id SERIAL PRIMARY KEY,
    name VARCHAR(100) NOT NULL UNIQUE,      -- Conflict target of the seed INSERT below
    description TEXT,
    parent_category_id INTEGER REFERENCES categories(category_id),  -- NULL for a top-level category
    sort_order INTEGER DEFAULT 0,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);

-- Supports "children of category X" lookups
CREATE INDEX IF NOT EXISTS idx_categories_parent ON categories(parent_category_id);

-- Many-to-many link between documents and categories.
-- Deleting either the document or the category removes the link row.
CREATE TABLE IF NOT EXISTS document_categories (
    document_id VARCHAR(100),
    category_id INTEGER,
    PRIMARY KEY (document_id, category_id),
    FOREIGN KEY (document_id) REFERENCES documents (document_id) ON DELETE CASCADE,
    FOREIGN KEY (category_id) REFERENCES categories (category_id) ON DELETE CASCADE
);

-- -----------------------------------------------------------------------------
-- Topics Table
-- Coarse research subject areas; each topic name may appear only once.
-- -----------------------------------------------------------------------------
CREATE TABLE IF NOT EXISTS topics (
    topic_id    SERIAL,
    name        VARCHAR(100) NOT NULL,
    description TEXT,
    keywords    TEXT[],                     -- Related keywords, stored as a text array
    created_at  TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    PRIMARY KEY (topic_id),
    UNIQUE (name)
);

-- Many-to-many link between documents and topics, weighted by relevance.
-- Deleting either the document or the topic removes the link row.
CREATE TABLE IF NOT EXISTS document_topics (
    document_id     VARCHAR(100),
    topic_id        INTEGER,
    relevance_score DOUBLE PRECISION DEFAULT 1.0,   -- 0.0 to 1.0
    PRIMARY KEY (document_id, topic_id),
    FOREIGN KEY (document_id) REFERENCES documents (document_id) ON DELETE CASCADE,
    FOREIGN KEY (topic_id) REFERENCES topics (topic_id) ON DELETE CASCADE
);

-- -----------------------------------------------------------------------------
-- Concepts Table
-- Fine-grained concept taxonomy for discovery.
-- Table and index are created with IF NOT EXISTS so the script can be re-run.
-- -----------------------------------------------------------------------------
CREATE TABLE IF NOT EXISTS concepts (
    concept_id SERIAL PRIMARY KEY,
    name VARCHAR(100) NOT NULL UNIQUE,
    category VARCHAR(50),                   -- Group concepts by type (free text, not a FK to categories)
    description TEXT,
    aliases TEXT[],                         -- Alternative names
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);

CREATE INDEX IF NOT EXISTS idx_concepts_category ON concepts(category);

-- Many-to-many link between documents and concepts, with a per-pair mention tally.
-- Deleting either the document or the concept removes the link row.
CREATE TABLE IF NOT EXISTS document_concepts (
    document_id   VARCHAR(100),
    concept_id    INTEGER,
    mention_count INTEGER DEFAULT 1,
    PRIMARY KEY (document_id, concept_id),
    FOREIGN KEY (document_id) REFERENCES documents (document_id) ON DELETE CASCADE,
    FOREIGN KEY (concept_id) REFERENCES concepts (concept_id) ON DELETE CASCADE
);

-- =============================================================================
-- CHUNKING AND EMBEDDING TABLES
-- =============================================================================

-- -----------------------------------------------------------------------------
-- Chunks Table
-- Searchable text units with embeddings.
-- Rows are removed automatically when their parent document is deleted.
-- Table and indexes are all created with IF NOT EXISTS so the script can be
-- re-run against a database that already contains them.
-- -----------------------------------------------------------------------------
CREATE TABLE IF NOT EXISTS chunks (
    chunk_id VARCHAR(100) PRIMARY KEY,      -- e.g., DOC_001_C001
    document_id VARCHAR(100) NOT NULL REFERENCES documents(document_id) ON DELETE CASCADE,
    chapter_number INTEGER,
    chunk_sequence INTEGER NOT NULL,        -- Order within document
    chunk_text TEXT NOT NULL,
    chunk_tokens INTEGER,                   -- Token count for this chunk
    embedding vector(1536),                 -- OpenAI text-embedding-3-small
    chunk_text_tsv tsvector,                -- Full-text search vector; no trigger or generated
                                            -- expression in this script fills it, so writers
                                            -- must supply it
    pipeline_version VARCHAR(20) DEFAULT '1.0.0',
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);

-- Performance indexes
CREATE INDEX IF NOT EXISTS idx_chunks_document ON chunks(document_id);
CREATE INDEX IF NOT EXISTS idx_chunks_sequence ON chunks(document_id, chunk_sequence);
-- Approximate nearest-neighbour search by cosine distance.
-- NOTE(review): pgvector recommends building IVFFlat indexes once the table
-- holds data; this one is created on an empty table, so consider a REINDEX
-- after the initial bulk load - confirm against the ingestion pipeline.
CREATE INDEX IF NOT EXISTS idx_chunks_embedding ON chunks USING ivfflat (embedding vector_cosine_ops) WITH (lists = 100);
CREATE INDEX IF NOT EXISTS idx_chunks_fts ON chunks USING gin(chunk_text_tsv);

-- -----------------------------------------------------------------------------
-- Chunk Concepts Table
-- Records which concepts occur in which chunk, with a per-pair mention tally.
-- Deleting either the chunk or the concept removes the link row.
-- -----------------------------------------------------------------------------
CREATE TABLE IF NOT EXISTS chunk_concepts (
    chunk_id      VARCHAR(100),
    concept_id    INTEGER,
    mention_count INTEGER DEFAULT 1,
    PRIMARY KEY (chunk_id, concept_id),
    FOREIGN KEY (chunk_id) REFERENCES chunks (chunk_id) ON DELETE CASCADE,
    FOREIGN KEY (concept_id) REFERENCES concepts (concept_id) ON DELETE CASCADE
);

-- =============================================================================
-- PROCESSING AND QUALITY TABLES
-- =============================================================================

-- -----------------------------------------------------------------------------
-- Processing Queue Table
-- Track documents awaiting processing.
-- Table and indexes are all created with IF NOT EXISTS so the script can be
-- re-run against a database that already contains them.
-- -----------------------------------------------------------------------------
CREATE TABLE IF NOT EXISTS processing_queue (
    queue_id SERIAL PRIMARY KEY,
    -- Neither FK declares an ON DELETE action: queue rows block deletion of the
    -- document/file they reference until the queue rows are removed
    document_id VARCHAR(100) REFERENCES documents(document_id),
    file_id INTEGER REFERENCES files(file_id),
    process_type VARCHAR(50) NOT NULL,      -- ingest, chunk, embed, extract_concepts
    priority INTEGER DEFAULT 5,             -- 1 (highest) to 10 (lowest)
    status VARCHAR(20) DEFAULT 'pending',   -- pending, processing, completed, failed
    error_message TEXT,
    attempts INTEGER DEFAULT 0,
    max_attempts INTEGER DEFAULT 3,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    started_at TIMESTAMP,
    completed_at TIMESTAMP
);

CREATE INDEX IF NOT EXISTS idx_queue_status ON processing_queue(status);
-- Matches a "lowest priority number first, oldest first" dequeue order
CREATE INDEX IF NOT EXISTS idx_queue_priority ON processing_queue(priority, created_at);

-- -----------------------------------------------------------------------------
-- Text Quality Table
-- OCR quality assessment and processing flags.
-- Rows are removed automatically when their document is deleted.
-- Table and indexes are all created with IF NOT EXISTS so the script can be
-- re-run against a database that already contains them.
-- -----------------------------------------------------------------------------
CREATE TABLE IF NOT EXISTS text_quality (
    quality_id SERIAL PRIMARY KEY,
    document_id VARCHAR(100) REFERENCES documents(document_id) ON DELETE CASCADE,
    file_id INTEGER REFERENCES files(file_id),  -- No ON DELETE action declared
    quality_grade VARCHAR(20),              -- excellent, good, fair, poor, very_poor
    gibberish_ratio FLOAT,                  -- 0.0 to 1.0
    avg_word_length FLOAT,
    punctuation_ratio FLOAT,
    sentence_quality_score FLOAT,
    do_not_process BOOLEAN DEFAULT FALSE,   -- Flag for unfixable documents
    process_notes TEXT,                      -- Notes about quality issues
    assessed_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);

CREATE INDEX IF NOT EXISTS idx_quality_document ON text_quality(document_id);
CREATE INDEX IF NOT EXISTS idx_quality_grade ON text_quality(quality_grade);

-- -----------------------------------------------------------------------------
-- Change Log Table
-- Audit trail for document changes.
-- document_id and file_id are deliberately plain columns (no foreign keys), so
-- log rows are not constrained by, or removed with, the rows they describe.
-- Table and indexes are all created with IF NOT EXISTS so the script can be
-- re-run against a database that already contains them.
-- -----------------------------------------------------------------------------
CREATE TABLE IF NOT EXISTS change_log (
    log_id SERIAL PRIMARY KEY,
    document_id VARCHAR(100),
    file_id INTEGER,
    action VARCHAR(50) NOT NULL,            -- create, update, delete, reprocess
    old_value JSONB,
    new_value JSONB,
    changed_by VARCHAR(100) DEFAULT 'system',
    changed_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);

CREATE INDEX IF NOT EXISTS idx_changelog_document ON change_log(document_id);
CREATE INDEX IF NOT EXISTS idx_changelog_action ON change_log(action);
CREATE INDEX IF NOT EXISTS idx_changelog_date ON change_log(changed_at);

-- =============================================================================
-- SEARCH AND ANALYTICS TABLES
-- =============================================================================

-- -----------------------------------------------------------------------------
-- Search History Table
-- Query analytics and performance tracking.
-- Table and indexes are all created with IF NOT EXISTS so the script can be
-- re-run against a database that already contains them.
-- -----------------------------------------------------------------------------
CREATE TABLE IF NOT EXISTS search_history (
    search_id SERIAL PRIMARY KEY,
    query_text TEXT NOT NULL,
    search_type VARCHAR(20),                -- semantic, keyword, hybrid
    filters_applied JSONB,
    results_count INTEGER,
    response_time_ms INTEGER,               -- Milliseconds, per the column name
    user_session VARCHAR(100),
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);

CREATE INDEX IF NOT EXISTS idx_search_type ON search_history(search_type);
CREATE INDEX IF NOT EXISTS idx_search_date ON search_history(created_at);

-- =============================================================================
-- BOOK COMPILATION TABLES
-- =============================================================================

-- -----------------------------------------------------------------------------
-- Book Projects Table
-- One row per book being compiled; new projects start as 'draft' and target
-- the md, docx and epub output formats unless told otherwise.
-- -----------------------------------------------------------------------------
CREATE TABLE IF NOT EXISTS book_projects (
    project_id     SERIAL,
    project_name   VARCHAR(255) NOT NULL,
    description    TEXT,
    status         VARCHAR(50) DEFAULT 'draft',     -- draft, in_progress, review, published
    output_formats TEXT[] DEFAULT ARRAY['md', 'docx', 'epub'],
    created_at     TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    updated_at     TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    PRIMARY KEY (project_id)
);

-- -----------------------------------------------------------------------------
-- Book Chapters Table
-- Individual chapters within book projects.
-- Chapters are removed automatically when their project is deleted.
-- Table and indexes are all created with IF NOT EXISTS so the script can be
-- re-run against a database that already contains them.
-- -----------------------------------------------------------------------------
CREATE TABLE IF NOT EXISTS book_chapters (
    chapter_id SERIAL PRIMARY KEY,
    project_id INTEGER REFERENCES book_projects(project_id) ON DELETE CASCADE,
    chapter_number INTEGER NOT NULL,
    title VARCHAR(255),
    -- No ON DELETE action: a source document cannot be deleted while a chapter references it
    source_document_id VARCHAR(100) REFERENCES documents(document_id),
    source_file_path VARCHAR(1000),
    content_markdown TEXT,
    word_count INTEGER DEFAULT 0,
    status VARCHAR(50) DEFAULT 'draft',     -- draft, written, edited, final
    sort_order INTEGER,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);

CREATE INDEX IF NOT EXISTS idx_chapters_project ON book_chapters(project_id);
-- Supports reading a project's chapters in sort_order
CREATE INDEX IF NOT EXISTS idx_chapters_order ON book_chapters(project_id, sort_order);

-- =============================================================================
-- HELPER VIEWS
-- =============================================================================

-- View: one row per document, enriched with the author's display name.
-- Documents without an author are kept; author_name is NULL for them.
CREATE OR REPLACE VIEW v_documents_full AS
SELECT
    doc.document_id,
    doc.title,
    doc.subtitle,
    auth.name AS author_name,
    doc.publication_year,
    doc.language_code,
    doc.word_count,
    doc.processing_status,
    doc.created_at
FROM documents AS doc
LEFT JOIN authors AS auth
    ON auth.author_id = doc.author_id;

-- View: processing-queue rollup, one row per (process_type, status) pair,
-- giving the number of queue entries and their mean attempt count.
CREATE OR REPLACE VIEW v_queue_status AS
SELECT
    pq.process_type,
    pq.status,
    COUNT(*) AS count,
    AVG(pq.attempts) AS avg_attempts
FROM processing_queue AS pq
GROUP BY
    pq.process_type,
    pq.status
ORDER BY
    pq.process_type,
    pq.status;

-- View: Document statistics (always exactly one row)
--   total_documents     - number of rows in documents
--   total_chunks        - number of rows in chunks
--   total_authors       - distinct authors referenced by at least one document
--   total_words         - sum of documents.word_count (0 when there are no documents)
--   processed_documents - documents whose processing_status is 'completed'
--   pending_documents   - documents whose processing_status is 'pending'
--
-- The aggregates run over documents alone. Joining chunks into the same query
-- repeats each document once per chunk, which inflates SUM(word_count); the
-- chunk total is therefore taken from its own scalar subquery. Every chunk
-- must reference a document (NOT NULL foreign key), so counting chunks
-- directly is equivalent to counting them through the join.
CREATE OR REPLACE VIEW v_document_stats AS
SELECT
    COUNT(*) AS total_documents,
    (SELECT COUNT(*) FROM chunks) AS total_chunks,
    COUNT(DISTINCT d.author_id) AS total_authors,
    COALESCE(SUM(d.word_count), 0) AS total_words,
    COUNT(*) FILTER (WHERE d.processing_status = 'completed') AS processed_documents,
    COUNT(*) FILTER (WHERE d.processing_status = 'pending') AS pending_documents
FROM documents AS d;

-- =============================================================================
-- HELPER FUNCTIONS
-- =============================================================================

-- Trigger function: stamps the row being written with CURRENT_TIMESTAMP in its
-- updated_at column and hands the modified row back to the trigger machinery.
-- Intended for BEFORE row-level triggers on tables that have an updated_at column.
CREATE OR REPLACE FUNCTION update_updated_at()
RETURNS TRIGGER
LANGUAGE plpgsql
AS $body$
BEGIN
    NEW.updated_at := CURRENT_TIMESTAMP;
    RETURN NEW;
END;
$body$;

-- Apply the updated_at trigger to the tables below, so every UPDATE refreshes
-- the row's updated_at column.
-- CREATE OR REPLACE TRIGGER (available from PostgreSQL 14; this schema requires
-- 16+) keeps the statements re-runnable: a plain CREATE TRIGGER raises an error
-- when the trigger already exists.
CREATE OR REPLACE TRIGGER tr_documents_updated_at BEFORE UPDATE ON documents
    FOR EACH ROW EXECUTE FUNCTION update_updated_at();

CREATE OR REPLACE TRIGGER tr_authors_updated_at BEFORE UPDATE ON authors
    FOR EACH ROW EXECUTE FUNCTION update_updated_at();

CREATE OR REPLACE TRIGGER tr_book_projects_updated_at BEFORE UPDATE ON book_projects
    FOR EACH ROW EXECUTE FUNCTION update_updated_at();

CREATE OR REPLACE TRIGGER tr_book_chapters_updated_at BEFORE UPDATE ON book_chapters
    FOR EACH ROW EXECUTE FUNCTION update_updated_at();

-- Function: Generate document ID
-- Builds an identifier of the form PREFIX_NNN_LANG, e.g. ('project', 1) -> 'PROJECT_001_EN'.
--   p_prefix   - project prefix; upper-cased
--   p_sequence - sequence number; zero-padded to at least three characters
--   p_language - language suffix; upper-cased, defaults to 'EN'
-- Returns NULL if any argument is NULL (string concatenation with NULL yields NULL).
--
-- LPAD truncates input longer than the requested length, so a fixed width of 3
-- would turn 1000 into '100' and collide with sequence 100. The pad width
-- therefore grows with the number so longer sequences are kept intact.
-- The result depends only on the arguments, hence IMMUTABLE.
CREATE OR REPLACE FUNCTION generate_document_id(p_prefix VARCHAR, p_sequence INTEGER, p_language VARCHAR DEFAULT 'EN')
RETURNS VARCHAR AS $$
BEGIN
    RETURN UPPER(p_prefix)
        || '_'
        || LPAD(p_sequence::TEXT, GREATEST(3, LENGTH(p_sequence::TEXT)), '0')
        || '_'
        || UPPER(p_language);
END;
$$ LANGUAGE plpgsql IMMUTABLE;

-- Function: Get chunk context (previous and next chunks)
-- Returns the requested chunk together with its neighbours from the same
-- document, ordered by chunk_sequence.
--   p_chunk_id     - chunk to centre the window on
--   p_context_size - how many sequence numbers to include on each side (default 1)
-- Result columns:
--   chunk_id, chunk_sequence, chunk_text - copied from chunks
--   position - 'before', 'current' or 'after', relative to the requested chunk
-- Returns no rows when p_chunk_id does not exist.
--
-- Implementation notes:
--   * "position" is quoted in the signature because POSITION is an SQL key word
--     that is not accepted unquoted as a parameter/output column name; the
--     resulting column is still called position.
--   * The CASE expression is of type text, while the declared column is
--     VARCHAR. RETURN QUERY requires the column types to match, so the
--     expression is cast explicitly.
--   * The function only reads data, hence STABLE.
CREATE OR REPLACE FUNCTION get_chunk_context(p_chunk_id VARCHAR, p_context_size INTEGER DEFAULT 1)
RETURNS TABLE (
    chunk_id VARCHAR,
    chunk_sequence INTEGER,
    chunk_text TEXT,
    "position" VARCHAR
) AS $$
DECLARE
    v_document_id VARCHAR;
    v_sequence INTEGER;
BEGIN
    -- Get current chunk info
    SELECT c.document_id, c.chunk_sequence INTO v_document_id, v_sequence
    FROM chunks c WHERE c.chunk_id = p_chunk_id;

    -- Unknown chunk: return an empty result explicitly
    IF NOT FOUND THEN
        RETURN;
    END IF;

    -- Return context chunks; columns are table-qualified so they cannot be
    -- confused with the output columns of the same name
    RETURN QUERY
    SELECT
        c.chunk_id,
        c.chunk_sequence,
        c.chunk_text,
        (CASE
            WHEN c.chunk_sequence < v_sequence THEN 'before'
            WHEN c.chunk_sequence > v_sequence THEN 'after'
            ELSE 'current'
        END)::VARCHAR AS "position"
    FROM chunks c
    WHERE c.document_id = v_document_id
      AND c.chunk_sequence BETWEEN (v_sequence - p_context_size) AND (v_sequence + p_context_size)
    ORDER BY c.chunk_sequence;
END;
$$ LANGUAGE plpgsql STABLE;

-- =============================================================================
-- INITIAL DATA: Default Categories and Topics
-- =============================================================================

-- Insert default categories (customize per project).
-- categories.name is UNIQUE, so ON CONFLICT (name) DO NOTHING skips any name
-- that already exists: re-running the script neither duplicates nor
-- overwrites existing category rows.
INSERT INTO categories (name, description, sort_order) VALUES
    ('Primary Sources', 'Original texts and documents', 1),
    ('Secondary Sources', 'Scholarly analysis and commentary', 2),
    ('Reference', 'Encyclopedias, dictionaries, guides', 3),
    ('Historical', 'Historical documents and records', 4),
    ('Contemporary', 'Modern writings and analysis', 5)
ON CONFLICT (name) DO NOTHING;

-- Insert default topics (customize per project).
-- topics.name is UNIQUE, so ON CONFLICT (name) DO NOTHING skips any name that
-- already exists: re-running the script neither duplicates nor overwrites
-- existing topic rows. The keywords column is left NULL for these rows.
INSERT INTO topics (name, description) VALUES
    ('History', 'Historical events, periods, and figures'),
    ('Philosophy', 'Philosophical concepts and schools of thought'),
    ('Religion', 'Religious texts, practices, and beliefs'),
    ('Science', 'Scientific discoveries and methodologies'),
    ('Art', 'Artistic works, movements, and techniques'),
    ('Literature', 'Literary works and analysis'),
    ('Biography', 'Life stories and personal accounts'),
    ('Methodology', 'Research methods and approaches')
ON CONFLICT (name) DO NOTHING;

-- =============================================================================
-- GRANTS (adjust user as needed)
-- =============================================================================
-- GRANT ALL PRIVILEGES ON ALL TABLES IN SCHEMA public TO research_user;
-- GRANT ALL PRIVILEGES ON ALL SEQUENCES IN SCHEMA public TO research_user;
-- GRANT EXECUTE ON ALL FUNCTIONS IN SCHEMA public TO research_user;

-- Attach a description to the public schema; this replaces any comment
-- previously stored on that schema.
COMMENT ON SCHEMA public IS 'Research Development Framework - Document Management and Semantic Search System';
