-- =============================================================================
-- RESEARCH DEVELOPMENT FRAMEWORK v3.0 - Schema Enhancements
-- =============================================================================
-- Based on comprehensive analysis recommendations:
-- 1. Page number mapping for PDF citations
-- 2. Footnote/endnote preservation
-- 3. Knowledge Graph (entity relationships)
-- 4. Image/diagram descriptions
-- 5. Citation metadata (BibTeX)
-- =============================================================================

-- Add the citation-support columns to chunks, skipping any that already exist.
-- ADD COLUMN IF NOT EXISTS tests the very table this ALTER resolves through
-- search_path. An information_schema lookup on the bare table name would also
-- match a same-named table in another schema and wrongly skip the column.
ALTER TABLE chunks
    -- ==========================================================================
    -- 1. PAGE NUMBER MAPPING
    -- ==========================================================================
    -- Page range covered by the chunk, for precise citations.
    ADD COLUMN IF NOT EXISTS page_start INTEGER,
    ADD COLUMN IF NOT EXISTS page_end INTEGER,
    -- Source page info for citations.
    -- Structure: {"page": 45, "paragraph": 3, "section": "Chapter 2"}
    ADD COLUMN IF NOT EXISTS source_location JSONB DEFAULT '{}',

    -- ==========================================================================
    -- 2. FOOTNOTE/ENDNOTE PRESERVATION
    -- ==========================================================================
    ADD COLUMN IF NOT EXISTS has_footnotes BOOLEAN DEFAULT FALSE,
    -- Structure: [{"marker": "1", "text": "See Steiner, GA 4, p.23", "inline": true}]
    ADD COLUMN IF NOT EXISTS footnote_refs JSONB DEFAULT '[]';

-- =============================================================================
-- 3. KNOWLEDGE GRAPH TABLES
-- =============================================================================

-- Lookup table of entity categories used by the knowledge graph.
CREATE TABLE IF NOT EXISTS entity_types (
    type_id     SERIAL PRIMARY KEY,
    name        VARCHAR(50) NOT NULL UNIQUE,
    description TEXT,
    -- Hex color for visualization
    color       VARCHAR(7) DEFAULT '#808080',
    created_at  TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);

-- Seed the default categories. Rows whose name is already present are left
-- untouched, so re-running the script does not duplicate or overwrite them.
INSERT INTO entity_types
    (name,           description,                               color)
VALUES
    ('person',       'Historical or contemporary person',       '#4CAF50'),
    ('concept',      'Abstract idea or philosophical concept',  '#2196F3'),
    ('work',         'Book, lecture series, or published work', '#FF9800'),
    ('organization', 'Institution, society, or group',          '#9C27B0'),
    ('place',        'Geographic location',                     '#795548'),
    ('event',        'Historical event or occurrence',          '#F44336'),
    ('term',         'Technical term or definition',            '#00BCD4')
ON CONFLICT (name) DO NOTHING;

-- Named entities extracted from documents (nodes of the knowledge graph).
-- NOTE(review): the vector type and the gin_trgm_ops operator class are not
-- created anywhere in this script; presumably the pgvector and pg_trgm
-- extensions are installed by the base schema. Confirm before running.
CREATE TABLE IF NOT EXISTS entities (
    entity_id       SERIAL PRIMARY KEY,
    name            VARCHAR(255) NOT NULL,
    name_normalized VARCHAR(255),
    type_id         INTEGER REFERENCES entity_types(type_id),
    description     TEXT,
    aliases         TEXT[] DEFAULT '{}',
    -- e.g. {"wikidata": "Q12345", "ga": "GA 4"}
    external_ids    JSONB DEFAULT '{}',
    metadata        JSONB DEFAULT '{}',
    -- For entity similarity
    embedding       vector(1536),
    created_at      TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    updated_at      TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    -- One row per (normalized name, type). Both columns are nullable, and
    -- rows holding a NULL in either column are never treated as duplicates.
    UNIQUE(name_normalized, type_id)
);

-- Lookup indexes: exact name, normalized name, and type.
CREATE INDEX IF NOT EXISTS idx_entities_name
    ON entities (name);
CREATE INDEX IF NOT EXISTS idx_entities_normalized
    ON entities (name_normalized);
CREATE INDEX IF NOT EXISTS idx_entities_type
    ON entities (type_id);
-- Trigram GIN index on name.
CREATE INDEX IF NOT EXISTS idx_entities_name_trgm
    ON entities USING gin (name gin_trgm_ops);

-- Lookup table of edge labels used by the knowledge graph.
CREATE TABLE IF NOT EXISTS relationship_types (
    rel_type_id  SERIAL PRIMARY KEY,
    name         VARCHAR(50) NOT NULL UNIQUE,
    -- Label for the same edge read in the opposite direction,
    -- e.g. "influenced" -> "influenced by"
    inverse_name VARCHAR(50),
    description  TEXT,
    is_symmetric BOOLEAN DEFAULT FALSE,
    created_at   TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);

-- Seed the default relationship types. Rows whose name is already present are
-- left untouched, so the script can be re-run. The two symmetric types use the
-- same label in both directions.
INSERT INTO relationship_types
    (name,          inverse_name,      description,                        is_symmetric)
VALUES
    ('influenced',  'influenced_by',   'One entity influenced another',    FALSE),
    ('contradicts', 'contradicted_by', 'Concepts that oppose each other',  FALSE),
    ('develops',    'developed_from',  'Concept B builds upon Concept A',  FALSE),
    ('authored',    'authored_by',     'Person wrote a work',              FALSE),
    ('mentions',    'mentioned_in',    'Work references an entity',        FALSE),
    ('related_to',  'related_to',      'General semantic relationship',    TRUE),
    ('precedes',    'follows',         'Temporal or logical ordering',     FALSE),
    ('part_of',     'contains',        'Hierarchical containment',         FALSE),
    ('synonym_of',  'synonym_of',      'Same meaning, different term',     TRUE),
    ('example_of',  'has_example',     'Instance of a concept',            FALSE)
ON CONFLICT (name) DO NOTHING;

-- Knowledge graph edges (relationships between entities).
-- Each row is a (source entity, relationship type, target entity) triple and
-- at most one row may exist per triple.
CREATE TABLE IF NOT EXISTS entity_relationships (
    rel_id SERIAL PRIMARY KEY,
    -- Deleting either endpoint entity removes the edge.
    source_entity_id INTEGER NOT NULL REFERENCES entities(entity_id) ON DELETE CASCADE,
    target_entity_id INTEGER NOT NULL REFERENCES entities(entity_id) ON DELETE CASCADE,
    rel_type_id INTEGER NOT NULL REFERENCES relationship_types(rel_type_id),
    confidence FLOAT DEFAULT 1.0,      -- 0.0 to 1.0
    evidence_text TEXT,                -- Quote supporting this relationship
    -- Provenance only: both columns are nullable, so when the originating
    -- chunk or document is deleted the edge is kept and the pointer cleared.
    -- Without an ON DELETE action these FKs would block every such delete,
    -- while entity_mentions cascades on the same parent tables.
    source_chunk_id VARCHAR(100) REFERENCES chunks(chunk_id) ON DELETE SET NULL,
    source_document_id VARCHAR(100) REFERENCES documents(document_id) ON DELETE SET NULL,
    extraction_method VARCHAR(50),     -- 'llm', 'pattern', 'manual'
    metadata JSONB DEFAULT '{}',
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    UNIQUE(source_entity_id, target_entity_id, rel_type_id)
);

-- Indexes for walking edges from either endpoint, by type, and by source document.
CREATE INDEX IF NOT EXISTS idx_rel_source ON entity_relationships(source_entity_id);
CREATE INDEX IF NOT EXISTS idx_rel_target ON entity_relationships(target_entity_id);
CREATE INDEX IF NOT EXISTS idx_rel_type ON entity_relationships(rel_type_id);
CREATE INDEX IF NOT EXISTS idx_rel_document ON entity_relationships(source_document_id);

-- Entity mentions in chunks (for highlighting).
-- A mention row is removed automatically when its entity, chunk, or document
-- is deleted (all three foreign keys cascade).
CREATE TABLE IF NOT EXISTS entity_mentions (
    mention_id     SERIAL PRIMARY KEY,
    entity_id      INTEGER NOT NULL
                   REFERENCES entities(entity_id) ON DELETE CASCADE,
    chunk_id       VARCHAR(100) NOT NULL
                   REFERENCES chunks(chunk_id) ON DELETE CASCADE,
    document_id    VARCHAR(100) NOT NULL
                   REFERENCES documents(document_id) ON DELETE CASCADE,
    -- Actual text that matched
    mention_text   VARCHAR(500),
    -- Position in chunk
    char_start     INTEGER,
    char_end       INTEGER,
    -- ~50 chars before
    context_before TEXT,
    -- ~50 chars after
    context_after  TEXT,
    confidence     FLOAT DEFAULT 1.0,
    created_at     TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);

-- One lookup index per foreign key.
CREATE INDEX IF NOT EXISTS idx_mentions_entity
    ON entity_mentions (entity_id);
CREATE INDEX IF NOT EXISTS idx_mentions_chunk
    ON entity_mentions (chunk_id);
CREATE INDEX IF NOT EXISTS idx_mentions_document
    ON entity_mentions (document_id);

-- =============================================================================
-- 4. IMAGE/DIAGRAM DESCRIPTIONS
-- =============================================================================

-- Images extracted from documents, with their descriptions and search
-- embedding. Rows are removed when the owning document is deleted.
CREATE TABLE IF NOT EXISTS document_images (
    image_id          SERIAL PRIMARY KEY,
    document_id       VARCHAR(100) NOT NULL
                      REFERENCES documents(document_id) ON DELETE CASCADE,
    page_number       INTEGER,
    -- Order on page
    image_index       INTEGER,
    -- 'diagram', 'photo', 'chart', 'table', 'illustration'
    image_type        VARCHAR(50),
    -- Path to extracted image file
    image_path        VARCHAR(1000),
    -- For deduplication
    image_hash        VARCHAR(64),
    width_px          INTEGER,
    height_px         INTEGER,
    -- AI-generated description
    description       TEXT,
    -- Model used for description
    description_model VARCHAR(100),
    -- Short accessibility text
    alt_text          VARCHAR(500),
    -- Any text extracted from image
    ocr_text          TEXT,
    -- Embedding of description for search
    embedding         vector(1536),
    -- Original caption if found
    caption           TEXT,
    -- Text in document referencing this image
    references_text   TEXT,
    metadata          JSONB DEFAULT '{}',
    created_at        TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);

-- Lookups by document, by image type, and by page within a document.
CREATE INDEX IF NOT EXISTS idx_images_document
    ON document_images (document_id);
CREATE INDEX IF NOT EXISTS idx_images_type
    ON document_images (image_type);
CREATE INDEX IF NOT EXISTS idx_images_page
    ON document_images (document_id, page_number);

-- =============================================================================
-- 5. CITATION METADATA (BibTeX Compatible)
-- =============================================================================

-- Add citation fields to documents, skipping any that already exist.
-- ADD COLUMN IF NOT EXISTS tests the very table this ALTER resolves through
-- search_path. An information_schema lookup on the bare table name would also
-- match a same-named table in another schema and wrongly skip the column.
ALTER TABLE documents
    ADD COLUMN IF NOT EXISTS bibtex_key VARCHAR(100),
    -- book, article, inproceedings, misc, phdthesis, etc.
    ADD COLUMN IF NOT EXISTS bibtex_type VARCHAR(50) DEFAULT 'book',
    -- Full BibTeX fields: {"doi": "...", "isbn": "...", "journal": "...", "volume": "...", etc.}
    ADD COLUMN IF NOT EXISTS citation_data JSONB DEFAULT '{}',
    ADD COLUMN IF NOT EXISTS translator VARCHAR(255),
    ADD COLUMN IF NOT EXISTS original_title VARCHAR(500),
    ADD COLUMN IF NOT EXISTS original_language VARCHAR(10);

-- Lookup of documents by citation key.
CREATE INDEX IF NOT EXISTS idx_documents_bibtex ON documents(bibtex_key);

-- =============================================================================
-- HELPER VIEWS
-- =============================================================================

-- View for knowledge graph traversal: one row per relationship edge, with the
-- names and type names of both endpoints resolved.
-- The entity_types joins are inner joins, so an edge whose source or target
-- entity has a NULL type_id does not appear. The source document is optional.
CREATE OR REPLACE VIEW knowledge_graph AS
SELECT
    edge.rel_id,
    src.name               AS source_name,
    src.entity_id          AS source_id,
    src_type.name          AS source_type,
    rel_type.name          AS relationship,
    rel_type.inverse_name  AS inverse_relationship,
    tgt.name               AS target_name,
    tgt.entity_id          AS target_id,
    tgt_type.name          AS target_type,
    edge.confidence,
    edge.evidence_text,
    edge.source_document_id,
    doc.title              AS document_title
FROM entity_relationships AS edge
INNER JOIN entities AS src
    ON edge.source_entity_id = src.entity_id
INNER JOIN entities AS tgt
    ON edge.target_entity_id = tgt.entity_id
INNER JOIN relationship_types AS rel_type
    ON edge.rel_type_id = rel_type.rel_type_id
INNER JOIN entity_types AS src_type
    ON src.type_id = src_type.type_id
INNER JOIN entity_types AS tgt_type
    ON tgt.type_id = tgt_type.type_id
LEFT JOIN documents AS doc
    ON edge.source_document_id = doc.document_id;

-- View for chunk citations with page numbers.
-- One row per chunk, joined to its document, with a ready-made citation_text:
--   "<title>, p. N"      when the chunk sits on one page, or only page_start is known
--   "<title>, pp. N-M"   when it spans several pages
--   "<title>"            when page_start is unknown
CREATE OR REPLACE VIEW chunk_citations AS
SELECT
    c.chunk_id,
    c.document_id,
    d.title,
    d.bibtex_key,
    -- Author's display name, not the numeric author_id. Cast to text so the
    -- column keeps the type it had when it was built from author_id::text.
    COALESCE(a.name::text, 'Unknown') AS author,
    d.publication_year,
    c.page_start,
    c.page_end,
    c.source_location,
    CASE
        -- A missing page_end is treated as a single-page chunk instead of
        -- discarding the page reference altogether.
        WHEN c.page_start IS NOT NULL
             AND (c.page_end IS NULL OR c.page_end = c.page_start)
        THEN format('%s, p. %s', d.title, c.page_start)
        WHEN c.page_start IS NOT NULL
        THEN format('%s, pp. %s-%s', d.title, c.page_start, c.page_end)
        ELSE d.title
    END AS citation_text
FROM chunks AS c
INNER JOIN documents AS d
    ON c.document_id = d.document_id
-- LEFT JOIN keeps documents that have no author row.
LEFT JOIN authors AS a
    ON d.author_id = a.author_id;

-- View for entity co-occurrence (entities appearing in same document).
-- Each unordered entity pair is reported once per document, with the lower
-- entity_id as entity1. cooccurrence_count counts mention pairs, i.e. every
-- mention of entity1 in the document paired with every mention of entity2.
CREATE OR REPLACE VIEW entity_cooccurrence AS
SELECT
    first_m.entity_id   AS entity1_id,
    first_e.name        AS entity1_name,
    second_m.entity_id  AS entity2_id,
    second_e.name       AS entity2_name,
    first_m.document_id,
    COUNT(*)            AS cooccurrence_count
FROM entity_mentions AS first_m
INNER JOIN entity_mentions AS second_m
    ON first_m.document_id = second_m.document_id
    -- Strict inequality drops self-pairs and mirrored duplicates.
    AND first_m.entity_id < second_m.entity_id
INNER JOIN entities AS first_e
    ON first_m.entity_id = first_e.entity_id
INNER JOIN entities AS second_e
    ON second_m.entity_id = second_e.entity_id
GROUP BY
    first_m.entity_id,
    first_e.name,
    second_m.entity_id,
    second_e.name,
    first_m.document_id;

-- =============================================================================
-- HELPER FUNCTIONS
-- =============================================================================

-- Function to format a citation for one document in various styles.
--
-- Parameters:
--   doc_id  documents.document_id of the document to cite.
--   style   'chicago' (default), 'apa' or 'mla', matched case-insensitively.
--           Any other value, or NULL, yields "Author - Title (Year)".
-- Returns:
--   The citation text, or NULL when no document has the given id.
-- A document without an author row is cited as 'Unknown', a NULL publisher is
-- left out, and a NULL publication year is rendered as "n.d.".
CREATE OR REPLACE FUNCTION format_citation(
    doc_id VARCHAR(100),
    style VARCHAR(20) DEFAULT 'chicago'
) RETURNS TEXT AS $$
DECLARE
    doc RECORD;
    year_text TEXT;
BEGIN
    -- Fetch only the columns the formats below use.
    SELECT
        d.title,
        d.publisher,
        d.publication_year,
        COALESCE(a.name, 'Unknown') AS author_name
    INTO doc
    FROM documents AS d
    LEFT JOIN authors AS a
        ON d.author_id = a.author_id
    -- Qualified with the function name so the parameter can never be confused
    -- with a table column of the same name.
    WHERE d.document_id = format_citation.doc_id;

    IF NOT FOUND THEN
        RETURN NULL;
    END IF;

    year_text := COALESCE(doc.publication_year::text, 'n.d.');

    CASE lower(style)
        WHEN 'chicago' THEN
            -- The template supplies the closing period, so the one on "n.d."
            -- is stripped to avoid ending the citation with "n.d..".
            RETURN format('%s. %s. %s%s.',
                doc.author_name,
                doc.title,
                COALESCE(doc.publisher || ', ', ''),
                rtrim(year_text, '.')
            );
        WHEN 'apa' THEN
            RETURN format('%s (%s). %s.%s',
                doc.author_name,
                year_text,
                doc.title,
                COALESCE(' ' || doc.publisher || '.', '')
            );
        WHEN 'mla' THEN
            -- Same closing-period handling as the Chicago branch.
            RETURN format('%s. %s.%s %s.',
                doc.author_name,
                doc.title,
                COALESCE(' ' || doc.publisher || ',', ''),
                rtrim(year_text, '.')
            );
        ELSE
            RETURN format('%s - %s (%s)',
                doc.author_name,
                doc.title,
                year_text
            );
    END CASE;
END;
-- STABLE: the function only reads from documents and authors.
$$ LANGUAGE plpgsql STABLE;

-- Function to get related entities (graph traversal).
--
-- Starting from p_entity_id, follows entity_relationships edges in both
-- directions, up to p_max_depth hops, using only edges whose confidence is at
-- least p_min_confidence. An entity is never revisited within one path.
--
-- Returns one row per distinct path to a reached entity:
--   entity_id, name, entity_type  the reached entity and its type name
--   relationship                  label of the last hop, read from the previous
--                                 entity on the path towards this one
--   depth                         number of hops from the start entity
--   path                          entity ids from the start entity to this one
-- The start entity itself is not returned. Entities with a NULL type_id are
-- neither returned nor traversed (inner join on entity_types).
CREATE OR REPLACE FUNCTION get_related_entities(
    p_entity_id INTEGER,
    p_max_depth INTEGER DEFAULT 2,
    p_min_confidence FLOAT DEFAULT 0.5
) RETURNS TABLE (
    entity_id INTEGER,
    name VARCHAR(255),
    entity_type VARCHAR(50),
    relationship VARCHAR(50),
    depth INTEGER,
    path INTEGER[]
) AS $$
WITH RECURSIVE entity_graph AS (
    -- Base case: the starting entity
    SELECT
        e.entity_id,
        e.name,
        et.name AS entity_type,
        NULL::VARCHAR(50) AS relationship,
        0 AS depth,
        ARRAY[e.entity_id] AS path
    FROM entities AS e
    INNER JOIN entity_types AS et
        ON e.type_id = et.type_id
    WHERE e.entity_id = p_entity_id

    UNION ALL

    -- Recursive case: follow relationships out of the current entity,
    -- whichever end of the edge it sits on.
    SELECT
        e.entity_id,
        e.name,
        et.name AS entity_type,
        -- An edge walked from target to source is labelled with the inverse
        -- name, so the label always reads "previous entity -> this entity".
        -- Falls back to the forward name when no inverse is defined. The cast
        -- keeps the column type identical to the base case.
        (CASE
            WHEN er.source_entity_id = eg.entity_id THEN rt.name
            ELSE COALESCE(rt.inverse_name, rt.name)
        END)::VARCHAR(50) AS relationship,
        eg.depth + 1,
        eg.path || e.entity_id
    FROM entity_graph AS eg
    INNER JOIN entity_relationships AS er
        ON (er.source_entity_id = eg.entity_id OR er.target_entity_id = eg.entity_id)
    -- The neighbour is the endpoint of the edge that is not the current entity.
    INNER JOIN entities AS e
        ON e.entity_id = CASE
            WHEN er.source_entity_id = eg.entity_id THEN er.target_entity_id
            ELSE er.source_entity_id
        END
    INNER JOIN entity_types AS et
        ON e.type_id = et.type_id
    INNER JOIN relationship_types AS rt
        ON er.rel_type_id = rt.rel_type_id
    WHERE eg.depth < p_max_depth
      AND er.confidence >= p_min_confidence
      AND NOT (e.entity_id = ANY(eg.path))  -- Prevent cycles
)
SELECT DISTINCT entity_id, name, entity_type, relationship, depth, path
FROM entity_graph
WHERE entity_id <> p_entity_id
ORDER BY depth, name;
-- STABLE: the function only reads from the knowledge graph tables.
$$ LANGUAGE sql STABLE;

-- =============================================================================
-- COMMENTS
-- =============================================================================

-- Catalog descriptions for the tables and columns introduced by this script.
COMMENT ON TABLE entities IS 'Named entities (people, concepts, works) for knowledge graph';
COMMENT ON TABLE entity_relationships IS 'Relationships between entities (subject-predicate-object triples)';
COMMENT ON TABLE entity_mentions IS 'Locations where entities are mentioned in text';
COMMENT ON TABLE document_images IS 'Images/diagrams extracted from documents with AI descriptions';
COMMENT ON COLUMN chunks.page_start IS 'Starting page number in source PDF';
COMMENT ON COLUMN chunks.page_end IS 'Ending page number in source PDF';
COMMENT ON COLUMN chunks.source_location IS 'Detailed source location (page, paragraph, section)';
COMMENT ON COLUMN chunks.footnote_refs IS 'Footnotes/endnotes linked to this chunk';
COMMENT ON COLUMN documents.bibtex_key IS 'BibTeX citation key (e.g., steiner1894philosophy)';
COMMENT ON COLUMN documents.citation_data IS 'Full BibTeX metadata as JSON';

-- Functions are addressed by full signature: a bare name is rejected when the
-- function is overloaded, and by PostgreSQL releases before 10.
COMMENT ON FUNCTION format_citation(VARCHAR, VARCHAR) IS 'Generate formatted citation in Chicago, APA, or MLA style';
COMMENT ON FUNCTION get_related_entities(INTEGER, INTEGER, FLOAT) IS 'Traverse knowledge graph to find related entities';
