#!/usr/bin/env python3
"""
Auto-Glossary Generator for Research Development Framework

Uses AI (OpenAI or Ollama) to automatically generate domain-specific glossaries
from your library content. The generated glossary can then be used with
extract_concepts.py for weighted concept extraction.

Usage:
    # Generate glossary from library samples
    python auto_glossary.py --theme "Anthroposophy" --output anthro_glossary.txt

    # Use specific category filter
    python auto_glossary.py --theme "Alchemy" --category "Esotericism" -o alchemy.txt

    # Use more samples for better coverage
    python auto_glossary.py --theme "Philosophy" --samples 100 -o philosophy.txt

    # Use local Ollama instead of OpenAI
    python auto_glossary.py --theme "Mysticism" --provider ollama -o mysticism.txt
"""

import sys
import os
import argparse
import logging
import json
import random
from typing import List, Dict, Optional

# Make the directory one level above this script's directory importable, so
# that the `pipeline` package imported below can be resolved.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
# Also switch the working directory to that same directory.
# NOTE: because of this, relative paths used later in this script (for
# example the --output file) are resolved against that directory, not the
# directory the command was launched from.
# NOTE(review): presumably done so project code can locate files relative to
# the project root -- confirm before changing.
os.chdir(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

# Project imports; these must come after the sys.path change above.
from pipeline.db_utils import execute_query
from pipeline.config import (
    OPENAI_API_KEY,
    LOCAL_LLM_MODEL,
    INTELLIGENCE_MODE
)

# Setup logging: INFO level with a compact "LEVEL: message" format.
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
logger = logging.getLogger(__name__)


def get_random_chunks(limit: int = 50, category: str = None) -> List[Dict]:
    """
    Fetch a random selection of chunk rows to use as sampling context.

    Every row carries the chunk text together with the title and primary
    category of the document the chunk belongs to. The random ordering is
    performed by the database (``ORDER BY RANDOM()``).

    Args:
        limit: Maximum number of rows to request.
        category: When truthy, restrict sampling to documents whose
            ``primary_category`` matches ``%category%`` via ``ILIKE``.

    Returns:
        The result of ``execute_query(..., fetch='all')`` for the query.
    """
    # Choose the statement and its bound parameters first, then run a single
    # query call; values are always passed as parameters, never interpolated.
    if not category:
        sql = """
            SELECT c.chunk_text, d.title, d.primary_category
            FROM chunks c
            JOIN documents d ON c.document_id = d.document_id
            ORDER BY RANDOM()
            LIMIT %s
        """
        params = (limit,)
    else:
        sql = """
            SELECT c.chunk_text, d.title, d.primary_category
            FROM chunks c
            JOIN documents d ON c.document_id = d.document_id
            WHERE d.primary_category ILIKE %s
            ORDER BY RANDOM()
            LIMIT %s
        """
        params = (f"%{category}%", limit)

    return execute_query(sql, params, fetch='all')


def generate_glossary_prompt(theme: str, text_samples: str, num_terms: int = 25) -> str:
    """
    Build the user prompt that asks the LLM for a weighted glossary.

    Args:
        theme: Research domain; inserted into the role line and the task line.
        text_samples: Library excerpts, placed between two ``---`` rules.
        num_terms: How many terms the model is asked to return.

    Returns:
        The complete prompt text.
    """
    # Part 1: everything that depends on the arguments.
    request = f"""You are an expert researcher specializing in {theme}.

Analyze the following text excerpts from a research library:

---
{text_samples}
---

Task: Identify the top {num_terms} most significant technical terms, proper nouns,
or specialized concepts related specifically to '{theme}' found in or relevant to this text.
"""

    # Part 2: fixed weighting rubric and output-format rules.
    rubric = """
For each term, assign a 'weight' from 1.5 to 3.0 based on its importance to the field:
- 3.0 = Core foundational concept (e.g., "Anthroposophy" for Steiner studies)
- 2.5 = Important technical term (e.g., "Etheric Body", "Astral Body")
- 2.0 = Significant concept (e.g., "Initiation", "Higher Worlds")
- 1.5 = Relevant term (e.g., "Meditation", "Consciousness")

Output ONLY in this exact format, one term per line:
Term:Weight

Example output:
Etheric Body:2.5
Goethean Science:2.0
Spiritual Hierarchies:2.5

Important:
- Include multi-word terms where appropriate (e.g., "Sensory-free Thinking")
- Include proper nouns specific to the domain (e.g., "Ahriman", "Lucifer")
- Focus on terms that distinguish this field from general knowledge
- Do not include common words or generic academic terms
- Output ONLY the term:weight pairs, no explanations or numbering"""

    return request + rubric


def call_openai(system_prompt: str, user_prompt: str, model: str = "gpt-4o-mini") -> str:
    """
    Request a chat completion from OpenAI and return the reply text.

    Args:
        system_prompt: Content of the ``system`` message.
        user_prompt: Content of the ``user`` message.
        model: Name of the chat model to use.

    Returns:
        The text of the first choice, or an empty string when the API
        returned a message without text content.

    Raises:
        ImportError: If the ``openai`` package is not installed.
        Exception: Any error raised while creating the client or making the
            request is logged and re-raised unchanged.
    """
    # Handle the missing-package case on its own so that the install hint is
    # only ever shown for a genuine import failure of the SDK.
    try:
        import openai
    except ImportError:
        logger.error("OpenAI library not installed. Run: pip install openai")
        raise

    try:
        client = openai.OpenAI(api_key=OPENAI_API_KEY)

        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            temperature=0.3,
            max_tokens=1500
        )

        # The message content may be None; normalise to "" so the declared
        # ``str`` return type holds and callers can safely call str methods.
        return response.choices[0].message.content or ""
    except Exception as e:
        logger.error(f"OpenAI API error: {e}")
        raise


def call_ollama(system_prompt: str, user_prompt: str, model: str = None) -> str:
    """
    Send a non-streaming generate request to a local Ollama server.

    The system and user prompts are joined with a blank line into a single
    prompt string, since this endpoint is called with one ``prompt`` field.

    Args:
        system_prompt: Instruction text placed first in the prompt.
        user_prompt: Task text placed after the system prompt.
        model: Model name; falls back to ``LOCAL_LLM_MODEL`` and then
            ``"llama3"`` when not given.

    Returns:
        The ``response`` field of the JSON reply, or ``""`` if it is absent.

    Raises:
        requests.exceptions.ConnectionError: If the server cannot be reached.
        Exception: Any other failure (HTTP error status, timeout, bad JSON)
            is logged and re-raised unchanged.
    """
    import requests

    payload = {
        "model": model or LOCAL_LLM_MODEL or "llama3",
        "prompt": f"{system_prompt}\n\n{user_prompt}",
        "stream": False,
        "options": {
            "temperature": 0.3,
            "num_predict": 1500
        }
    }

    try:
        reply = requests.post(
            "http://localhost:11434/api/generate",
            json=payload,
            timeout=120
        )
        # Turn 4xx/5xx statuses into exceptions before reading the body.
        reply.raise_for_status()
        body = reply.json()
        return body.get("response", "")
    except requests.exceptions.ConnectionError:
        logger.error("Cannot connect to Ollama. Is it running? Try: ollama serve")
        raise
    except Exception as e:
        logger.error(f"Ollama API error: {e}")
        raise


def parse_glossary_response(response: str) -> List[tuple]:
    """
    Parse an LLM reply into glossary entries.

    Each useful line is expected to look like ``Term:Weight``. The parser is
    tolerant of common formatting noise in model output:

    * blank lines and lines starting with ``#`` are skipped;
    * a leading list marker (``1.``, ``2)``, ``-``, ``*``, ``•``) is removed,
      but digits that belong to the term itself (``12 Senses``, ``4th Way``)
      are kept;
    * markdown emphasis characters (``*`` and backticks) around the term or
      the weight are removed;
    * the line is split on its LAST colon, so terms may contain colons.

    Weights are clamped to the documented 1.5-3.0 range. A weight that is
    missing, not a number, or not finite (``nan``/``inf``) becomes 1.5.
    Terms shorter than two characters are dropped.

    Args:
        response: Raw text returned by the model. ``None`` or an empty
            string yields an empty list.

    Returns:
        List of ``(term, weight)`` tuples in the order they appeared.
    """
    # Local imports: the module does not import these at file level.
    import math
    import re

    min_weight, max_weight, default_weight = 1.5, 3.0, 1.5

    # A list marker is either "<digits>." / "<digits>)" NOT followed by
    # another digit (so a term such as "1.5 Degrees" is left alone), or a
    # single bullet character. Only one marker is removed per line.
    list_marker = re.compile(r'^(?:\d+[.)]\s*(?=\D)|[-*\u2022]\s*)')

    if not response:
        return []

    entries = []

    for raw_line in response.splitlines():
        line = raw_line.strip()

        # Skip empty lines and comments
        if not line or line.startswith('#'):
            continue

        line = list_marker.sub('', line, count=1)

        # Split on the last colon; lines without one carry no entry.
        term, colon, raw_weight = line.rpartition(':')
        if not colon:
            continue

        term = term.strip().strip('*`').strip()
        raw_weight = raw_weight.strip().strip('*`').strip()

        try:
            weight = float(raw_weight)
        except ValueError:
            weight = default_weight
        else:
            if math.isfinite(weight):
                weight = max(min_weight, min(max_weight, weight))
            else:
                # float() accepts "nan"/"inf"; treat them as unparseable.
                weight = default_weight

        if len(term) > 1:
            entries.append((term, weight))

    return entries


def generate_glossary(
    theme: str,
    output_file: str,
    provider: str = "auto",
    category: Optional[str] = None,
    num_samples: int = 50,
    num_terms: int = 25,
    model: Optional[str] = None
) -> int:
    """
    Generate a weighted glossary using AI and write it to a file.

    Steps: sample chunks from the library, build a prompt from a bounded
    number of excerpts, call the selected provider, parse the reply into
    ``(term, weight)`` pairs, sort them by weight (highest first) and write
    them to ``output_file`` below a commented header.

    Args:
        theme: The research theme/domain
        output_file: Path to output glossary file (overwritten if present)
        provider: "openai", "ollama", or "auto". "auto" selects OpenAI only
            when INTELLIGENCE_MODE is "cloud" and an API key is configured,
            otherwise Ollama. Any value other than "openai" is routed to
            Ollama.
        category: Optional category filter for sampling
        num_samples: Number of text chunks to sample
        num_terms: Number of glossary terms to request from the model
        model: Optional specific model to use

    Returns:
        Number of terms written, or 0 when no usable sample text was found
        or no entries could be parsed from the reply.

    Raises:
        Exception: Provider and file-system errors propagate to the caller.
    """
    # Prompt budget: at most this many excerpts, each cut to this many
    # characters, with a hard cap on the combined excerpt text.
    max_prompt_chunks = 30
    max_chunk_chars = 500
    max_context = 8000

    # Determine provider
    if provider == "auto":
        if INTELLIGENCE_MODE == "cloud" and OPENAI_API_KEY:
            provider = "openai"
        else:
            provider = "ollama"

    logger.info(f"Using provider: {provider}")

    # Get sample text from library
    logger.info(f"Sampling {num_samples} chunks from library...")
    chunks = get_random_chunks(limit=num_samples, category=category)

    if not chunks:
        logger.error("No chunks found in library. Have you run ingest and chunk?")
        return 0

    # Build the excerpts. A column that is present but NULL comes back as
    # None, which a plain .get() default does not cover, so fall back
    # explicitly; chunks without any text are skipped rather than crashing
    # on the slice or wasting prompt space.
    excerpts = []
    for c in chunks:
        chunk_text = c.get('chunk_text') or ''
        if not chunk_text.strip():
            continue
        title = c.get('title') or 'Unknown'
        excerpts.append(f"[From: {title}]\n{chunk_text[:max_chunk_chars]}")
        if len(excerpts) >= max_prompt_chunks:
            break

    if not excerpts:
        logger.error("Sampled chunks contained no text to analyze.")
        return 0

    text_samples = "\n\n---\n\n".join(excerpts)

    # Truncate if too long
    if len(text_samples) > max_context:
        text_samples = text_samples[:max_context] + "\n\n[... truncated ...]"

    logger.info(f"Generating glossary for theme: {theme}")

    # Generate prompt
    system_prompt = "You are a research terminologist specializing in esoteric and philosophical traditions."
    user_prompt = generate_glossary_prompt(theme, text_samples, num_terms)

    # Call appropriate provider
    if provider == "openai":
        response = call_openai(system_prompt, user_prompt, model or "gpt-4o-mini")
    else:
        response = call_ollama(system_prompt, user_prompt, model)

    # Parse response; guard against a provider handing back None.
    entries = parse_glossary_response(response or "")

    if not entries:
        logger.error("Failed to parse any glossary entries from response")
        logger.debug(f"Raw response: {response}")
        return 0

    # Sort by weight (highest first); the sort is stable, so terms with equal
    # weight keep the order in which the model listed them.
    entries.sort(key=lambda x: x[1], reverse=True)

    # Write output file
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(f"# Auto-generated glossary for: {theme}\n")
        f.write(f"# Generated by auto_glossary.py using {provider}\n")
        f.write(f"# Samples: {len(chunks)}, Terms: {len(entries)}\n")
        if category:
            f.write(f"# Category filter: {category}\n")
        f.write("#\n")
        f.write("# Format: Term:Weight (1.5-3.0)\n")
        f.write("# 3.0 = Core concept, 2.5 = Important, 2.0 = Significant, 1.5 = Relevant\n")
        f.write("#\n\n")

        f.writelines(f"{term}:{weight}\n" for term, weight in entries)

    logger.info(f"Generated {len(entries)} glossary terms -> {output_file}")
    return len(entries)


def main():
    """
    Command-line entry point.

    Parses the arguments, validates the numeric options, runs
    ``generate_glossary`` and prints a short summary with next steps.

    Exit status: 0 on success, 1 when generation fails or raises, and 2 (via
    ``parser.error``) when the command-line arguments are invalid.
    """
    parser = argparse.ArgumentParser(
        description='Auto-generate weighted glossaries using AI',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Generate Anthroposophy glossary
  python auto_glossary.py --theme "Anthroposophy and Rudolf Steiner" -o anthro.txt

  # Generate from specific category
  python auto_glossary.py --theme "Alchemy" --category "Esotericism" -o alchemy.txt

  # Use Ollama instead of OpenAI
  python auto_glossary.py --theme "Mysticism" --provider ollama -o mysticism.txt

  # More samples for better coverage
  python auto_glossary.py --theme "Freemasonry" --samples 100 --terms 40 -o masonic.txt

After generating, use with extract_concepts.py:
  python extract_concepts.py --glossary anthro.txt
"""
    )

    parser.add_argument(
        '--theme', '-t',
        required=True,
        help='The research theme/domain for glossary generation'
    )
    parser.add_argument(
        '--output', '-o',
        default='glossary.txt',
        help='Output file path (default: glossary.txt)'
    )
    parser.add_argument(
        '--provider', '-p',
        choices=['auto', 'openai', 'ollama'],
        default='auto',
        help='AI provider to use (default: auto-detect)'
    )
    parser.add_argument(
        '--category', '-c',
        help='Filter library samples by category'
    )
    parser.add_argument(
        '--samples', '-s',
        type=int,
        default=50,
        help='Number of text chunks to sample (default: 50)'
    )
    parser.add_argument(
        '--terms', '-n',
        type=int,
        default=25,
        help='Number of glossary terms to generate (default: 25)'
    )
    parser.add_argument(
        '--model', '-m',
        help='Specific model to use (e.g., gpt-4o, llama3)'
    )

    args = parser.parse_args()

    # Reject non-positive counts here: otherwise they flow into the SQL LIMIT
    # and the prompt and surface later as unrelated-looking failures.
    if args.samples < 1:
        parser.error("--samples must be a positive integer")
    if args.terms < 1:
        parser.error("--terms must be a positive integer")

    try:
        count = generate_glossary(
            theme=args.theme,
            output_file=args.output,
            provider=args.provider,
            category=args.category,
            num_samples=args.samples,
            num_terms=args.terms,
            model=args.model
        )

        if count > 0:
            print(f"\nSuccess! Generated {count} terms.")
            print(f"Output: {args.output}")
            print("\nNext steps:")
            print(f"  1. Review and edit {args.output} as needed")
            print(f"  2. Run: python extract_concepts.py --glossary {args.output}")
            print(f"  3. Or import to database: python extract_concepts.py --import-glossary {args.output}")
        else:
            print("\nFailed to generate glossary. Check logs for details.")
            sys.exit(1)

    except Exception as e:
        # Top-level boundary: report the error and exit non-zero instead of
        # showing a traceback. SystemExit is not an Exception subclass, so
        # the sys.exit(1) above passes through untouched.
        logger.error(f"Error: {e}")
        sys.exit(1)


# Run the command-line interface only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    main()
