#!/usr/bin/env python3
"""
Entity Extractors for GraphRAG Enhancement

Provides pluggable entity extraction backends:
1. GLiNER (default) - Fast, lightweight NER model that runs on CPU (entities only)
2. LLaMA (optional) - Local LLM for semantic extraction (requires GPU/resources)
3. OpenAI (optional) - Cloud LLM for highest-quality relationship extraction
4. Hybrid (recommended) - GLiNER for entities + OpenAI/LLaMA for relationships

Extractor Comparison:
┌──────────────┬─────────────────────┬─────────────────────┬─────────────────────┐
│ Feature      │ GLiNER              │ LLaMA (Local)       │ OpenAI (Cloud)      │
├──────────────┼─────────────────────┼─────────────────────┼─────────────────────┤
│ Strength     │ Fast entity finding │ Privacy, free       │ Best relation qual. │
│ Weakness     │ No relationships    │ JSON output issues  │ Cost per token      │
│ Best For     │ Node identification │ Offline extraction  │ High-fidelity graph │
│ Requirements │ pip install gliner  │ GPU + GGUF model    │ OPENAI_API_KEY      │
└──────────────┴─────────────────────┴─────────────────────┴─────────────────────┘

Recommended Workflow (Hybrid):
    1. Use GLiNER to rapidly identify entities (cheap/fast)
    2. Use OpenAI only to determine relationships between those entities (high quality)

Usage:
    # Use GLiNER for entities only
    extractor = get_entity_extractor('gliner')
    entities = extractor.extract_entities(text)

    # Use OpenAI for high-quality relationship extraction
    extractor = get_entity_extractor('openai')
    triples = extractor.extract_triples(text)

    # Hybrid: GLiNER entities + OpenAI relations (recommended)
    extractor = get_entity_extractor('hybrid', relation_extractor='openai')
    result = extractor.extract(text, extract_relations=True)

    # Hybrid: GLiNER entities + LLaMA relations (privacy-focused)
    extractor = get_entity_extractor('hybrid', relation_extractor='llama')
"""

import logging
import json
import re
from abc import ABC, abstractmethod
from dataclasses import dataclass, field, asdict
from typing import List, Dict, Any, Optional, Tuple, Set
from pathlib import Path

logger = logging.getLogger(__name__)

# =============================================================================
# DATA STRUCTURES
# =============================================================================

@dataclass
class ExtractedEntity:
    """A single entity mention produced by an extractor.

    When ``normalized`` is left empty at construction time it is filled in
    with the whitespace-stripped ``text``.
    """
    text: str                    # Surface form exactly as it was found
    label: str                   # Entity category (Person, Concept, Work, etc.)
    start: int = 0               # Start character position
    end: int = 0                 # End character position
    score: float = 1.0           # Extractor confidence (0-1)
    normalized: str = ""         # Canonical form; derived from text if empty

    def __post_init__(self):
        # An empty canonical form means "not supplied": fall back to the
        # trimmed surface text.
        self.normalized = self.normalized or self.text.strip()

    def to_dict(self) -> Dict[str, Any]:
        """Return all fields of this entity as a plain dictionary."""
        as_mapping = asdict(self)
        return as_mapping


@dataclass
class ExtractedTriple:
    """One Subject-Predicate-Object relationship pulled out of a text."""
    subject: str
    predicate: str               # Kind of relationship linking the two ends
    object: str
    subject_type: str = ""       # Entity type of the subject, if known
    object_type: str = ""        # Entity type of the object, if known
    confidence: float = 1.0
    source_text: str = ""        # Text the triple was extracted from

    def to_dict(self) -> Dict[str, Any]:
        """Return all fields of this triple as a plain dictionary."""
        as_mapping = asdict(self)
        return as_mapping


@dataclass
class ExtractionResult:
    """Everything a single extraction pass produced for one input text."""
    entities: List[ExtractedEntity] = field(default_factory=list)
    triples: List[ExtractedTriple] = field(default_factory=list)
    raw_text: str = ""
    extractor_type: str = ""
    metadata: Dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the result to a dictionary.

        Entities and triples are converted through their own ``to_dict``
        methods. ``raw_text`` is not part of the output.
        """
        serialized_entities = [entity.to_dict() for entity in self.entities]
        serialized_triples = [triple.to_dict() for triple in self.triples]
        return dict(
            entities=serialized_entities,
            triples=serialized_triples,
            extractor_type=self.extractor_type,
            metadata=self.metadata,
        )


# =============================================================================
# BASE EXTRACTOR
# =============================================================================

class BaseEntityExtractor(ABC):
    """Abstract base class for entity extractors.

    Subclasses must implement ``initialize`` and ``extract_entities``. They may
    override ``extract_triples`` (which returns no triples here) and
    ``is_available`` (which reports ``False`` here).
    """

    name: str = "base"

    # Default entity types for research domain
    DEFAULT_ENTITY_TYPES = [
        "Person",           # Authors, historical figures
        "Concept",          # Abstract ideas, theories
        "Work",             # Books, articles, lectures
        "Organization",     # Schools, societies, institutions
        "Location",         # Places mentioned
        "Event",            # Historical events, conferences
        "Term",             # Technical/domain-specific terms
    ]

    def __init__(self, entity_types: Optional[List[str]] = None):
        """
        Initialize extractor.

        Args:
            entity_types: Custom entity types to extract. When omitted or
                empty, ``DEFAULT_ENTITY_TYPES`` is used.
        """
        # Take a private copy: aliasing the class-level default (or the
        # caller's list) would let a mutation on one instance silently change
        # the entity types of every other extractor.
        self.entity_types = list(entity_types or self.DEFAULT_ENTITY_TYPES)
        self._initialized = False

    @abstractmethod
    def initialize(self) -> bool:
        """Initialize the model. Returns True if successful."""
        pass

    @abstractmethod
    def extract_entities(
        self,
        text: str,
        entity_types: Optional[List[str]] = None
    ) -> List[ExtractedEntity]:
        """Extract named entities from text."""
        pass

    def extract_triples(self, text: str) -> List[ExtractedTriple]:
        """Extract relationship triples. Default implementation returns empty list."""
        return []

    def extract(
        self,
        text: str,
        entity_types: Optional[List[str]] = None,
        extract_relations: bool = False
    ) -> ExtractionResult:
        """
        Full extraction pipeline.

        Args:
            text: Input text
            entity_types: Entity types to extract (uses defaults if None)
            extract_relations: Whether to extract relationship triples

        Returns:
            ExtractionResult with entities and optionally triples. If the
            extractor cannot be initialized, an empty result carrying only
            ``raw_text`` and ``extractor_type`` is returned.
        """
        if not self._initialized and not self.initialize():
            logger.error("%s extractor failed to initialize", self.name)
            return ExtractionResult(raw_text=text, extractor_type=self.name)

        types_to_use = entity_types or self.entity_types
        entities = self.extract_entities(text, types_to_use)

        triples = []
        if extract_relations:
            triples = self.extract_triples(text)

        return ExtractionResult(
            entities=entities,
            triples=triples,
            raw_text=text,
            extractor_type=self.name,
            # Copy so that editing a result's metadata cannot modify the
            # extractor's own entity type list.
            metadata={'entity_types': list(types_to_use)}
        )

    @staticmethod
    def is_available() -> bool:
        """Check if this extractor's dependencies are available."""
        return False


# =============================================================================
# GLINER EXTRACTOR (Default)
# =============================================================================

class GLiNERExtractor(BaseEntityExtractor):
    """
    GLiNER-based entity extractor.

    GLiNER (Generalist Model for Named Entity Recognition) is a compact model
    that supports zero-shot NER - it can extract any entity type without
    fine-tuning on that specific type.

    Benefits:
    - Runs efficiently on CPU
    - Small model size (~500MB)
    - Zero-shot capability for custom entity types
    - Fast inference

    Installation:
        pip install gliner
    """

    name = "gliner"

    # Model options (smallest to largest)
    MODEL_OPTIONS = {
        'small': 'urchade/gliner_small-v2.1',      # ~125MB, fastest
        'medium': 'urchade/gliner_medium-v2.1',    # ~250MB, balanced
        'large': 'urchade/gliner_large-v2.1',      # ~500MB, most accurate
        'multi': 'urchade/gliner_multi-v2.1',      # Multilingual support
    }

    def __init__(
        self,
        entity_types: Optional[List[str]] = None,
        model_size: str = 'medium',
        threshold: float = 0.4,
        flat_ner: bool = True
    ):
        """
        Initialize GLiNER extractor.

        Args:
            entity_types: Entity types to extract
            model_size: One of 'small', 'medium', 'large', 'multi'. Any other
                value falls back to the 'medium' model (a warning is logged).
            threshold: Confidence threshold (0-1)
            flat_ner: If True, don't allow nested entities
        """
        super().__init__(entity_types)
        if model_size not in self.MODEL_OPTIONS:
            # Make the fallback visible instead of silently loading a
            # different model than the caller asked for.
            logger.warning(
                "Unknown GLiNER model size %r; falling back to 'medium'",
                model_size,
            )
        self.model_size = model_size
        self.model_name = self.MODEL_OPTIONS.get(model_size, self.MODEL_OPTIONS['medium'])
        self.threshold = threshold
        self.flat_ner = flat_ner
        self.model = None

    @staticmethod
    def is_available() -> bool:
        """Check if GLiNER is installed."""
        try:
            import gliner  # noqa: F401 - imported only to probe availability
            return True
        except ImportError:
            return False

    def initialize(self) -> bool:
        """Load the GLiNER model.

        Returns:
            True if the model is (or already was) loaded, False if the
            ``gliner`` package is missing or loading raised an error.
        """
        if self._initialized and self.model is not None:
            return True

        try:
            from gliner import GLiNER

            logger.info("Loading GLiNER model: %s", self.model_name)
            self.model = GLiNER.from_pretrained(self.model_name)
            self._initialized = True
            logger.info("GLiNER model loaded successfully")
            return True

        except ImportError:
            logger.error("GLiNER not installed. Install with: pip install gliner")
            return False
        except Exception as e:
            logger.error(f"Failed to load GLiNER model: {e}")
            return False

    def extract_entities(
        self,
        text: str,
        entity_types: Optional[List[str]] = None
    ) -> List[ExtractedEntity]:
        """
        Extract entities using GLiNER.

        Args:
            text: Input text
            entity_types: Entity labels to detect

        Returns:
            List of ExtractedEntity objects. Empty when the text is blank,
            the model cannot be loaded, or prediction fails.
        """
        # Nothing can be found in blank input; skip loading/calling the model.
        if not text or not text.strip():
            return []

        if not self._initialized and not self.initialize():
            return []

        types_to_use = entity_types or self.entity_types

        try:
            # GLiNER returns list of dicts with 'text', 'label', 'start', 'end', 'score'
            raw_entities = self.model.predict_entities(
                text,
                types_to_use,
                threshold=self.threshold,
                flat_ner=self.flat_ner
            )

            return [
                ExtractedEntity(
                    text=ent.get('text', ''),
                    label=ent.get('label', 'Unknown'),
                    start=ent.get('start', 0),
                    end=ent.get('end', 0),
                    score=ent.get('score', 0.0)
                )
                for ent in raw_entities
            ]

        except Exception as e:
            # Best-effort: a failed prediction yields no entities, not a crash.
            logger.error(f"GLiNER extraction failed: {e}")
            return []


# =============================================================================
# LLAMA EXTRACTOR (Optional - for users with resources)
# =============================================================================

class LLaMAExtractor(BaseEntityExtractor):
    """
    LLaMA-based entity and relationship extractor.

    Uses a local LLaMA model for richer semantic extraction including
    Subject-Predicate-Object triples. Requires more system resources
    but provides deeper understanding.

    Benefits:
    - Extracts relationships, not just entities
    - Better contextual understanding
    - Can extract complex, domain-specific patterns

    Requirements:
    - GPU with 8GB+ VRAM (or CPU with 16GB+ RAM for smaller models)
    - llama-cpp-python or transformers library

    Installation:
        pip install llama-cpp-python
        # Download a GGUF model file
    """

    name = "llama"

    # Portion of the context budget held back for the prompt template and the
    # generated answer when truncating input text.
    _PROMPT_RESERVE = 500
    # Floor for the truncation limit, so a small n_ctx can never produce a
    # zero or negative slice bound.
    _MIN_TEXT_CHARS = 200

    # Default prompts for extraction
    ENTITY_PROMPT = """Extract named entities from the following text.
For each entity, identify its type from this list: {entity_types}

Text: {text}

Return a JSON array of objects with 'text', 'label', and 'confidence' fields.
Only return the JSON array, no other text."""

    TRIPLE_PROMPT = """Extract relationship triples (Subject-Predicate-Object) from this text.
Focus on meaningful relationships between concepts, people, works, and ideas.

Text: {text}

Return a JSON array of objects with these fields:
- subject: the subject entity
- predicate: the relationship (e.g., "wrote", "influenced", "is_part_of", "developed")
- object: the object entity
- confidence: how confident you are (0-1)

Only return the JSON array, no other text."""

    def __init__(
        self,
        entity_types: Optional[List[str]] = None,
        model_path: Optional[str] = None,
        n_ctx: int = 2048,
        n_gpu_layers: int = -1,  # -1 = all layers on GPU
        temperature: float = 0.1,
        use_transformers: bool = False
    ):
        """
        Initialize LLaMA extractor.

        Args:
            entity_types: Entity types to extract
            model_path: Path to GGUF model file (for llama-cpp-python)
            n_ctx: Context window size
            n_gpu_layers: Number of layers to offload to GPU (-1 for all)
            temperature: Generation temperature (lower = more deterministic)
            use_transformers: Use HuggingFace transformers instead of llama-cpp
        """
        super().__init__(entity_types)
        self.model_path = model_path
        self.n_ctx = n_ctx
        self.n_gpu_layers = n_gpu_layers
        self.temperature = temperature
        self.use_transformers = use_transformers
        self.model = None
        self.tokenizer = None

    @staticmethod
    def is_available() -> bool:
        """Check if LLaMA dependencies are available."""
        try:
            import llama_cpp  # noqa: F401 - imported only to probe availability
            return True
        except ImportError:
            pass

        try:
            import transformers  # noqa: F401 - imported only to probe availability
            return True
        except ImportError:
            pass

        return False

    def _find_model_path(self) -> Optional[str]:
        """Try to find a model file in common locations.

        Returns:
            ``self.model_path`` if it exists on disk, otherwise the first
            ``.gguf`` file (in sorted order) found under one of the search
            directories, or None when nothing is found.
        """
        if self.model_path:
            if Path(self.model_path).exists():
                return self.model_path
            # Make it visible that a different model than requested may load.
            logger.warning(
                "Configured model_path %s does not exist; searching default locations",
                self.model_path,
            )

        # Common model locations
        search_paths = [
            Path.home() / '.cache' / 'llama' / 'models',
            Path.home() / 'models',
            Path('/models'),
            Path('./models'),
        ]

        # Look for GGUF files
        for search_path in search_paths:
            if search_path.exists():
                # Sorted so that the chosen file does not depend on
                # directory enumeration order.
                gguf_files = sorted(search_path.glob('**/*.gguf'))
                if gguf_files:
                    logger.info(f"Found model: {gguf_files[0]}")
                    return str(gguf_files[0])

        return None

    def initialize(self) -> bool:
        """Load the LLaMA model through the configured backend."""
        if self._initialized and self.model is not None:
            return True

        if self.use_transformers:
            return self._initialize_transformers()
        return self._initialize_llama_cpp()

    def _initialize_llama_cpp(self) -> bool:
        """Initialize using llama-cpp-python."""
        try:
            from llama_cpp import Llama

            model_path = self._find_model_path()
            if not model_path:
                logger.error("No LLaMA model found. Set model_path or place .gguf file in ~/models/")
                return False

            logger.info(f"Loading LLaMA model from: {model_path}")
            self.model = Llama(
                model_path=model_path,
                n_ctx=self.n_ctx,
                n_gpu_layers=self.n_gpu_layers,
                verbose=False
            )
            self._initialized = True
            logger.info("LLaMA model loaded successfully")
            return True

        except ImportError:
            logger.error("llama-cpp-python not installed. Install with: pip install llama-cpp-python")
            return False
        except Exception as e:
            logger.error(f"Failed to load LLaMA model: {e}")
            return False

    def _initialize_transformers(self) -> bool:
        """Initialize using HuggingFace transformers."""
        try:
            from transformers import AutoModelForCausalLM, AutoTokenizer
            import torch

            model_name = self.model_path or "meta-llama/Llama-2-7b-chat-hf"

            logger.info(f"Loading transformers model: {model_name}")
            self.tokenizer = AutoTokenizer.from_pretrained(model_name)
            self.model = AutoModelForCausalLM.from_pretrained(
                model_name,
                torch_dtype=torch.float16,
                device_map="auto"
            )
            self._initialized = True
            logger.info("Transformers model loaded successfully")
            return True

        except ImportError:
            logger.error("transformers not installed. Install with: pip install transformers torch")
            return False
        except Exception as e:
            logger.error(f"Failed to load transformers model: {e}")
            return False

    def _generate(self, prompt: str, max_tokens: int = 1024) -> str:
        """Generate text from prompt using the configured backend."""
        if self.use_transformers:
            return self._generate_transformers(prompt, max_tokens)
        return self._generate_llama_cpp(prompt, max_tokens)

    def _generate_llama_cpp(self, prompt: str, max_tokens: int) -> str:
        """Generate using llama-cpp-python."""
        response = self.model(
            prompt,
            max_tokens=max_tokens,
            temperature=self.temperature,
            stop=["```", "\n\n\n"]
        )
        return response['choices'][0]['text'].strip()

    def _generate_transformers(self, prompt: str, max_tokens: int) -> str:
        """Generate using transformers and return only the newly generated text."""
        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)

        # Sampling with a non-positive temperature is rejected by generate();
        # fall back to greedy decoding in that case.
        sampling = self.temperature > 0
        generation_kwargs = {
            'max_new_tokens': max_tokens,
            'do_sample': sampling,
            'pad_token_id': self.tokenizer.eos_token_id,
        }
        if sampling:
            generation_kwargs['temperature'] = self.temperature

        outputs = self.model.generate(**inputs, **generation_kwargs)

        # For causal LMs the returned sequence starts with the prompt tokens.
        # Drop them so the caller parses the answer, not the echoed prompt.
        prompt_length = inputs['input_ids'].shape[-1]
        generated_tokens = outputs[0][prompt_length:]
        return self.tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

    @staticmethod
    def _as_item_list(parsed: Any) -> List[Dict]:
        """Normalize decoded JSON to a list of candidate items.

        A list is returned unchanged. For a dict, the first list-valued entry
        is returned if there is one, otherwise the dict itself is wrapped as a
        single item. Any other JSON value yields an empty list.
        """
        if isinstance(parsed, list):
            return parsed
        if isinstance(parsed, dict):
            for value in parsed.values():
                if isinstance(value, list):
                    return value
            return [parsed]
        return []

    @staticmethod
    def _to_confidence(value: Any, default: float) -> float:
        """Convert a model-supplied confidence to float, or return ``default``."""
        try:
            return float(value)
        except (TypeError, ValueError):
            return default

    def _truncate_for_context(self, text: str) -> str:
        """Cut ``text`` so prompt and response still fit, appending '...' if cut."""
        # Heuristic budget: leave room for prompt and response. The floor keeps
        # the bound positive when n_ctx is smaller than the reserve.
        max_text_len = max(self.n_ctx - self._PROMPT_RESERVE, self._MIN_TEXT_CHARS)
        if len(text) > max_text_len:
            return text[:max_text_len] + "..."
        return text

    def _parse_json_response(self, response: str) -> List[Dict]:
        """Parse JSON from LLM response.

        Tries the whole response first, then the outermost ``[...]`` span.

        Returns:
            A list of items (always a list), empty if nothing could be parsed.
        """
        try:
            # Direct parse
            return self._as_item_list(json.loads(response))
        except json.JSONDecodeError:
            pass

        # Try to extract JSON from response
        json_match = re.search(r'\[[\s\S]*\]', response)
        if json_match:
            try:
                return self._as_item_list(json.loads(json_match.group()))
            except json.JSONDecodeError:
                pass

        logger.warning(f"Could not parse JSON from response: {response[:200]}...")
        return []

    def extract_entities(
        self,
        text: str,
        entity_types: Optional[List[str]] = None
    ) -> List[ExtractedEntity]:
        """Extract entities using LLaMA.

        Args:
            text: Input text (truncated to fit the context budget)
            entity_types: Entity labels to ask for (instance defaults if None)

        Returns:
            List of ExtractedEntity objects. Empty when the text is blank, the
            model cannot be loaded, or generation fails.
        """
        # Blank input cannot contain entities; avoid a pointless generation.
        if not text or not text.strip():
            return []

        if not self._initialized and not self.initialize():
            return []

        types_to_use = entity_types or self.entity_types
        text = self._truncate_for_context(text)

        prompt = self.ENTITY_PROMPT.format(
            entity_types=", ".join(types_to_use),
            text=text
        )

        try:
            response = self._generate(prompt)
            parsed = self._parse_json_response(response)
        except Exception as e:
            logger.error(f"LLaMA entity extraction failed: {e}")
            return []

        entities = []
        for item in parsed:
            if not isinstance(item, dict):
                continue
            entity_text = item.get('text')
            # Skip malformed items individually instead of losing the batch.
            if not isinstance(entity_text, str) or not entity_text.strip():
                continue
            raw_score = item.get('confidence', item.get('score', 0.8))
            entities.append(ExtractedEntity(
                text=entity_text,
                label=str(item.get('label') or 'Unknown'),
                score=self._to_confidence(raw_score, 0.8)
            ))

        return entities

    def extract_triples(self, text: str) -> List[ExtractedTriple]:
        """Extract relationship triples using LLaMA.

        Args:
            text: Input text (truncated to fit the context budget)

        Returns:
            List of ExtractedTriple objects. Empty when the text is blank, the
            model cannot be loaded, or generation fails.
        """
        if not text or not text.strip():
            return []

        if not self._initialized and not self.initialize():
            return []

        text = self._truncate_for_context(text)
        prompt = self.TRIPLE_PROMPT.format(text=text)

        try:
            response = self._generate(prompt)
            parsed = self._parse_json_response(response)
        except Exception as e:
            logger.error(f"LLaMA triple extraction failed: {e}")
            return []

        required_keys = ('subject', 'predicate', 'object')
        triples = []
        for item in parsed:
            if not isinstance(item, dict):
                continue
            # All three parts must be non-blank strings; anything else is
            # malformed model output and is dropped on its own.
            if not all(
                isinstance(item.get(key), str) and item[key].strip()
                for key in required_keys
            ):
                continue
            triples.append(ExtractedTriple(
                subject=item['subject'],
                predicate=item['predicate'],
                object=item['object'],
                confidence=self._to_confidence(item.get('confidence', 0.8), 0.8),
                source_text=text[:200]
            ))

        return triples


# =============================================================================
# OPENAI EXTRACTOR (Cloud - Highest Quality)
# =============================================================================

class OpenAIExtractor(BaseEntityExtractor):
    """
    OpenAI-based entity and relationship extractor.

    Uses GPT-4o/GPT-4o-mini for highest-quality semantic triple extraction.
    This is the "gold standard" for relationship extraction accuracy.

    Benefits:
    - Excellent at following JSON output format
    - Superior understanding of complex relationships
    - Zero-shot capability for any domain
    - Reliable structured output

    Costs:
    - Requires OPENAI_API_KEY environment variable
    - Per-token pricing (~$0.01-0.03 per 1K tokens)

    Installation:
        pip install openai
        export OPENAI_API_KEY="sk-..."
    """

    name = "openai"

    # Upper bound on entity names sent in one relation prompt.
    _MAX_ENTITIES_PER_PROMPT = 20

    # Optimized prompts for relationship extraction
    RELATION_EXTRACTION_PROMPT = """You are an expert knowledge graph builder. Extract semantic relationships from the text.

Given entities: {entities}

Text:
{text}

Extract relationships between the given entities. Return ONLY a valid JSON array.
Each relationship should have:
- "subject": the source entity (must be from the given entities list)
- "predicate": the relationship type (use: influences, supports, opposes, part_of, derived_from, created, developed, wrote, taught, related_to)
- "object": the target entity (must be from the given entities list)
- "confidence": your confidence 0.0-1.0

Example output:
[
  {{"subject": "Rudolf Steiner", "predicate": "developed", "object": "Anthroposophy", "confidence": 0.95}},
  {{"subject": "Etheric Body", "predicate": "supports", "object": "Physical Body", "confidence": 0.85}}
]

If no relationships exist between the entities, return: []

JSON array:"""

    ENTITY_EXTRACTION_PROMPT = """Extract named entities from this text. Return ONLY a valid JSON array.

Entity types to find: {entity_types}

Text:
{text}

For each entity, return:
- "text": the entity as it appears
- "label": the entity type from the list above
- "confidence": your confidence 0.0-1.0

Example:
[
  {{"text": "Rudolf Steiner", "label": "Person", "confidence": 0.98}},
  {{"text": "Anthroposophy", "label": "Concept", "confidence": 0.95}}
]

JSON array:"""

    def __init__(
        self,
        entity_types: Optional[List[str]] = None,
        model: str = 'gpt-4o-mini',
        temperature: float = 0.1,
        max_tokens: int = 1000,
        api_key: Optional[str] = None
    ):
        """
        Initialize OpenAI extractor.

        Args:
            entity_types: Entity types to extract
            model: OpenAI model ('gpt-4o', 'gpt-4o-mini', 'gpt-4-turbo')
            temperature: Generation temperature (lower = more deterministic)
            max_tokens: Maximum tokens for response
            api_key: OpenAI API key (or set OPENAI_API_KEY env var)
        """
        super().__init__(entity_types)
        self.model = model
        self.temperature = temperature
        self.max_tokens = max_tokens
        self.api_key = api_key
        self.client = None

        # Cost tracking
        self.total_input_tokens = 0
        self.total_output_tokens = 0

    @staticmethod
    def is_available() -> bool:
        """Check if the openai package is importable and OPENAI_API_KEY is set."""
        try:
            import openai  # noqa: F401 - imported only to probe availability
            import os
            # Check for API key
            return bool(os.getenv('OPENAI_API_KEY'))
        except ImportError:
            return False

    def initialize(self) -> bool:
        """Initialize the OpenAI client.

        Returns:
            True if a client is (or already was) created, False when no API
            key is available, the package is missing, or creation failed.
        """
        if self._initialized and self.client is not None:
            return True

        try:
            from openai import OpenAI
            import os

            api_key = self.api_key or os.getenv('OPENAI_API_KEY')
            if not api_key:
                logger.error("OPENAI_API_KEY not set. Export it or pass api_key parameter.")
                return False

            self.client = OpenAI(api_key=api_key)
            self._initialized = True
            logger.info(f"OpenAI client initialized with model: {self.model}")
            return True

        except ImportError:
            logger.error("openai not installed. Install with: pip install openai")
            return False
        except Exception as e:
            logger.error(f"Failed to initialize OpenAI client: {e}")
            return False

    def _call_openai(self, prompt: str, system_prompt: Optional[str] = None) -> Tuple[str, Dict]:
        """Make an OpenAI API call and return response + usage stats.

        Returns:
            ``(content, usage)``. On any failure both are empty
            (``""`` and ``{}``); the error is logged, not raised.
        """
        if not self._initialized and not self.initialize():
            return "", {}

        messages = []
        if system_prompt:
            messages.append({"role": "system", "content": system_prompt})
        messages.append({"role": "user", "content": prompt})

        request = {
            'model': self.model,
            'messages': messages,
            'temperature': self.temperature,
            'max_tokens': self.max_tokens,
        }
        # Only send response_format when JSON mode is wanted; omit the key
        # entirely otherwise rather than sending an explicit null.
        if 'gpt-4' in self.model:
            request['response_format'] = {"type": "json_object"}

        try:
            response = self.client.chat.completions.create(**request)

            # Track usage (guarded in case the response carries no usage data)
            usage = {}
            if response.usage is not None:
                usage = {
                    'input_tokens': response.usage.prompt_tokens,
                    'output_tokens': response.usage.completion_tokens,
                    'total_tokens': response.usage.total_tokens
                }
                self.total_input_tokens += usage['input_tokens']
                self.total_output_tokens += usage['output_tokens']

            # message.content may be None; treat that as an empty answer
            # instead of raising AttributeError.
            content = response.choices[0].message.content or ""
            return content.strip(), usage

        except Exception as e:
            logger.error(f"OpenAI API call failed: {e}")
            return "", {}

    @staticmethod
    def _unwrap_items(result: Any) -> List[Dict]:
        """Normalize decoded JSON to a list of candidate items.

        Accepts a bare list, an object wrapping a list under a well-known key,
        a single item object, or (as a last resort) an object wrapping a list
        under any key. Everything else yields an empty list.
        """
        if isinstance(result, list):
            return result
        if isinstance(result, dict):
            # Look for array in common keys
            for key in ['relationships', 'entities', 'triples', 'results', 'data']:
                if isinstance(result.get(key), list):
                    return result[key]
            # If it's a single item, wrap it
            if 'subject' in result or 'text' in result:
                return [result]
            # JSON-object mode forces a wrapper object whose key name is up
            # to the model; accept the first list value whatever it's called.
            for value in result.values():
                if isinstance(value, list):
                    return value
        return []

    @staticmethod
    def _to_confidence(value: Any, default: float) -> float:
        """Convert a model-supplied confidence to float, or return ``default``."""
        try:
            return float(value)
        except (TypeError, ValueError):
            return default

    @staticmethod
    def _clean_entity_names(names: Optional[List[str]]) -> List[str]:
        """Drop non-string/blank names and duplicates, keeping first-seen order."""
        cleaned = []
        seen = set()
        for candidate in names or []:
            if not isinstance(candidate, str) or not candidate.strip():
                continue
            if candidate in seen:
                continue
            seen.add(candidate)
            cleaned.append(candidate)
        return cleaned

    def _parse_json_response(self, response: str) -> List[Dict]:
        """Parse JSON from OpenAI response.

        Tries the whole response first, then the outermost ``[...]`` span.

        Returns:
            A list of items, empty if nothing usable could be parsed.
        """
        if not response:
            return []

        try:
            # Try direct parse
            return self._unwrap_items(json.loads(response))
        except json.JSONDecodeError:
            pass

        # Try to extract JSON array from response
        json_match = re.search(r'\[[\s\S]*\]', response)
        if json_match:
            try:
                return self._unwrap_items(json.loads(json_match.group()))
            except json.JSONDecodeError:
                pass

        logger.warning(f"Could not parse JSON from response: {response[:200]}...")
        return []

    def extract_entities(
        self,
        text: str,
        entity_types: Optional[List[str]] = None
    ) -> List[ExtractedEntity]:
        """Extract entities using OpenAI.

        Args:
            text: Input text (truncated to 6000 characters)
            entity_types: Entity labels to ask for (instance defaults if None)

        Returns:
            List of ExtractedEntity objects. Empty when the text is blank, the
            client cannot be initialized, or the API call fails.
        """
        # Blank input cannot contain entities; do not spend an API call on it.
        if not text or not text.strip():
            return []

        if not self._initialized and not self.initialize():
            return []

        types_to_use = entity_types or self.entity_types

        # Truncate text if too long
        max_text_len = 6000  # Leave room for prompt
        if len(text) > max_text_len:
            text = text[:max_text_len] + "..."

        prompt = self.ENTITY_EXTRACTION_PROMPT.format(
            entity_types=", ".join(types_to_use),
            text=text
        )

        response, usage = self._call_openai(prompt)
        parsed = self._parse_json_response(response)

        entities = []
        for item in parsed:
            if not isinstance(item, dict):
                continue
            entity_text = item.get('text')
            # Skip malformed items individually so one bad entry cannot
            # raise and abort the whole extraction.
            if not isinstance(entity_text, str) or not entity_text.strip():
                continue
            entities.append(ExtractedEntity(
                text=entity_text,
                label=str(item.get('label') or 'Unknown'),
                score=self._to_confidence(item.get('confidence', 0.9), 0.9)
            ))

        logger.debug(f"OpenAI extracted {len(entities)} entities (tokens: {usage.get('total_tokens', 0)})")
        return entities

    def extract_triples(
        self,
        text: str,
        known_entities: Optional[List[str]] = None
    ) -> List[ExtractedTriple]:
        """
        Extract relationship triples using OpenAI.

        This is OpenAI's strength - reliable, high-quality relationship extraction.

        Args:
            text: Source text
            known_entities: Pre-identified entities (from GLiNER) to find relationships between.
                When omitted or empty, entities are first extracted with
                ``extract_entities``.

        Returns:
            Triples whose subject and object both match one of the entities
            offered to the model; the matched entity name is what is stored.
        """
        if not text or not text.strip():
            return []

        if not self._initialized and not self.initialize():
            return []

        # Truncate text if too long
        max_text_len = 5000
        if len(text) > max_text_len:
            text = text[:max_text_len] + "..."

        known_entities = self._clean_entity_names(known_entities)

        # If no entities provided, we need to extract them first
        if not known_entities:
            entities = self.extract_entities(text)
            known_entities = self._clean_entity_names([e.text for e in entities])

        if not known_entities:
            logger.warning("No entities found/provided for relationship extraction")
            return []

        # Limit entities to prevent token explosion
        known_entities = known_entities[:self._MAX_ENTITIES_PER_PROMPT]

        prompt = self.RELATION_EXTRACTION_PROMPT.format(
            entities=", ".join(known_entities),
            text=text
        )

        system_prompt = "You are a knowledge graph expert. Extract only relationships that are explicitly stated or strongly implied in the text. Be precise and conservative."

        response, usage = self._call_openai(prompt, system_prompt)
        parsed = self._parse_json_response(response)

        triples = []
        for item in parsed:
            if not isinstance(item, dict):
                continue
            if not all(k in item for k in ['subject', 'predicate', 'object']):
                continue

            predicate = item['predicate']
            if not isinstance(predicate, str) or not predicate.strip():
                continue

            # Validate that subject and object are in our entity list
            # (fuzzy match; non-string values simply fail to match).
            subj_match = self._fuzzy_match(item['subject'], known_entities)
            obj_match = self._fuzzy_match(item['object'], known_entities)

            if subj_match and obj_match:
                triples.append(ExtractedTriple(
                    subject=subj_match,
                    predicate=predicate,
                    object=obj_match,
                    confidence=self._to_confidence(item.get('confidence', 0.9), 0.9),
                    source_text=text[:200]
                ))

        logger.debug(f"OpenAI extracted {len(triples)} relationships (tokens: {usage.get('total_tokens', 0)})")
        return triples

    def _fuzzy_match(self, text: str, candidates: List[str]) -> Optional[str]:
        """Find a matching entity from candidates.

        Comparison is case-insensitive and ignores surrounding whitespace.
        An exact match wins; otherwise the first candidate that contains, or
        is contained in, ``text`` is returned.

        Returns:
            The matching candidate exactly as given, or None. Non-string or
            blank ``text`` never matches, and blank candidates are ignored
            (an empty string would otherwise be a substring of everything).
        """
        if not isinstance(text, str):
            return None
        text_lower = text.lower().strip()
        if not text_lower:
            return None

        normalized = [
            (c, c.lower().strip())
            for c in candidates
            if isinstance(c, str) and c.strip()
        ]

        # Exact match
        for original, candidate_lower in normalized:
            if candidate_lower == text_lower:
                return original

        # Partial match
        for original, candidate_lower in normalized:
            if text_lower in candidate_lower or candidate_lower in text_lower:
                return original

        return None

    def get_usage_stats(self) -> Dict[str, Any]:
        """Get token usage statistics.

        Returns:
            Dict with the model name, accumulated input/output/total token
            counts, and an estimated cost in USD rounded to 4 decimals.
            Models missing from the price table are priced as 'gpt-4o-mini'.
        """
        # Approximate costs in USD per 1K tokens (update as needed)
        COSTS = {
            'gpt-4o': {'input': 0.0025, 'output': 0.01},
            'gpt-4o-mini': {'input': 0.00015, 'output': 0.0006},
            'gpt-4-turbo': {'input': 0.01, 'output': 0.03},
        }

        model_costs = COSTS.get(self.model, COSTS['gpt-4o-mini'])
        estimated_cost = (
            (self.total_input_tokens / 1000) * model_costs['input'] +
            (self.total_output_tokens / 1000) * model_costs['output']
        )

        return {
            'model': self.model,
            'input_tokens': self.total_input_tokens,
            'output_tokens': self.total_output_tokens,
            'total_tokens': self.total_input_tokens + self.total_output_tokens,
            'estimated_cost_usd': round(estimated_cost, 4)
        }


# =============================================================================
# HYBRID EXTRACTOR (Recommended)
# =============================================================================

class HybridExtractor(BaseEntityExtractor):
    """
    Hybrid extractor combining GLiNER for entities with OpenAI/LLaMA for relations.

    This is the RECOMMENDED approach for high-quality knowledge graph building:
    - GLiNER: Fast, CPU-friendly entity identification (nodes)
    - OpenAI: High-fidelity relationship extraction (edges)
    - LLaMA: Privacy-focused alternative for relations

    The hybrid approach saves cost by only using expensive LLMs for relationship
    extraction, not for entity identification which GLiNER handles well.

    If the requested relation backend is unavailable, unknown, or fails to
    initialize, a warning is logged and the extractor still works for
    entities; relationship extraction then yields an empty list.

    Usage:
        # GLiNER + OpenAI (recommended for quality)
        extractor = HybridExtractor(relation_extractor='openai')

        # GLiNER + LLaMA (for privacy/offline)
        extractor = HybridExtractor(relation_extractor='llama')

        # GLiNER only (no relations)
        extractor = HybridExtractor(relation_extractor=None)
    """

    name = "hybrid"

    def __init__(
        self,
        entity_types: Optional[List[str]] = None,
        gliner_model_size: str = 'medium',
        relation_extractor: Optional[str] = 'openai',  # 'openai', 'llama', or None
        openai_model: str = 'gpt-4o-mini',
        llama_model_path: Optional[str] = None
    ):
        """
        Initialize hybrid extractor.

        Args:
            entity_types: Entity types to extract
            gliner_model_size: GLiNER model size ('small', 'medium', 'large')
            relation_extractor: Backend for relations ('openai', 'llama', None)
            openai_model: OpenAI model for relations
            llama_model_path: Path to LLaMA GGUF model
        """
        super().__init__(entity_types)
        self.gliner = GLiNERExtractor(entity_types, model_size=gliner_model_size)
        self.relation_backend = None
        # Stored as requested, even when the backend turns out to be
        # unavailable: extract() reports this value in its result metadata.
        self.relation_extractor_type = relation_extractor

        # Initialize relation extractor
        if relation_extractor == 'openai':
            if OpenAIExtractor.is_available():
                self.relation_backend = OpenAIExtractor(entity_types, model=openai_model)
                logger.info("Hybrid mode: GLiNER (entities) + OpenAI (relations)")
            else:
                logger.warning("OpenAI not available for relations. Set OPENAI_API_KEY.")
        elif relation_extractor == 'llama':
            if LLaMAExtractor.is_available():
                self.relation_backend = LLaMAExtractor(entity_types, model_path=llama_model_path)
                logger.info("Hybrid mode: GLiNER (entities) + LLaMA (relations)")
            else:
                logger.warning("LLaMA not available for relations.")
        elif relation_extractor is not None:
            # A typo here would otherwise silently disable relation extraction.
            logger.warning(
                "Unknown relation extractor %r (expected 'openai', 'llama' or None); "
                "relationships will not be extracted.",
                relation_extractor
            )

    @staticmethod
    def is_available() -> bool:
        """Hybrid mode only requires GLiNER; the relation backend is optional."""
        return GLiNERExtractor.is_available()

    def initialize(self) -> bool:
        """
        Initialize GLiNER and, if present, the relation backend.

        Returns:
            True when GLiNER initialized. A relation backend that fails to
            initialize is non-fatal and does not affect the return value.
        """
        gliner_ok = self.gliner.initialize()
        if self.relation_backend:
            self.relation_backend.initialize()  # Non-fatal if fails
            if not self.relation_backend._initialized:
                # extract_triples() will return [] from now on; say so once
                # here instead of failing silently on every call.
                logger.warning(
                    "Hybrid mode: %s relation backend failed to initialize; "
                    "relationships will not be extracted.",
                    self.relation_extractor_type
                )
        self._initialized = gliner_ok
        return gliner_ok

    def extract_entities(
        self,
        text: str,
        entity_types: Optional[List[str]] = None
    ) -> List[ExtractedEntity]:
        """Extract entities using GLiNER (fast, CPU-friendly)."""
        return self.gliner.extract_entities(text, entity_types)

    def extract_triples(
        self,
        text: str,
        entities: Optional[List[ExtractedEntity]] = None
    ) -> List[ExtractedTriple]:
        """
        Extract relationships using OpenAI or LLaMA.

        With the OpenAI backend, the entity names (from ``entities`` or, when
        none are given, from a fresh GLiNER pass) are handed over as
        ``known_entities`` so the model only has to find relationships.
        Other backends receive just the text.

        Args:
            text: Text to extract relationships from.
            entities: Entities from a prior extraction, if any.

        Returns:
            Extracted triples, or an empty list when no relation backend is
            configured or it could not be initialized.
        """
        backend = self.relation_backend
        if not backend:
            return []

        # Allow calling this method directly without a prior initialize()/
        # extract(); mirrors the lazy initialization done by extract().
        if not backend._initialized and not self._initialized:
            self.initialize()
        if not backend._initialized:
            return []

        # Only the OpenAI backend accepts known entities, so the entity names
        # (and a possible GLiNER pass) are needed only in that case.
        if isinstance(backend, OpenAIExtractor):
            if not entities:
                # Extract entities first if not provided
                entities = self.extract_entities(text)
            known_entities = [e.text for e in entities]
            return backend.extract_triples(text, known_entities=known_entities)

        return backend.extract_triples(text)

    def extract(
        self,
        text: str,
        entity_types: Optional[List[str]] = None,
        extract_relations: bool = True
    ) -> ExtractionResult:
        """
        Full hybrid extraction pipeline.

        1. Use GLiNER to find entities (cheap/fast)
        2. Use OpenAI/LLaMA to find relationships between those entities (quality)

        Args:
            text: Text to process.
            entity_types: Entity types passed through to GLiNER.
            extract_relations: Whether to run the relation backend.

        Returns:
            ExtractionResult with entities, triples and metadata. If GLiNER
            cannot be initialized, a result carrying only the raw text and
            the extractor name is returned.
        """
        if not self._initialized:
            if not self.initialize():
                return ExtractionResult(raw_text=text, extractor_type=self.name)

        # Step 1: Extract entities with GLiNER
        entities = self.extract_entities(text, entity_types)

        # Step 2: Extract relationships if requested and entities were found
        triples = []
        if extract_relations and entities:
            triples = self.extract_triples(text, entities=entities)

        return ExtractionResult(
            entities=entities,
            triples=triples,
            raw_text=text,
            extractor_type=f"hybrid(gliner+{self.relation_extractor_type or 'none'})",
            metadata={
                'entity_count': len(entities),
                'relation_count': len(triples),
                'relation_backend': self.relation_extractor_type
            }
        )

    def get_usage_stats(self) -> Dict[str, Any]:
        """
        Get usage statistics (mainly for OpenAI costs).

        Returns:
            The OpenAI backend's usage stats when that backend is active,
            otherwise a dict naming the requested backend with a cost of 0.
        """
        if isinstance(self.relation_backend, OpenAIExtractor):
            return self.relation_backend.get_usage_stats()
        return {'relation_backend': self.relation_extractor_type, 'cost': 0}


# =============================================================================
# FACTORY & UTILITIES
# =============================================================================

class ExtractorNotConfiguredError(Exception):
    """Signals that the user has not yet configured entity extraction."""


class ExtractorNotAvailableError(Exception):
    """Signals that the configured extractor cannot be used (not available)."""


def get_entity_extractor(
    extractor_type: Optional[str] = None,
    check_configured: bool = True,
    **kwargs
) -> BaseEntityExtractor:
    """
    Factory function to get an entity extractor.

    Keyword arguments the caller does not supply are filled in from
    ENTITY_EXTRACTION_CONFIG, one key at a time, so passing one option
    (e.g. ``model_size``) never suppresses the configured default of another
    (e.g. ``threshold``).

    Args:
        extractor_type: One of 'gliner', 'llama', 'openai', 'hybrid', or None
            to use the configured default extractor (case-insensitive)
        check_configured: If True, raises ExtractorNotConfiguredError if not configured
        **kwargs: Additional arguments passed to extractor constructor

    Returns:
        Configured entity extractor

    Raises:
        ExtractorNotConfiguredError: If check_configured=True and entity extraction
            hasn't been explicitly configured by the user
        ExtractorNotAvailableError: If the configured provider is not available
        ValueError: If extractor_type is not one of the known types

    Examples:
        # Basic extractors
        extractor = get_entity_extractor('gliner')
        extractor = get_entity_extractor('openai', model='gpt-4o')
        extractor = get_entity_extractor('llama')

        # Hybrid (recommended)
        extractor = get_entity_extractor('hybrid', relation_extractor='openai')
        extractor = get_entity_extractor('hybrid', relation_extractor='llama')

        # Skip configuration check (for internal use)
        extractor = get_entity_extractor(check_configured=False)
    """
    # Load configuration; without a config module, fall back to built-in
    # defaults and report entity extraction as "not configured".
    try:
        from config import ENTITY_EXTRACTION_CONFIG, is_entity_extraction_configured
    except ImportError:
        ENTITY_EXTRACTION_CONFIG = {'default_extractor': 'gliner', 'configured': False}

        def is_entity_extraction_configured() -> bool:
            return False

    # Check if user has configured entity extraction
    if check_configured and not is_entity_extraction_configured():
        raise ExtractorNotConfiguredError(
            "Entity extraction has not been configured. "
            "Run: rdf config entity-extraction --provider <gliner|openai|hybrid>"
        )

    # Use the configured default if not specified (an empty/None config
    # value falls back to 'gliner' rather than failing on .lower()).
    if extractor_type is None:
        extractor_type = ENTITY_EXTRACTION_CONFIG.get('default_extractor') or 'gliner'

    extractor_type = extractor_type.lower()

    # Apply config defaults to kwargs. setdefault() leaves caller-supplied
    # values untouched, so each key is defaulted independently.
    if extractor_type == 'gliner':
        gliner_config = ENTITY_EXTRACTION_CONFIG.get('gliner', {})
        kwargs.setdefault('model_size', gliner_config.get('model_size', 'medium'))
        kwargs.setdefault('threshold', gliner_config.get('threshold', 0.4))

    elif extractor_type == 'openai':
        kwargs.setdefault('model', ENTITY_EXTRACTION_CONFIG.get('openai_model', 'gpt-4o-mini'))

    elif extractor_type == 'hybrid':
        kwargs.setdefault('relation_extractor', ENTITY_EXTRACTION_CONFIG.get('relation_backend', 'openai'))
        kwargs.setdefault('openai_model', ENTITY_EXTRACTION_CONFIG.get('openai_model', 'gpt-4o-mini'))
        gliner_config = ENTITY_EXTRACTION_CONFIG.get('gliner', {})
        kwargs.setdefault('gliner_model_size', gliner_config.get('model_size', 'medium'))

    # Create extractor
    if extractor_type == 'gliner':
        if not GLiNERExtractor.is_available():
            raise ExtractorNotAvailableError(
                "GLiNER not available. Install with: pip install gliner"
            )
        return GLiNERExtractor(**kwargs)

    elif extractor_type == 'llama':
        if not LLaMAExtractor.is_available():
            raise ExtractorNotAvailableError(
                "LLaMA not available. Install with: pip install llama-cpp-python"
            )
        return LLaMAExtractor(**kwargs)

    elif extractor_type == 'openai':
        if not OpenAIExtractor.is_available():
            raise ExtractorNotAvailableError(
                "OpenAI not available. Install openai and set OPENAI_API_KEY in .env"
            )
        return OpenAIExtractor(**kwargs)

    elif extractor_type == 'hybrid':
        # Only GLiNER is mandatory for hybrid mode.
        if not GLiNERExtractor.is_available():
            raise ExtractorNotAvailableError(
                "GLiNER required for hybrid mode. Install with: pip install gliner"
            )
        return HybridExtractor(**kwargs)

    else:
        raise ValueError(f"Unknown extractor type: {extractor_type}. "
                        f"Valid options: gliner, llama, openai, hybrid")


def get_extractor_or_status(**kwargs) -> tuple:
    """
    Build an entity extractor, or describe why one could not be built.

    All keyword arguments are forwarded to get_entity_extractor(). The two
    extractor-specific exceptions are converted into status dictionaries
    (intended for Claude Code) instead of being raised.

    Returns:
        Tuple of (extractor, status_dict)
        - On success: (extractor, None)
        - When configuration is needed: (None, {'code': 'CONFIGURATION_REQUIRED', ...})
        - When a dependency is missing: (None, {'code': 'DEPENDENCY_MISSING', ...})
    """
    try:
        built = get_entity_extractor(**kwargs)
    except ExtractorNotConfiguredError as not_configured:
        # Offer the caller a ready-made decision prompt for picking a provider.
        provider_choices = [
            {'id': 'gliner', 'label': 'GLiNER (Fast/Free)', 'description': 'CPU-based, no API costs'},
            {'id': 'openai', 'label': 'OpenAI (Quality)', 'description': 'Best quality, ~$0.01/1K tokens'},
            {'id': 'hybrid', 'label': 'Hybrid (Recommended)', 'description': 'GLiNER + OpenAI relations'}
        ]
        decision = {
            'decision_id': 'entity_extraction_provider',
            'question': 'Which entity extraction provider should be used?',
            'options': provider_choices,
            'default': 'hybrid'
        }
        status = {
            'status': 'error',
            'code': 'CONFIGURATION_REQUIRED',
            'message': str(not_configured),
            'actionable_advice': 'Run: rdf config entity-extraction',
            'decision_packet': decision
        }
        return (None, status)
    except ExtractorNotAvailableError as not_available:
        message = str(not_available)
        # The advice is whatever follows the last ". " in the message
        # (the install hint); None when the message has no such separator.
        _, separator, tail = message.rpartition('. ')
        advice = tail if separator else None
        status = {
            'status': 'error',
            'code': 'DEPENDENCY_MISSING',
            'message': message,
            'actionable_advice': advice
        }
        return (None, status)
    else:
        return (built, None)


def check_extractor_availability() -> Dict[str, Any]:
    """
    Report availability and capabilities for each extractor backend.

    Returns:
        Dict keyed by extractor name ('gliner', 'llama', 'openai', 'hybrid').
        Each entry holds an 'available' flag and a 'capabilities' list, plus
        install hints, the OpenAI API-key flag, or the 'recommended' marker
        where applicable.
    """
    import os

    has_gliner = GLiNERExtractor.is_available()
    has_llama = LLaMAExtractor.is_available()
    has_openai = OpenAIExtractor.is_available()

    report = {}
    report['gliner'] = {
        'available': has_gliner,
        'capabilities': ['entities'],
        'install': 'pip install gliner'
    }
    report['llama'] = {
        'available': has_llama,
        'capabilities': ['entities', 'relations'],
        'install': 'pip install llama-cpp-python'
    }
    report['openai'] = {
        'available': has_openai,
        'capabilities': ['entities', 'relations'],
        'install': 'pip install openai && export OPENAI_API_KEY=...',
        # True only for a non-empty OPENAI_API_KEY environment variable.
        'api_key_set': bool(os.environ.get('OPENAI_API_KEY'))
    }
    # Hybrid availability follows GLiNER's.
    report['hybrid'] = {
        'available': has_gliner,
        'capabilities': ['entities', 'relations (with openai/llama)'],
        'recommended': True
    }
    return report


# =============================================================================
# CLI INTERFACE
# =============================================================================

def _build_cli_parser():
    """Build the argument parser for the entity-extraction test CLI."""
    import argparse

    parser = argparse.ArgumentParser(
        description='Entity extraction for GraphRAG',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
    # Check available extractors
    python entity_extractors.py --check

    # Extract entities with GLiNER (fast, CPU)
    python entity_extractors.py --text "Rudolf Steiner founded Anthroposophy."

    # Extract with OpenAI (highest quality)
    python entity_extractors.py --text "..." --extractor openai

    # Hybrid: GLiNER entities + OpenAI relations (RECOMMENDED)
    python entity_extractors.py --text "..." --extractor hybrid --relations

    # Hybrid with LLaMA for offline/privacy
    python entity_extractors.py --text "..." --extractor hybrid --relation-backend llama --relations

    # Extract with custom entity types
    python entity_extractors.py --text "..." --types Person Concept Work

    # Show costs after OpenAI extraction
    python entity_extractors.py --text "..." --extractor openai --show-cost
"""
    )

    parser.add_argument('--text', type=str, help='Text to extract entities from')
    parser.add_argument('--file', type=str, help='File to read text from')
    parser.add_argument('--extractor', choices=['gliner', 'llama', 'openai', 'hybrid'],
                        default='gliner', help='Extractor to use')
    parser.add_argument('--types', nargs='+', help='Entity types to extract')
    parser.add_argument('--relations', action='store_true',
                        help='Also extract relationships (best with hybrid/openai)')
    parser.add_argument('--check', action='store_true',
                        help='Check available extractors')
    parser.add_argument('--threshold', type=float, default=0.4,
                        help='Confidence threshold for GLiNER')
    parser.add_argument('--model-size', choices=['small', 'medium', 'large'],
                        default='medium', help='GLiNER model size')
    parser.add_argument('--openai-model', default='gpt-4o-mini',
                        help='OpenAI model (gpt-4o, gpt-4o-mini, gpt-4-turbo)')
    parser.add_argument('--relation-backend', choices=['openai', 'llama'],
                        default='openai', help='Backend for relations in hybrid mode')
    parser.add_argument('--show-cost', action='store_true',
                        help='Show cost estimate after OpenAI extraction')
    parser.add_argument('--json', action='store_true', help='Output as JSON')
    return parser


def _print_availability_report():
    """Print a human-readable availability summary for every extractor."""
    availability = check_extractor_availability()
    print("\n" + "=" * 60)
    print("ENTITY EXTRACTOR AVAILABILITY")
    print("=" * 60)

    for name, info in availability.items():
        if not isinstance(info, dict):
            continue
        status = "✓ Available" if info.get('available') else "✗ Not installed"
        caps = ", ".join(info.get('capabilities', []))
        recommended = " (RECOMMENDED)" if info.get('recommended') else ""
        print(f"\n  {name.upper()}{recommended}")
        print(f"    Status: {status}")
        print(f"    Capabilities: {caps}")
        # Install hints are only useful for extractors that are missing.
        if not info.get('available') and 'install' in info:
            print(f"    Install: {info['install']}")
        if name == 'openai' and info.get('available'):
            print(f"    API Key: {'Set' if info.get('api_key_set') else 'NOT SET'}")

    print("\n" + "-" * 60)
    print("RECOMMENDED: Use 'hybrid' with --relation-backend openai")
    print("  - GLiNER for fast entity extraction (free)")
    print("  - OpenAI for high-quality relationship extraction")
    print("=" * 60)


def _print_extraction_report(result, extractor, relations_requested, show_cost):
    """Print entities, relationships and (optionally) cost as plain text."""
    print(f"\nExtractor: {result.extractor_type}")
    print("=" * 50)

    if result.entities:
        print(f"\nEntities ({len(result.entities)}):")
        for ent in result.entities:
            print(f"  [{ent.label}] {ent.text} (score: {ent.score:.2f})")
    else:
        print("\nNo entities found")

    if result.triples:
        print(f"\nRelationships ({len(result.triples)}):")
        for triple in result.triples:
            # Confidence is shown only when it is below 100%.
            conf = f" ({triple.confidence:.0%})" if triple.confidence < 1.0 else ""
            print(f"  {triple.subject} --[{triple.predicate}]--> {triple.object}{conf}")
    elif relations_requested:
        print("\nNo relationships found")

    # Show cost estimate (only when tokens were actually consumed)
    if show_cost and hasattr(extractor, 'get_usage_stats'):
        stats = extractor.get_usage_stats()
        if stats.get('total_tokens', 0) > 0:
            print("\n--- Cost Estimate ---")
            print(f"  Model: {stats.get('model', 'unknown')}")
            print(f"  Tokens: {stats.get('total_tokens', 0):,}")
            print(f"  Est. Cost: ${stats.get('estimated_cost_usd', 0):.4f}")


def main():
    """
    CLI for testing entity extraction.

    Parses command-line arguments, then either prints the extractor
    availability report (--check) or runs the selected extractor on the
    given text/file and prints the result as text or JSON. Problems
    (missing input, unconfigured or unavailable extractor) are reported as
    an "Error: ..." line instead of a traceback.
    """
    args = _build_cli_parser().parse_args()

    if args.check:
        _print_availability_report()
        return

    # Get text (--file takes precedence over --text when both are given)
    text = args.text
    if args.file:
        # Explicit UTF-8 so the result does not depend on the platform's
        # default encoding.
        with open(args.file, 'r', encoding='utf-8') as f:
            text = f.read()

    if not text:
        print("Error: Provide --text or --file")
        return

    # Get extractor with appropriate kwargs
    kwargs = {}
    if args.extractor == 'gliner':
        kwargs['model_size'] = args.model_size
        kwargs['threshold'] = args.threshold
    elif args.extractor == 'openai':
        kwargs['model'] = args.openai_model
    elif args.extractor == 'hybrid':
        kwargs['gliner_model_size'] = args.model_size
        kwargs['relation_extractor'] = args.relation_backend
        kwargs['openai_model'] = args.openai_model

    # The factory signals failure by raising, never by returning None.
    try:
        extractor = get_entity_extractor(args.extractor, **kwargs)
    except (ExtractorNotConfiguredError, ExtractorNotAvailableError) as exc:
        print(f"Error: {exc}")
        return

    # Extract
    result = extractor.extract(
        text,
        entity_types=args.types,
        extract_relations=args.relations
    )

    if args.json:
        output = result.to_dict()
        # Add cost info if available
        if args.show_cost and hasattr(extractor, 'get_usage_stats'):
            output['usage'] = extractor.get_usage_stats()
        print(json.dumps(output, indent=2))
    else:
        _print_extraction_report(result, extractor, args.relations, args.show_cost)


# Run the CLI only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
