#!/usr/bin/env python3
"""
OCR Quality Assessment Script
=============================

Analyzes document text quality based on multiple metrics:
- Space density (key indicator of OCR word boundary issues)
- Average word length
- Punctuation ratio (reported in the metrics; not weighted into the final score)
- Gibberish character ratio
- Sentence structure quality

Quality grades:
- excellent: 90-100 score, well-formatted readable text
- good: 75-89 score, minor issues but fully usable
- fair: 50-74 score, some issues but mostly readable
- poor: 25-49 score, significant issues, limited usefulness
- unusable: 0-24 score, text extraction failed, not useful for search

Usage:
    python assess_quality.py                    # Assess all documents
    python assess_quality.py --document DOC_ID  # Assess specific document
    python assess_quality.py --dry-run          # Preview without updating
"""

import argparse
import re
import sys
from pathlib import Path

# Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent))

from pipeline.db_utils import get_db_connection, get_dict_cursor


def calculate_space_density(text: str) -> float:
    """
    Return the share of literal space characters in *text* as a percentage.

    Only the ' ' character is counted; tabs and newlines are not.
    Typical English prose sits around 15-20% spaces, while OCR output
    that lost its word boundaries drops below 5%.

    Returns: percentage of spaces (0-100); 0.0 for empty input
    """
    if not text:
        return 0.0

    length = len(text)
    if length > 0:
        spaces = text.count(' ')
        return (spaces / length) * 100
    return 0.0


def calculate_avg_word_length(text: str) -> float:
    """
    Return the mean length of the whitespace-separated tokens in *text*.

    Ordinary English averages roughly 4-6 characters per word; OCR output
    with missing spaces produces very long run-together "words".

    Returns: average word length, or 0.0 when there are no tokens
    """
    tokens = text.split()
    token_count = len(tokens)
    if token_count == 0:
        return 0.0

    total_length = 0
    for token in tokens:
        total_length += len(token)
    return total_length / token_count


def calculate_gibberish_ratio(text: str) -> float:
    """
    Return the percentage of characters in *text* that look like OCR noise.

    A character counts as gibberish when it is not a word character,
    not whitespace, and not one of the common punctuation/symbol
    characters whitelisted in the pattern below.

    Returns: percentage of gibberish (0-100); 0.0 for empty input
    """
    if not text:
        return 0.0

    # Negated class: matches anything outside the set of "normal" characters
    unusual_char = re.compile(r'[^\w\s.,;:!?\'"()\-\[\]{}@#$%&*+=/<>°©®™€£¥\n\r]')
    hits = sum(1 for _ in unusual_char.finditer(text))
    return (hits / len(text)) * 100


def calculate_punctuation_ratio(text: str) -> float:
    """
    Return the percentage of characters in *text* that are punctuation.

    Counted characters: . , ; : ! ? ' " ( ) -
    Normal text lands around 5-15%; values far outside that range hint
    at extraction problems.

    Returns: percentage of punctuation (0-100); 0.0 for empty input
    """
    if not text:
        return 0.0

    punctuation = re.compile(r'[.,;:!?\'"()\-]')
    marks = punctuation.findall(text)
    return (len(marks) / len(text)) * 100


def calculate_sentence_quality(text: str) -> float:
    """
    Estimate how much of *text* is covered by sentence-shaped spans.

    A span qualifies when it starts with an ASCII capital letter, runs
    through any characters other than . ! ?, and ends with one of
    . ! ?. The lengths of all such spans are summed and compared with
    the total text length.

    Returns: percentage of text in valid sentences (0-100); 0.0 for empty input
    """
    if not text:
        return 0.0

    # Capital letter, then anything up to the first terminator, then the terminator
    sentence_shape = re.compile(r'[A-Z][^.!?]*[.!?]')

    covered = 0
    for match in sentence_shape.finditer(text):
        covered += match.end() - match.start()
    return (covered / len(text)) * 100


def _empty_assessment(notes: str) -> dict:
    """Build the zero-score 'unusable' result used when there is no text to analyze."""
    return {
        'score': 0,
        'status': 'unusable',
        'notes': notes,
        'metrics': {}
    }


def _space_component(space_density: float) -> tuple:
    """Grade space density (%); return (component score, issue text or None)."""
    # Normal text ~15-20% spaces, bad OCR <5%
    if space_density >= 12:
        return 100, None
    if space_density >= 8:
        return 70, None
    if space_density >= 5:
        return 40, f'Low space density ({space_density:.1f}%) - possible word boundary issues'
    if space_density >= 2:
        return 20, f'Very low space density ({space_density:.1f}%) - significant word boundary issues'
    return 0, f'Minimal spaces ({space_density:.1f}%) - text likely unusable'


def _word_length_component(avg_word_length: float) -> tuple:
    """Grade average word length; return (component score, issue text or None)."""
    # Normal avg 4-7, bad OCR has very long "words"
    if 3 <= avg_word_length <= 8:
        return 100, None
    if 2 <= avg_word_length <= 12:
        return 70, None
    if avg_word_length <= 20:
        # Also reached for averages below 2 (including 0.0 for token-less text)
        return 40, f'Unusual avg word length ({avg_word_length:.1f})'
    return 10, f'Extreme avg word length ({avg_word_length:.1f}) - words not properly separated'


def _gibberish_component(gibberish_ratio: float) -> tuple:
    """Grade gibberish ratio (%); return (component score, issue text or None)."""
    if gibberish_ratio <= 2:
        return 100, None
    if gibberish_ratio <= 5:
        return 80, None
    if gibberish_ratio <= 10:
        return 50, f'Elevated gibberish characters ({gibberish_ratio:.1f}%)'
    return 20, f'High gibberish ratio ({gibberish_ratio:.1f}%) - OCR artifacts'


def _sentence_component(sentence_quality: float) -> tuple:
    """Grade sentence coverage (%); return (component score, issue text or None)."""
    if sentence_quality >= 60:
        return 100, None
    if sentence_quality >= 40:
        return 70, None
    if sentence_quality >= 20:
        return 40, f'Poor sentence structure ({sentence_quality:.1f}%)'
    return 20, f'Very poor sentence structure ({sentence_quality:.1f}%)'


def assess_document_quality(document_id: str, chunks: list) -> dict:
    """
    Score the OCR quality of one document from its text chunks.

    Every chunk with a non-empty 'chunk_text' is joined with single spaces
    and measured as one body of text. Four component scores (space density,
    average word length, gibberish ratio, sentence coverage) are combined
    with weights 40/25/15/20. The punctuation ratio is computed and reported
    in the metrics but does not influence the score.

    Args:
        document_id: identifier of the document; accepted for the caller's
            convenience and not read by this function.
        chunks: mapping-like rows, each optionally carrying 'chunk_text'.

    Returns dict with:
        - score: 0-100 quality score
        - status: 'excellent', 'good', 'fair', 'poor', 'unusable'
        - notes: explanation of issues found
        - metrics: detailed metric values (empty when there is no text)
    """
    if not chunks:
        return _empty_assessment('No text chunks found')

    # Keep only chunks that actually carry text
    text_chunks = []
    for chunk in chunks:
        if chunk.get('chunk_text'):
            text_chunks.append(chunk['chunk_text'])
    if not text_chunks:
        return _empty_assessment('No text content in chunks')

    combined_text = ' '.join(text_chunks)

    # Raw measurements over the whole document
    space_density = calculate_space_density(combined_text)
    avg_word_length = calculate_avg_word_length(combined_text)
    gibberish_ratio = calculate_gibberish_ratio(combined_text)
    punctuation_ratio = calculate_punctuation_ratio(combined_text)
    sentence_quality = calculate_sentence_quality(combined_text)

    metrics = {
        'space_density': round(space_density, 2),
        'avg_word_length': round(avg_word_length, 2),
        'gibberish_ratio': round(gibberish_ratio, 2),
        'punctuation_ratio': round(punctuation_ratio, 2),
        'sentence_quality': round(sentence_quality, 2),
        'total_chars': len(combined_text),
        'chunk_count': len(text_chunks)
    }

    # Component scores (each 0-100) with their optional issue message,
    # listed in the order the issues should appear in the notes
    space_score, space_issue = _space_component(space_density)
    word_score, word_issue = _word_length_component(avg_word_length)
    gibberish_score, gibberish_issue = _gibberish_component(gibberish_ratio)
    sentence_score, sentence_issue = _sentence_component(sentence_quality)

    issues = [
        issue
        for issue in (space_issue, word_issue, gibberish_issue, sentence_issue)
        if issue is not None
    ]

    # Weighted blend; space density dominates because it is the strongest
    # signal of lost word boundaries. int() truncates toward zero.
    weighted_total = (
        space_score * 0.40 +
        word_score * 0.25 +
        gibberish_score * 0.15 +
        sentence_score * 0.20
    )
    final_score = int(weighted_total)

    # Highest cutoff the score reaches decides the grade
    status = 'unusable'
    for cutoff, label in ((90, 'excellent'), (75, 'good'), (50, 'fair'), (25, 'poor')):
        if final_score >= cutoff:
            status = label
            break

    if issues:
        notes = '; '.join(issues)
    else:
        notes = 'No significant issues detected'

    return {
        'score': final_score,
        'status': status,
        'notes': notes,
        'metrics': metrics
    }


def get_document_chunks(document_id: str) -> list:
    """
    Return every chunk row of one document, ordered by chunk_sequence.

    Each row carries chunk_id, chunk_sequence and chunk_text. The row type
    is whatever the cursor from get_dict_cursor() yields (presumably
    dict-like rows — confirm against pipeline.db_utils).
    """
    with get_dict_cursor() as (_conn, cursor):
        query = """
            SELECT chunk_id, chunk_sequence, chunk_text
            FROM chunks
            WHERE document_id = %s
            ORDER BY chunk_sequence
        """
        params = (document_id,)
        cursor.execute(query, params)
        rows = cursor.fetchall()
        return rows


def get_all_documents() -> list:
    """
    Return all rows of the documents table, ordered by title.

    Each row carries document_id, title, quality_status and quality_score.
    The row type is whatever the cursor from get_dict_cursor() yields
    (presumably dict-like rows — confirm against pipeline.db_utils).
    """
    with get_dict_cursor() as (_conn, cursor):
        query = """
            SELECT document_id, title, quality_status, quality_score
            FROM documents
            ORDER BY title
        """
        cursor.execute(query)
        rows = cursor.fetchall()
        return rows


def update_document_quality(document_id: str, score: int, status: str, notes: str):
    """
    Write quality_score, quality_status and quality_notes for one document.

    NOTE(review): no explicit commit is issued here; presumably the
    context manager returned by get_db_connection() commits on a clean
    exit — confirm against pipeline.db_utils.
    """
    with get_db_connection() as connection:
        with connection.cursor() as cursor:
            statement = """
                UPDATE documents
                SET quality_score = %s,
                    quality_status = %s,
                    quality_notes = %s
                WHERE document_id = %s
            """
            # Placeholder order: the three SET values, then the WHERE key
            values = (score, status, notes, document_id)
            cursor.execute(statement, values)


def main():
    """
    Command-line entry point: assess documents and report/persist the results.

    Parses --document/-d, --dry-run/-n and --verbose/-v, then assesses either
    the single requested document or every row from get_all_documents().
    One line is printed per document, followed by a per-status summary.
    Unless --dry-run is given, score, status and notes are written back for
    each document through update_document_quality().
    """
    parser = argparse.ArgumentParser(description='Assess OCR quality of documents')
    parser.add_argument('--document', '-d', help='Assess specific document ID')
    parser.add_argument('--dry-run', '-n', action='store_true', help='Preview without updating')
    parser.add_argument('--verbose', '-v', action='store_true', help='Show detailed metrics')
    args = parser.parse_args()

    print("=" * 70)
    print("OCR QUALITY ASSESSMENT")
    print("=" * 70)

    if args.document:
        # Single-document mode: no title lookup, the ID doubles as the title
        documents = [{'document_id': args.document, 'title': args.document}]
    else:
        documents = get_all_documents()

    print(f"\nAssessing {len(documents)} document(s)...\n")

    results_by_status = {'excellent': [], 'good': [], 'fair': [], 'poor': [], 'unusable': []}

    # Loop-invariant lookup table, built once instead of once per document
    status_icons = {
        'excellent': '✓',
        'good': '○',
        'fair': '△',
        'poor': '✗',
        'unusable': '⊘'
    }

    for doc in documents:
        doc_id = doc['document_id']
        # A row can contain the 'title' key with a NULL/None value, in which
        # case dict.get's default is NOT used and slicing None would raise
        # TypeError. Fall back to the ID for any empty title, and coerce to
        # str so non-string IDs can be sliced too.
        title = str(doc.get('title') or doc_id)[:50]

        chunks = get_document_chunks(doc_id)
        result = assess_document_quality(doc_id, chunks)

        status_icon = status_icons.get(result['status'], '?')

        print(f"{status_icon} [{result['score']:3d}] {result['status']:10s} | {title}")

        # Metrics are empty for documents without text, so skip the detail line
        if args.verbose and result['metrics']:
            m = result['metrics']
            print(f"         Spaces: {m['space_density']:.1f}%  AvgWord: {m['avg_word_length']:.1f}  "
                  f"Gibberish: {m['gibberish_ratio']:.1f}%  Sentences: {m['sentence_quality']:.1f}%")

        # Only the two lowest grades get their issue list printed
        if result['notes'] and result['status'] in ('poor', 'unusable'):
            print(f"         Issues: {result['notes']}")

        results_by_status[result['status']].append(doc_id)

        if not args.dry_run:
            update_document_quality(doc_id, result['score'], result['status'], result['notes'])

    # Summary
    print("\n" + "=" * 70)
    print("SUMMARY")
    print("=" * 70)
    for status in ['excellent', 'good', 'fair', 'poor', 'unusable']:
        count = len(results_by_status[status])
        if count > 0:
            print(f"  {status:10s}: {count} document(s)")

    if args.dry_run:
        print("\n[DRY RUN - no changes made to database]")
    else:
        print("\n[Quality scores updated in database]")


if __name__ == '__main__':
    main()
