#!/usr/bin/env python3
"""
Re-OCR Document Pipeline
========================

Re-processes documents with improved OCR methods for better text extraction.
Includes workflow management for documents that fail multiple re-OCR attempts.

Supported OCR methods:
- tesseract_best: Tesseract with best quality settings (slower)
- tesseract_fast: Tesseract with fast settings
- easyocr: EasyOCR (good for multilingual)

Usage:
    # List documents
    python reocr_document.py --list-poor           # List poor quality documents
    python reocr_document.py --list-archived       # List archived (unfixable) documents
    python reocr_document.py --stats               # Show OCR workflow statistics

    # Process single document
    python reocr_document.py <doc_id> --method tesseract_best [--preprocess]

    # Batch process all poor quality documents
    python reocr_document.py --batch [--max-attempts 3]

    # Archive/restore documents
    python reocr_document.py <doc_id> --archive    # Mark as unrecoverable
    python reocr_document.py <doc_id> --restore    # Restore for retry
"""

import os
import sys
import argparse
import subprocess
import tempfile
import shutil
import signal
from pathlib import Path
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor, TimeoutError as FuturesTimeoutError

# Add parent directory to path for imports
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from pipeline.config import get_config
from pipeline.db_utils import get_db_connection

# Try to import OCR libraries
try:
    import pytesseract
    from PIL import Image
    TESSERACT_AVAILABLE = True
except ImportError:
    TESSERACT_AVAILABLE = False

try:
    import easyocr
    EASYOCR_AVAILABLE = True
except ImportError:
    EASYOCR_AVAILABLE = False

try:
    import pdf2image
    PDF2IMAGE_AVAILABLE = True
except ImportError:
    PDF2IMAGE_AVAILABLE = False


# Default timeout per page (seconds) - can be overridden via CLI
DEFAULT_PAGE_TIMEOUT = 120  # 2 minutes per page


# =============================================================================
# TIMEOUT AND PRIORITY HELPERS
# =============================================================================

def run_with_timeout(func, args=(), kwargs=None, timeout=DEFAULT_PAGE_TIMEOUT):
    """Run ``func(*args, **kwargs)`` in a worker thread, giving up after ``timeout`` seconds.

    Args:
        func: Callable to execute.
        args: Positional arguments for ``func``.
        kwargs: Keyword arguments for ``func`` (None means no keyword arguments).
        timeout: Maximum number of seconds to wait for the result.

    Returns:
        tuple: ``(result, None)`` on success, or ``(None, error_message)`` when
        the call timed out or raised. Timeout messages start with "Timeout".

    Note:
        Python threads cannot be killed. After a timeout the worker keeps
        running in the background until ``func`` returns on its own, but the
        caller gets control back immediately.
    """
    if kwargs is None:
        kwargs = {}

    # Deliberately NOT a ``with`` block: leaving the context manager calls
    # shutdown(wait=True), which blocks until the worker finishes and would
    # turn the timeout into a no-op for the caller.
    executor = ThreadPoolExecutor(max_workers=1)
    try:
        future = executor.submit(func, *args, **kwargs)
        try:
            return future.result(timeout=timeout), None
        except FuturesTimeoutError:
            return None, f"Timeout after {timeout}s"
        except Exception as e:
            return None, str(e)
    finally:
        # Release the executor without waiting for a possibly stuck worker.
        executor.shutdown(wait=False)


def set_low_priority():
    """Lower the scheduling priority of this process by a nice increment of 10.

    Used to reduce CPU contention while batch processing runs.

    Returns:
        bool: True when the priority was lowered, False when ``os.nice`` is
        missing (e.g. Windows) or the call was refused.
    """
    try:
        os.nice(10)
    except (OSError, AttributeError):
        # No os.nice on this platform, or the OS rejected the change
        return False
    return True


# =============================================================================
# LIST/STATS FUNCTIONS
# =============================================================================

def list_poor_quality_documents():
    """Print every document whose quality_status is 'poor' or 'unusable'.

    Rows are ordered by fewest re-OCR attempts first, then lowest quality
    score. Nothing is returned; the listing is written to stdout.
    """
    with get_db_connection() as conn:
        with conn.cursor() as cur:
            cur.execute("""
                SELECT document_id, title, source_file, quality_status, quality_score,
                       reocr_attempts, reocr_last_attempt, reocr_last_method, quality_notes
                FROM documents
                WHERE quality_status IN ('poor', 'unusable')
                ORDER BY reocr_attempts ASC, quality_score ASC
            """)
            docs = cur.fetchall()

    if not docs:
        print("No poor quality documents found.")
        return

    print(f"\n{'='*90}")
    print(f"Poor Quality Documents ({len(docs)} total)")
    print(f"{'='*90}\n")

    for doc in docs:
        doc_id, title, source, status, score, attempts, last_attempt, last_method, notes = doc
        # A NULL title must not abort the whole listing (slicing None raises).
        title = title or '(untitled)'
        print(f"ID: {doc_id}")
        print(f"  Title: {title[:60]}{'...' if len(title) > 60 else ''}")
        print(f"  Source: {source or 'N/A'}")
        print(f"  Quality: {status} (score: {score})")
        print(f"  Re-OCR Attempts: {attempts or 0}", end='')
        if last_attempt:
            # Continue the "Re-OCR Attempts" line with the last attempt details
            print(f" (last: {last_attempt.strftime('%Y-%m-%d')} via {last_method})")
        else:
            print()
        if notes:
            print(f"  Issues: {notes[:80]}{'...' if len(notes) > 80 else ''}")
        print()


def list_archived_documents():
    """Print every document whose quality_status is 'archived'.

    Rows are ordered by most recent re-OCR attempt first (documents without
    an attempt date last). Nothing is returned; output goes to stdout.
    """
    with get_db_connection() as conn:
        with conn.cursor() as cur:
            cur.execute("""
                SELECT document_id, title, source_file, quality_score,
                       reocr_attempts, reocr_last_attempt, reocr_last_method, quality_notes
                FROM documents
                WHERE quality_status = 'archived'
                ORDER BY reocr_last_attempt DESC NULLS LAST
            """)
            docs = cur.fetchall()

    if not docs:
        print("No archived documents found.")
        return

    print(f"\n{'='*90}")
    print(f"Archived Documents ({len(docs)} total)")
    print("Archived = kept for metadata only, text unavailable")
    print(f"{'='*90}\n")

    for doc in docs:
        doc_id, title, source, score, attempts, last_attempt, last_method, notes = doc
        # A NULL title must not abort the whole listing (slicing None raises).
        title = title or '(untitled)'
        print(f"ID: {doc_id}")
        print(f"  Title: {title[:60]}{'...' if len(title) > 60 else ''}")
        print(f"  Source: {source or 'N/A'}")
        print(f"  Final Score: {score}")
        print(f"  Total Attempts: {attempts or 0}")
        if last_attempt:
            print(f"  Last Attempt: {last_attempt.strftime('%Y-%m-%d')} via {last_method}")
        if notes:
            print(f"  Notes: {notes[:80]}{'...' if len(notes) > 80 else ''}")
        print()


def show_ocr_stats():
    """Print the single summary row read from the ``v_ocr_stats`` view.

    The row is unpacked positionally into six counters: poor, unusable,
    archived, never attempted, in progress and max attempts reached.
    """
    with get_db_connection() as conn:
        with conn.cursor() as cur:
            cur.execute("SELECT * FROM v_ocr_stats")
            row = cur.fetchone()

    if not row:
        print("No statistics available.")
        return

    (poor_count, unusable_count, archived_count,
     never_tried, ongoing, at_limit) = row
    rule = '=' * 50

    print(f"\n{rule}")
    print("OCR Workflow Statistics")
    print(rule)

    # Breakdown by quality status
    print("\nQuality Status:")
    print(f"  Poor:       {poor_count:3d} documents")
    print(f"  Unusable:   {unusable_count:3d} documents")
    print(f"  Archived:   {archived_count:3d} documents")

    # Breakdown by position in the re-OCR workflow
    print("\nWorkflow Status:")
    print(f"  Never attempted:    {never_tried:3d} (ready for first re-OCR)")
    print(f"  In progress:        {ongoing:3d} (attempts < 3)")
    print(f"  Max attempts hit:   {at_limit:3d} (consider archiving)")

    print(f"\nTotal needing attention: {poor_count + unusable_count}")
    print()


# =============================================================================
# ARCHIVE/RESTORE FUNCTIONS
# =============================================================================

def archive_document(document_id: str) -> dict:
    """Flag a document as archived, i.e. its OCR is treated as unrecoverable.

    Args:
        document_id: Identifier of the document to archive.

    Returns:
        dict: ``{'status': 'error', 'error': ...}`` when the document is
        missing, already archived, or rated excellent/good/fair; otherwise a
        ``'success'`` dict with ``document_id`` and ``message``.
    """
    healthy_statuses = ('excellent', 'good', 'fair')

    with get_db_connection() as conn:
        with conn.cursor() as cur:
            # Fetch the current workflow state before touching anything
            cur.execute("""
                SELECT title, quality_status, reocr_attempts
                FROM documents WHERE document_id = %s
            """, (document_id,))
            row = cur.fetchone()

            if not row:
                return {'status': 'error', 'error': 'Document not found'}

            title, current_status, attempts = row

            if current_status == 'archived':
                return {'status': 'error', 'error': 'Document is already archived'}
            elif current_status in healthy_statuses:
                return {'status': 'error', 'error': f'Document has {current_status} quality - no need to archive'}

            # Flip the status and leave a dated marker in the notes
            cur.execute("""
                UPDATE documents
                SET quality_status = 'archived',
                    quality_notes = CONCAT(COALESCE(quality_notes, ''), ' [Archived ', NOW()::date, ']'),
                    updated_at = NOW()
                WHERE document_id = %s
            """, (document_id,))
            conn.commit()

    attempt_total = attempts or 0

    print(f"Archived: {title}")
    print(f"  Previous status: {current_status}")
    print(f"  Total attempts: {attempt_total}")
    print("  Document kept for metadata only. Use --restore to try again.")

    outcome = {'status': 'success', 'document_id': document_id}
    outcome['message'] = f'Document archived after {attempt_total} attempts'
    return outcome


def restore_document(document_id: str) -> dict:
    """Put an archived document back into the re-OCR workflow.

    The status is reset to 'poor' and the attempt counter to 0.

    Args:
        document_id: Identifier of the document to restore.

    Returns:
        dict: ``{'status': 'error', 'error': ...}`` when the document is
        missing or not archived; otherwise a ``'success'`` dict with
        ``document_id`` and ``message``.
    """
    with get_db_connection() as conn:
        with conn.cursor() as cur:
            # Only archived documents may be restored, so read the state first
            cur.execute("""
                SELECT title, quality_status, reocr_attempts
                FROM documents WHERE document_id = %s
            """, (document_id,))
            row = cur.fetchone()

            if not row:
                return {'status': 'error', 'error': 'Document not found'}

            title, current_status, _attempts = row

            if not current_status == 'archived':
                return {'status': 'error', 'error': f'Document is not archived (status: {current_status})'}

            # Back to 'poor' with a clean attempt counter and a dated note
            cur.execute("""
                UPDATE documents
                SET quality_status = 'poor',
                    reocr_attempts = 0,
                    quality_notes = CONCAT(COALESCE(quality_notes, ''), ' [Restored ', NOW()::date, ']'),
                    updated_at = NOW()
                WHERE document_id = %s
            """, (document_id,))
            conn.commit()

    for line in (
        f"Restored: {title}",
        "  Status reset to: poor",
        "  Attempts reset to: 0",
        "  Ready for re-OCR processing.",
    ):
        print(line)

    return dict(
        status='success',
        document_id=document_id,
        message='Document restored, attempts reset',
    )


# =============================================================================
# OCR PROCESSING FUNCTIONS
# =============================================================================

def find_source_pdf(document_id: str, source_file: str) -> "str | None":
    """Locate the source PDF of a document below the configured library path.

    Lookup order (first hit wins):
      1. ``<library>/NEW_DOCS/completed/<source_file>``
      2. exact filename anywhere under ``<library>/ORGANIZED``
      3. ``<library>/incoming/<source_file>``
      4. any PDF under ``<library>`` whose name contains the source file's stem

    Args:
        document_id: Unused by the lookup; kept for interface compatibility.
        source_file: Filename recorded for the document.

    Returns:
        The path of the first match, or None when nothing was found.
    """
    config = get_config()
    base_path = config.get('library_path', '/var/www/html/research/Research_development/library')

    # 1. Direct hit in the completed folder
    completed_path = os.path.join(base_path, 'NEW_DOCS', 'completed', source_file)
    if os.path.isfile(completed_path):
        return completed_path

    # 2. Recursive search in ORGANIZED for the exact filename
    organized_path = os.path.join(base_path, 'ORGANIZED')
    if os.path.isdir(organized_path):
        for root, _dirs, files in os.walk(organized_path):
            if source_file in files:
                return os.path.join(root, source_file)

    # 3. Direct hit in incoming. The directory itself used to be tested with
    #    isfile(), which can never succeed, so this location was never checked.
    incoming_path = os.path.join(base_path, 'incoming', source_file)
    if os.path.isfile(incoming_path):
        return incoming_path

    # 4. Search by partial name. Strip only a trailing ".pdf" (any case), and
    #    skip the search for an empty stem, which would match every PDF.
    stem = source_file[:-4] if source_file.lower().endswith('.pdf') else source_file
    if stem:
        for root, _dirs, files in os.walk(base_path):
            for f in files:
                if f.lower().endswith('.pdf') and stem in f:
                    return os.path.join(root, f)

    return None


def pdf_to_images(pdf_path: str, output_dir: str, dpi: int = 300) -> list:
    """Render every page of a PDF and save it as ``page_NNNN.png``.

    Args:
        pdf_path: PDF file to render.
        output_dir: Directory receiving the PNG files.
        dpi: Rendering resolution passed to pdf2image.

    Returns:
        list: Paths of the saved page images, in page order.

    Raises:
        RuntimeError: If pdf2image is not installed.
    """
    if not PDF2IMAGE_AVAILABLE:
        raise RuntimeError("pdf2image not installed. Run: pip install pdf2image")

    from pdf2image import convert_from_path

    print(f"Converting PDF to images at {dpi} DPI...")
    pages = convert_from_path(pdf_path, dpi=dpi, fmt='png', output_folder=output_dir)
    page_total = len(pages)

    saved_paths = []
    for page_no, page in enumerate(pages, start=1):
        # Zero-padded names keep the files sorted in page order
        target = os.path.join(output_dir, f'page_{page_no:04d}.png')
        page.save(target, 'PNG')
        saved_paths.append(target)
        print(f"  Converted page {page_no}/{page_total}")

    return saved_paths


def preprocess_image(image_path: str) -> "Image.Image":
    """Apply preprocessing to improve OCR quality.

    Steps: grayscale conversion, contrast boost (factor 1.5), sharpening and
    a fixed-threshold binarization (pixels above 150 become white, the rest
    black).

    Args:
        image_path: Path of the image to load.

    Returns:
        The processed PIL image.
    """
    # The return annotation is quoted on purpose: without quotes it is
    # evaluated when the module is imported, and if the guarded PIL import at
    # the top of the file failed, ``Image`` is undefined and the whole module
    # would raise NameError instead of degrading gracefully.
    from PIL import Image, ImageEnhance, ImageFilter

    img = Image.open(image_path)

    # Convert to grayscale
    if img.mode != 'L':
        img = img.convert('L')

    # Enhance contrast
    enhancer = ImageEnhance.Contrast(img)
    img = enhancer.enhance(1.5)

    # Sharpen
    img = img.filter(ImageFilter.SHARPEN)

    # Simple binarization (threshold)
    threshold = 150
    img = img.point(lambda p: 255 if p > threshold else 0)

    return img


def _tesseract_ocr_worker(img, config: str) -> str:
    """Run Tesseract on ``img`` with the given config string and return the text.

    Kept as a separate function so it can be handed to the timeout wrapper.
    """
    recognized = pytesseract.image_to_string(img, config=config)
    return recognized


def ocr_tesseract(image_path: str, method: str = 'best', preprocess: bool = True,
                  timeout: int = DEFAULT_PAGE_TIMEOUT) -> tuple:
    """Recognize one page image with Tesseract, bounded by a timeout.

    Args:
        image_path: Path to the image file
        method: 'best' selects ``--oem 1``; any other value selects ``--oem 0``
        preprocess: Run the image through preprocess_image() first
        timeout: Max seconds to wait for OCR (default: 120)

    Returns:
        tuple: (text, error_message) - error_message is None on success

    Raises:
        RuntimeError: If pytesseract is not installed.
    """
    if not TESSERACT_AVAILABLE:
        raise RuntimeError("pytesseract not installed. Run: pip install pytesseract")

    # Load the page, optionally cleaned up first
    page_image = preprocess_image(image_path) if preprocess else Image.open(image_path)

    # --oem 1 = LSTM engine, --oem 0 = legacy engine; --psm 3 = auto page segmentation
    tess_config = '--oem 1 --psm 3' if method == 'best' else '--oem 0 --psm 3'

    # The timeout wrapper already yields the (text, error) pair
    outcome = run_with_timeout(_tesseract_ocr_worker, args=(page_image, tess_config), timeout=timeout)
    recognized, failure = outcome
    return recognized, failure


def _easyocr_worker(reader, image_path: str) -> str:
    """Read ``image_path`` with the given EasyOCR reader and join the texts.

    Element 1 of every detection returned by ``reader.readtext`` is taken as
    the recognized string; the strings are joined with newlines.
    """
    lines = []
    for detection in reader.readtext(image_path):
        lines.append(detection[1])
    return '\n'.join(lines)


def _get_easyocr_reader():
    """Return the shared EasyOCR reader, creating it on first use."""
    reader = getattr(_get_easyocr_reader, '_reader', None)
    if reader is None:
        reader = easyocr.Reader(['en'], gpu=False)
        _get_easyocr_reader._reader = reader
    return reader


def ocr_easyocr(image_path: str, preprocess: bool = True,
                timeout: int = DEFAULT_PAGE_TIMEOUT) -> tuple:
    """OCR using EasyOCR with timeout protection.

    Args:
        image_path: Path to the image file
        preprocess: Apply image preprocessing
        timeout: Max seconds to wait for OCR (default: 120)

    Returns:
        tuple: (text, error_message) - error_message is None on success

    Raises:
        RuntimeError: If easyocr is not installed.
    """
    if not EASYOCR_AVAILABLE:
        raise RuntimeError("easyocr not installed. Run: pip install easyocr")

    # Building a Reader loads the recognition models; doing that once per
    # page dominated the runtime of multi-page documents, so it is cached.
    reader = _get_easyocr_reader()

    actual_path = image_path
    temp_path = None

    try:
        if preprocess:
            # EasyOCR is given a file path, so the preprocessed image is
            # written next to the original and removed again afterwards.
            temp_path = image_path + '.preprocessed.png'
            img = preprocess_image(image_path)
            img.save(temp_path)
            actual_path = temp_path

        text, error = run_with_timeout(_easyocr_worker, args=(reader, actual_path), timeout=timeout)
        return text, error
    finally:
        # Also covers a failed save that left a partial file behind
        if temp_path and os.path.exists(temp_path):
            os.remove(temp_path)


def calculate_text_quality(text: str) -> dict:
    """Calculate quality metrics for a text string.

    The heuristic looks at how many characters are spaces and how long the
    whitespace-separated words are; OCR output with missing spaces shows up
    as low space density and very long "words".

    Args:
        text: Text to evaluate (None or empty is allowed).

    Returns:
        dict with the keys ``score`` (0, 20, 40, 60 or 80), ``space_density``
        (percent), ``avg_word_len``, ``status`` ('good', 'fair', 'poor' or
        'unusable') and ``char_count``. All keys are present for every input.
    """
    if not text or len(text) < 10:
        # Too short to judge. 'char_count' is included here as well because
        # callers read it unconditionally.
        return {
            'score': 0,
            'space_density': 0,
            'avg_word_len': 0,
            'status': 'unusable',
            'char_count': len(text) if text else 0,
        }

    total_chars = len(text)
    total_spaces = text.count(' ')
    space_density = total_spaces / total_chars * 100

    words = text.split()
    avg_word_len = sum(len(w) for w in words) / max(len(words), 1)

    # Determine quality
    if space_density >= 15 and avg_word_len < 15:
        status = 'good'
        score = 80
    elif space_density >= 10 and avg_word_len < 20:
        status = 'fair'
        score = 60
    elif space_density >= 5:
        status = 'poor'
        score = 40
    else:
        status = 'unusable'
        score = 20

    return {
        'score': score,
        'space_density': space_density,
        'avg_word_len': avg_word_len,
        'status': status,
        'char_count': total_chars
    }


def reocr_document(document_id: str, method: str = 'tesseract_best',
                   preprocess: bool = True, max_attempts: int = 3,
                   auto_archive: bool = False, force: bool = False,
                   page_timeout: int = DEFAULT_PAGE_TIMEOUT) -> dict:
    """Re-OCR a document with improved settings.

    Args:
        document_id: The document to re-OCR
        method: OCR method (tesseract_best, tesseract_fast, easyocr)
        preprocess: Apply image preprocessing
        max_attempts: Max attempts before auto-archive
        auto_archive: Auto-archive after max attempts
        force: Force update even if new OCR is worse than existing
        page_timeout: Max seconds per page before timeout (default: 120)

    Returns:
        dict with a 'status' key:
          - 'error':   the document, its source file or the PDF is missing,
                       or the document is archived ('error' holds the reason).
          - 'skipped': the new OCR was not better and ``force`` was not set.
                       The attempt is recorded and the existing text is kept;
                       'archived' tells whether the document was auto-archived.
          - 'success': the document row was updated (text replaced, or the
                       document archived); includes the new quality figures.
    """
    print(f"\n{'='*60}")
    print(f"Re-OCR Document: {document_id}")
    print(f"Method: {method}, Preprocess: {preprocess}, Timeout: {page_timeout}s/page")
    print(f"{'='*60}\n")

    # Get document info and existing text
    with get_db_connection() as conn:
        with conn.cursor() as cur:
            cur.execute("""
                SELECT title, source_file, file_path, quality_status, quality_score, reocr_attempts
                FROM documents
                WHERE document_id = %s
            """, (document_id,))
            doc = cur.fetchone()

            if not doc:
                return {'status': 'error', 'error': 'Document not found'}

            # Get existing chunks to calculate current quality
            cur.execute("""
                SELECT chunk_text FROM chunks
                WHERE document_id = %s
                ORDER BY chunk_sequence
            """, (document_id,))
            existing_chunks = cur.fetchall()

    title, source_file, file_path, quality_status, existing_score, current_attempts = doc
    current_attempts = current_attempts or 0
    existing_score = existing_score or 0

    # Calculate existing text quality (a NULL chunk_text counts as empty)
    existing_text = '\n'.join([c[0] or '' for c in existing_chunks]) if existing_chunks else ''
    existing_quality = calculate_text_quality(existing_text)

    # Check if archived
    if quality_status == 'archived':
        return {'status': 'error', 'error': 'Document is archived. Use --restore first.'}

    print(f"Title: {title}")
    print(f"Source: {source_file}")
    print(f"Current attempts: {current_attempts}")
    print(f"\nExisting OCR Quality:")
    print(f"  Score: {existing_score} ({quality_status})")
    print(f"  Space density: {existing_quality['space_density']:.1f}%")
    print(f"  Avg word length: {existing_quality['avg_word_len']:.1f}")
    # .get(): the metrics for empty/very short text may not carry char_count
    print(f"  Characters: {existing_quality.get('char_count', len(existing_text)):,}")

    if not source_file:
        return {'status': 'error', 'error': 'No source file recorded for this document'}

    # Find the source PDF
    pdf_path = find_source_pdf(document_id, source_file)
    if not pdf_path:
        return {'status': 'error', 'error': f'Source PDF not found: {source_file}'}

    print(f"Found PDF: {pdf_path}")

    # Create temp directory for processing
    temp_dir = tempfile.mkdtemp(prefix='reocr_')

    try:
        # Convert PDF to images
        image_paths = pdf_to_images(pdf_path, temp_dir, dpi=300)
        print(f"Converted {len(image_paths)} pages")

        # OCR each page with timeout protection
        full_text = []          # what gets written out: page markers + text/placeholders
        recognized_pages = []   # OCR output of successful pages only (used for scoring)
        timeout_pages = []
        error_pages = []

        for i, img_path in enumerate(image_paths):
            print(f"OCR page {i+1}/{len(image_paths)}...", end=' ')

            if method.startswith('tesseract'):
                text, error = ocr_tesseract(img_path, 'best' if 'best' in method else 'fast',
                                             preprocess, timeout=page_timeout)
            elif method == 'easyocr':
                text, error = ocr_easyocr(img_path, preprocess, timeout=page_timeout)
            else:
                text, error = ocr_tesseract(img_path, 'best', preprocess, timeout=page_timeout)

            # "is not None": an exception without a message yields an empty
            # error string, which must still be treated as a failure.
            if error is not None:
                if 'Timeout' in error:
                    print(f"TIMEOUT ({page_timeout}s)")
                    timeout_pages.append(i + 1)
                    text = f"[Page {i+1} - OCR timeout after {page_timeout}s]"
                else:
                    print(f"ERROR: {error}")
                    error_pages.append((i + 1, error))
                    text = f"[Page {i+1} - OCR error: {error}]"
            else:
                text = text or ''
                recognized_pages.append(text)
                # Basic quality check
                space_ratio = text.count(' ') / max(len(text), 1)
                print(f"({len(text)} chars, {space_ratio:.1%} spaces)")

            full_text.append(f"--- Page {i+1} ---\n{text}")

        # Report timeout/error summary
        if timeout_pages or error_pages:
            print(f"\n{'!'*40}")
            if timeout_pages:
                print(f"Pages that timed out: {timeout_pages}")
            if error_pages:
                print(f"Pages with errors: {[p[0] for p in error_pages]}")
            print(f"{'!'*40}")

        # Combine all text
        combined_text = '\n\n'.join(full_text)

        # Calculate new OCR quality from recognized text only. Page markers
        # and timeout/error placeholders are ordinary English with plenty of
        # spaces; scoring them let a document whose pages all failed rate as
        # 'good' and replace the existing text.
        scored_text = '\n\n'.join(recognized_pages)
        new_quality = calculate_text_quality(scored_text)
        new_score = new_quality['score']
        new_status = new_quality['status']
        space_density = new_quality['space_density']
        avg_word_len = new_quality['avg_word_len']
        total_chars = new_quality.get('char_count', len(scored_text))

        print(f"\n{'='*40}")
        print(f"New OCR Results:")
        print(f"  Total characters: {total_chars:,}")
        print(f"  Space density: {space_density:.1f}%")
        print(f"  Average word length: {avg_word_len:.1f}")
        print(f"  Quality: {new_status} (score: {new_score})")
        print(f"{'='*40}")

        # Compare with existing quality
        quality_improved = new_score > existing_quality['score']
        quality_same = new_score == existing_quality['score']

        print(f"\nQuality Comparison:")
        print(f"  Existing: {existing_quality['score']} ({existing_quality['status']})")
        print(f"  New OCR:  {new_score} ({new_status})")

        if quality_improved:
            print(f"  Result: IMPROVED (+{new_score - existing_quality['score']} points)")
        elif quality_same:
            print(f"  Result: NO CHANGE")
        else:
            print(f"  Result: WORSE ({new_score - existing_quality['score']} points)")

        # Every run counts as one attempt, whether or not the text is replaced
        new_attempts = current_attempts + 1

        if not quality_improved and not force:
            print(f"\n! New OCR is not better than existing. Use --force to override.")

            # Without this check a document whose re-OCR never improves would
            # stay 'poor' forever and be re-processed by every batch run,
            # even though auto-archiving after max attempts was requested.
            archive_now = bool(auto_archive
                               and quality_status in ('poor', 'unusable')
                               and new_attempts >= max_attempts)

            attempt_note = (f'Re-OCR #{new_attempts} with {method}: no improvement '
                            f'(new: {new_score}, existing: {existing_quality["score"]})')
            if archive_now:
                attempt_note += f' [Auto-archived after {max_attempts} attempts]'
                print(f"\n! Auto-archiving after {new_attempts} failed attempts")

            # Record the attempt but don't replace text
            with get_db_connection() as conn:
                with conn.cursor() as cur:
                    cur.execute("""
                        UPDATE documents
                        SET quality_status = CASE WHEN %s = 'archived' THEN 'archived' ELSE quality_status END,
                            reocr_attempts = %s,
                            reocr_last_attempt = NOW(),
                            reocr_last_method = %s,
                            quality_notes = %s,
                            updated_at = NOW()
                        WHERE document_id = %s
                    """, ('archived' if archive_now else '', new_attempts, method,
                          attempt_note, document_id))
                    conn.commit()

            skip_message = (f'Re-OCR skipped: new quality ({new_score}) not better than existing '
                            f'({existing_quality["score"]}). Use --force to override.')
            if archive_now:
                skip_message += f' Document archived after {new_attempts} failed attempts.'

            return {
                'status': 'skipped',
                'document_id': document_id,
                'reason': 'New OCR not better than existing',
                'existing_score': existing_quality['score'],
                'new_score': new_score,
                'attempts': new_attempts,
                'archived': archive_now,
                'message': skip_message
            }

        # Check if should auto-archive
        should_archive = False
        if auto_archive and new_status in ('poor', 'unusable') and new_attempts >= max_attempts:
            should_archive = True
            new_status = 'archived'
            print(f"\n! Auto-archiving after {new_attempts} failed attempts")

        # Update the markdown file
        if file_path and os.path.exists(file_path) and not should_archive:
            print(f"\nUpdating markdown file: {file_path}")
            with open(file_path, 'w', encoding='utf-8') as f:
                f.write(f"# {title}\n\n")
                f.write(f"*Re-OCR processed on {datetime.now().strftime('%Y-%m-%d %H:%M')} using {method}*\n\n")
                f.write(combined_text)

        # Update database
        print(f"\nUpdating database...")

        with get_db_connection() as conn:
            with conn.cursor() as cur:
                if not should_archive:
                    # Delete old chunks only if not archiving
                    cur.execute("DELETE FROM chunk_concepts WHERE chunk_id IN (SELECT chunk_id FROM chunks WHERE document_id = %s)", (document_id,))
                    cur.execute("DELETE FROM chunks WHERE document_id = %s", (document_id,))

                # Build quality notes
                quality_notes = f'Re-OCR #{new_attempts} with {method}: space density {space_density:.1f}%, avg word {avg_word_len:.1f}'
                if should_archive:
                    quality_notes += f' [Auto-archived after {max_attempts} attempts]'

                # Update document; processing_status goes back to 'pending'
                # unless the document is being archived
                cur.execute("""
                    UPDATE documents
                    SET quality_status = %s,
                        quality_score = %s,
                        quality_notes = %s,
                        reocr_attempts = %s,
                        reocr_last_attempt = NOW(),
                        reocr_last_method = %s,
                        processing_status = CASE WHEN %s = 'archived' THEN processing_status ELSE 'pending' END,
                        updated_at = NOW()
                    WHERE document_id = %s
                """, (new_status, new_score, quality_notes, new_attempts, method, new_status, document_id))

                conn.commit()

        result = {
            'status': 'success',
            'document_id': document_id,
            'new_quality': new_status,
            'new_score': new_score,
            'attempts': new_attempts,
            'archived': should_archive,
            'total_chars': total_chars,
            'space_density': space_density,
            'avg_word_len': avg_word_len,
        }

        if should_archive:
            result['message'] = f'Document archived after {new_attempts} failed attempts.'
        elif new_status in ('good', 'fair'):
            result['message'] = f'Success! Quality improved to {new_status} (score: {new_score}). Run chunk_documents.py to re-chunk.'
        else:
            result['message'] = f'Re-OCR complete but quality still {new_status} (attempt {new_attempts}/{max_attempts}). Try different method or --archive.'

        print(f"\n{result['message']}")
        return result

    finally:
        # Cleanup temp directory
        shutil.rmtree(temp_dir, ignore_errors=True)


def batch_reocr(method: str = 'tesseract_best', preprocess: bool = True,
                max_attempts: int = 3, auto_archive: bool = True,
                force: bool = False, page_timeout: int = DEFAULT_PAGE_TIMEOUT,
                low_priority: bool = True) -> dict:
    """Batch process all poor quality documents.

    Args:
        method: OCR method to use
        preprocess: Apply image preprocessing
        max_attempts: Max re-OCR attempts per document
        auto_archive: Auto-archive after max attempts
        force: Force update even if worse quality
        page_timeout: Max seconds per page (default: 120)
        low_priority: Run with reduced CPU priority (default: True)

    Returns:
        dict: ``{'status': 'success', ...}`` with the counters 'processed',
        'improved', 'still_poor', 'skipped', 'archived', 'errors' and a
        'details' list holding one entry per successfully processed document.
        The same keys are returned when there was nothing to process.
    """
    print(f"\n{'='*60}")
    print("Batch Re-OCR Processing")
    print(f"Method: {method}, Max Attempts: {max_attempts}")
    print(f"Auto-archive after max attempts: {auto_archive}")
    print(f"Force update even if worse: {force}")
    print(f"Page timeout: {page_timeout}s")

    # Set low priority for batch processing to reduce system impact
    if low_priority:
        if set_low_priority():
            print(f"Process priority: LOW (nice +10)")
        else:
            print(f"Process priority: NORMAL (could not set low priority)")
    else:
        print(f"Process priority: NORMAL")

    print(f"{'='*60}\n")

    # Get documents to process
    with get_db_connection() as conn:
        with conn.cursor() as cur:
            cur.execute("""
                SELECT document_id, title, reocr_attempts
                FROM documents
                WHERE quality_status IN ('poor', 'unusable')
                ORDER BY reocr_attempts ASC, quality_score ASC
            """)
            docs = cur.fetchall()

    results = {
        'processed': 0,
        'improved': 0,
        'still_poor': 0,
        'skipped': 0,
        'archived': 0,
        'errors': 0,
        'details': []
    }

    if not docs:
        print("No poor quality documents to process.")
        # Same shape as the normal result so callers can read any counter
        return {'status': 'success', **results}

    total_docs = len(docs)
    print(f"Found {total_docs} documents to process\n")

    # The progress index follows the position in the list, so it also
    # advances for documents that end up skipped or failing.
    for position, (doc_id, title, attempts) in enumerate(docs, start=1):
        attempts = attempts or 0

        # Skip if already at max attempts (unless auto-archive will archive it)
        if attempts >= max_attempts and not auto_archive:
            print(f"Skipping {doc_id}: already at max attempts ({attempts})")
            continue

        # A NULL title must not abort the run (slicing None raises)
        display_title = title or '(untitled)'
        print(f"\n[{position}/{total_docs}] Processing: {display_title[:50]}...")

        try:
            result = reocr_document(doc_id, method, preprocess, max_attempts, auto_archive, force, page_timeout)

            if result['status'] == 'success':
                results['processed'] += 1
                if result.get('archived'):
                    results['archived'] += 1
                elif result['new_quality'] in ('good', 'fair'):
                    results['improved'] += 1
                else:
                    results['still_poor'] += 1

                results['details'].append({
                    'document_id': doc_id,
                    'title': title,
                    'result': result['new_quality'],
                    'score': result['new_score']
                })
            elif result['status'] == 'skipped':
                results['skipped'] += 1
                # Honor an 'archived' flag on any result shape
                if result.get('archived'):
                    results['archived'] += 1
                print(f"  Skipped: {result.get('reason', 'no improvement')}")
            else:
                results['errors'] += 1
                print(f"  Error: {result.get('error')}")

        except Exception as e:
            # Batch boundary: one failing document must not stop the others
            results['errors'] += 1
            print(f"  Exception: {str(e)}")

    # Summary
    print(f"\n{'='*60}")
    print("Batch Processing Complete")
    print(f"{'='*60}")
    print(f"  Processed:   {results['processed']}")
    print(f"  Improved:    {results['improved']} (good/fair quality)")
    print(f"  Still poor:  {results['still_poor']}")
    print(f"  Skipped:     {results['skipped']} (new OCR not better)")
    print(f"  Archived:    {results['archived']}")
    print(f"  Errors:      {results['errors']}")
    print()

    if results['improved'] > 0:
        print("Run 'python chunk_documents.py' to re-chunk improved documents.")

    return {'status': 'success', **results}


# =============================================================================
# MAIN
# =============================================================================

def _positive_int(value):
    """argparse ``type=`` callable that accepts only integers >= 1.

    Args:
        value: Raw command-line string supplied by argparse.

    Returns:
        The parsed integer.

    Raises:
        argparse.ArgumentTypeError: If ``value`` is not an integer or is
            less than 1. argparse turns this into a usage error.
    """
    try:
        number = int(value)
    except ValueError:
        raise argparse.ArgumentTypeError(f"invalid int value: {value!r}") from None
    if number < 1:
        raise argparse.ArgumentTypeError(f"must be a positive integer, got {number}")
    return number


def _require_ocr_dependencies(method):
    """Exit with status 1 if a library needed for ``method`` failed to import.

    Shared by the batch and single-document paths so both report missing
    dependencies identically.

    Args:
        method: OCR method name chosen on the command line
            ('tesseract_best', 'tesseract_fast' or 'easyocr').
    """
    if not PDF2IMAGE_AVAILABLE:
        print("ERROR: pdf2image not installed. Run: pip install pdf2image")
        print("Also requires poppler: apt install poppler-utils")
        sys.exit(1)

    if method.startswith('tesseract') and not TESSERACT_AVAILABLE:
        print("ERROR: pytesseract not installed. Run: pip install pytesseract")
        print("Also requires tesseract: apt install tesseract-ocr")
        sys.exit(1)

    if method == 'easyocr' and not EASYOCR_AVAILABLE:
        print("ERROR: easyocr not installed. Run: pip install easyocr")
        sys.exit(1)


def main():
    """Command-line entry point: parse arguments and dispatch one action.

    Exactly one action runs per invocation. The first matching mode wins,
    checked in this order: --list-poor, --list-archived, --stats, --batch,
    --archive, --restore, and finally single-document re-OCR.

    Exit behaviour:
        * Listing/statistics commands return normally.
        * --batch, --archive and --restore exit 0 when the result status is
          'success', otherwise 1.
        * Single-document mode exits 0 for 'skipped', 1 for any other
          non-success status, and returns normally on success.
        * Missing OCR dependencies or a missing document_id exit with 1.
        * Invalid argument values (non-positive --timeout / --max-attempts,
          or --archive combined with --restore) are rejected by argparse.
    """
    parser = argparse.ArgumentParser(
        description='Re-OCR document with improved methods',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  %(prog)s --list-poor                    # List poor quality documents
  %(prog)s --list-archived                # List archived documents
  %(prog)s --stats                        # Show workflow statistics
  %(prog)s DOC_ID --method tesseract_best # Re-OCR single document
  %(prog)s --batch                        # Batch process all poor docs
  %(prog)s --batch --timeout 60           # Batch with 60s timeout per page
  %(prog)s --batch --no-low-priority      # Batch at normal CPU priority
  %(prog)s DOC_ID --archive               # Archive (give up on) document
  %(prog)s DOC_ID --restore               # Restore archived document

OCR Workflow:
  1. Documents with poor OCR are flagged during ingest
  2. Use --batch to attempt re-OCR on all poor documents
  3. After max attempts (default 3), documents are auto-archived
  4. Archived = kept for metadata only, excluded from search
  5. Use --restore to try again with new methods in the future

Timeout & Priority:
  - Default timeout: 120 seconds per page (prevents hangs on complex pages)
  - Batch processing runs at low CPU priority (nice +10) by default
  - Pages that timeout are marked with [OCR timeout] placeholder text
  - Use --timeout to adjust (lower = faster failure, higher = more patience)
        """
    )

    # List/stats options
    parser.add_argument('document_id', nargs='?', help='Document ID to re-process')
    parser.add_argument('--list-poor', action='store_true',
                        help='List all poor quality documents')
    parser.add_argument('--list-archived', action='store_true',
                        help='List all archived documents')
    parser.add_argument('--stats', action='store_true',
                        help='Show OCR workflow statistics')

    # Processing options
    parser.add_argument('--method', choices=['tesseract_best', 'tesseract_fast', 'easyocr'],
                        default='tesseract_best', help='OCR method to use')
    # --preprocess is already the default; the flag exists as the explicit
    # counterpart of --no-preprocess, which writes to the same destination.
    parser.add_argument('--preprocess', action='store_true', default=True,
                        help='Apply image preprocessing (default)')
    parser.add_argument('--no-preprocess', action='store_false', dest='preprocess',
                        help='Skip image preprocessing')

    # Batch processing
    parser.add_argument('--batch', action='store_true',
                        help='Batch process all poor quality documents')
    parser.add_argument('--max-attempts', type=_positive_int, default=3,
                        help='Max re-OCR attempts before auto-archive (default: 3)')
    parser.add_argument('--no-auto-archive', action='store_true',
                        help='Disable auto-archive after max attempts')

    # Archive/restore are opposites; let argparse reject using both at once
    # instead of silently archiving and ignoring --restore.
    archive_group = parser.add_mutually_exclusive_group()
    archive_group.add_argument('--archive', action='store_true',
                               help='Archive document (mark as unrecoverable)')
    archive_group.add_argument('--restore', action='store_true',
                               help='Restore archived document for retry')

    # Quality comparison
    parser.add_argument('--force', action='store_true',
                        help='Force update even if new OCR is worse than existing')

    # Timeout and priority options
    parser.add_argument('--timeout', type=_positive_int, default=DEFAULT_PAGE_TIMEOUT,
                        help=f'Max seconds per page before timeout (default: {DEFAULT_PAGE_TIMEOUT})')
    parser.add_argument('--no-low-priority', action='store_true',
                        help='Disable low CPU priority for batch processing')

    args = parser.parse_args()

    # List/stats commands
    if args.list_poor:
        list_poor_quality_documents()
        return

    if args.list_archived:
        list_archived_documents()
        return

    if args.stats:
        show_ocr_stats()
        return

    # Batch processing
    if args.batch:
        _require_ocr_dependencies(args.method)

        result = batch_reocr(
            method=args.method,
            preprocess=args.preprocess,
            max_attempts=args.max_attempts,
            auto_archive=not args.no_auto_archive,
            force=args.force,
            page_timeout=args.timeout,
            low_priority=not args.no_low_priority
        )
        sys.exit(0 if result['status'] == 'success' else 1)

    # Archive/restore commands require document_id
    if args.archive:
        if not args.document_id:
            print("ERROR: document_id required for --archive")
            sys.exit(1)
        result = archive_document(args.document_id)
        sys.exit(0 if result['status'] == 'success' else 1)

    if args.restore:
        if not args.document_id:
            print("ERROR: document_id required for --restore")
            sys.exit(1)
        result = restore_document(args.document_id)
        sys.exit(0 if result['status'] == 'success' else 1)

    # Single document processing
    if not args.document_id:
        parser.print_help()
        print("\nError: document_id is required (or use --list-poor, --batch, --stats)")
        sys.exit(1)

    _require_ocr_dependencies(args.method)

    result = reocr_document(
        args.document_id,
        args.method,
        args.preprocess,
        args.max_attempts,
        auto_archive=not args.no_auto_archive,
        force=args.force,
        page_timeout=args.timeout
    )

    if result['status'] == 'skipped':
        # Not an error - just means new OCR wasn't better
        print(f"\n{result.get('message', 'Skipped: new OCR not better than existing')}")
        sys.exit(0)
    elif result['status'] != 'success':
        print(f"\nERROR: {result.get('error', 'Unknown error')}")
        sys.exit(1)

# Run the CLI only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
