#!/usr/bin/env python3
"""
Batch Search Tool - Run multiple search queries from a file.

Process multiple search queries in batch and aggregate results.

Usage:
    python batch_search.py queries.txt --output results/
    python batch_search.py queries.txt --format markdown --output report.md
    python batch_search.py queries.txt --combine --output combined.json
    echo -e "consciousness\\nfreedom\\nthinking" | python batch_search.py --stdin

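Query File Format (queries.txt):
    # Comments start with #
    consciousness
    philosophy of freedom
    "exact phrase search"
    alchemy AND jung

    One query per line; blank lines and lines starting with # are skipped.
    Each line is passed as-is to PostgreSQL's plainto_tsquery, which treats
    its input as plain text: quotes and boolean keywords such as AND are not
    interpreted as phrase or boolean operators.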
"""

import argparse
import csv
import json
import sys
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Any, Optional, TextIO

# Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent))

from db_utils import execute_query


def search_documents(
    query: str,
    limit: int = 50,
    category: Optional[str] = None
) -> List[Dict[str, Any]]:
    """
    Run a ranked full-text search over chunks and shape the rows for export.

    Args:
        query: Search text; bound twice, once for the rank expression and
            once for the match predicate
        limit: Value bound to the SQL LIMIT clause
        category: When truthy, only documents whose primary_category equals
            this value are matched

    Returns:
        One dictionary per returned row, in the order delivered by the
        database (score descending), with the keys chunk_id, document_id,
        title, author, year, category, snippet and score
    """
    sql = """
        SELECT
            c.chunk_id,
            c.document_id,
            d.title,
            d.author,
            d.publication_year,
            d.primary_category,
            c.chunk_text,
            ts_rank(c.chunk_text_tsv, plainto_tsquery('english', %s)) as score
        FROM chunks c
        JOIN documents d ON c.document_id = d.document_id
        WHERE c.chunk_text_tsv @@ plainto_tsquery('english', %s)
          AND d.quality_status != 'archived'
    """
    bind_values: List[Any] = [query, query]

    # The optional category filter must come before ORDER BY / LIMIT.
    if category:
        sql += " AND d.primary_category = %s"
        bind_values.append(category)

    sql += " ORDER BY score DESC LIMIT %s"
    bind_values.append(limit)

    rows = execute_query(sql, tuple(bind_values), fetch='all')

    def _preview(text):
        # First 250 characters, with an ellipsis only when text was cut.
        return text[:250] + ('...' if len(text) > 250 else '')

    return [
        {
            'chunk_id': row['chunk_id'],
            'document_id': row['document_id'],
            'title': row['title'] or 'Untitled',
            'author': row['author'] or 'Unknown',
            'year': row['publication_year'],
            'category': row['primary_category'] or 'Uncategorized',
            'snippet': _preview(row['chunk_text']),
            # A missing or zero rank is reported as 0.0.
            'score': float(row['score']) if row['score'] else 0.0
        }
        for row in rows
    ]


def parse_queries(source: TextIO) -> List[str]:
    """Collect the queries from an iterable of lines (a file or stdin).

    Each line is stripped of surrounding whitespace; lines that are then
    empty or begin with '#' are dropped. The remaining lines are returned
    in their original order.
    """
    trimmed = (raw.strip() for raw in source)
    return [text for text in trimmed if text and not text.startswith('#')]


def run_batch_search(
    queries: List[str],
    limit_per_query: int = 50,
    category: Optional[str] = None,
    verbose: bool = False
) -> Dict[str, Any]:
    """
    Run multiple search queries and collect results.

    Results are keyed by query text, so a query that appears more than once
    in ``queries`` is executed only once (first occurrence wins the position).
    This keeps the statistics consistent with the ``results`` mapping; counting
    every repetition would report more queries and results than are returned.

    Args:
        queries: List of search queries
        limit_per_query: Max results per query
        category: Filter all queries by category
        verbose: Print progress

    Returns:
        Dictionary with two keys:
          - 'results': maps each distinct query to a dict with 'query',
            'result_count' and 'results'
          - 'stats': integer counters 'queries_run', 'total_results',
            'unique_documents', 'queries_with_results' and 'queries_empty'
    """
    # dict.fromkeys drops repeats while preserving first-seen order.
    distinct_queries = list(dict.fromkeys(queries))
    total = len(distinct_queries)

    all_results = {}
    seen_documents = set()
    total_results = 0
    queries_with_results = 0

    for i, query in enumerate(distinct_queries, 1):
        if verbose:
            print(f"[{i}/{total}] Searching: {query}")

        results = search_documents(query, limit=limit_per_query, category=category)

        all_results[query] = {
            'query': query,
            'result_count': len(results),
            'results': results
        }

        total_results += len(results)
        if results:
            queries_with_results += 1
            seen_documents.update(r['document_id'] for r in results)

    # Only the count of distinct documents is reported, which keeps the
    # stats JSON-serializable.
    stats = {
        'queries_run': total,
        'total_results': total_results,
        'unique_documents': len(seen_documents),
        'queries_with_results': queries_with_results,
        'queries_empty': total - queries_with_results
    }
    return {'results': all_results, 'stats': stats}


def export_individual_files(batch_results: Dict, output_dir: Path, format: str):
    """
    Export each query's results to its own file inside ``output_dir``.

    File names are derived from the query text: characters other than
    alphanumerics, space, '-' and '_' become '_', the name is cut to 50
    characters, and spaces become '_'. When two queries map to the same name
    (for example ``a b`` and ``a_b``), later ones get a numeric suffix
    (``_2``, ``_3``, ...) so that no query's file is overwritten.

    Args:
        batch_results: Mapping with a 'results' key, as produced by
            run_batch_search (query -> {'query', 'result_count', 'results'})
        output_dir: Directory to write into; created if missing
        format: One of 'json', 'csv' or 'markdown'

    Raises:
        ValueError: If ``format`` is not one of the supported values
    """
    extensions = {'json': 'json', 'csv': 'csv', 'markdown': 'md'}
    if format not in extensions:
        # Fail loudly instead of writing nothing and reporting success.
        raise ValueError(
            f"Unsupported format: {format!r} (expected one of {sorted(extensions)})"
        )

    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Names already used in this run; compared case-insensitively because
    # some filesystems do not distinguish case.
    taken = set()

    for query, data in batch_results['results'].items():
        # Create safe filename
        safe_name = ''.join(c if c.isalnum() or c in ' -_' else '_' for c in query)[:50]
        safe_name = safe_name.strip().replace(' ', '_') or 'query'

        stem = safe_name
        suffix = 2
        while stem.casefold() in taken:
            stem = f"{safe_name}_{suffix}"
            suffix += 1
        taken.add(stem.casefold())

        filepath = output_dir / f"{stem}.{extensions[format]}"

        if format == 'json':
            with open(filepath, 'w', encoding='utf-8') as f:
                json.dump(data, f, indent=2, ensure_ascii=False)

        elif format == 'csv':
            with open(filepath, 'w', newline='', encoding='utf-8') as f:
                writer = csv.writer(f)
                writer.writerow(['Title', 'Author', 'Year', 'Category', 'Score', 'Snippet'])
                for r in data['results']:
                    writer.writerow([
                        r['title'], r['author'], r['year'],
                        r['category'], r['score'], r['snippet'][:200]
                    ])

        else:  # markdown
            with open(filepath, 'w', encoding='utf-8') as f:
                f.write(f"# Search Results: {query}\n\n")
                f.write(f"**Results:** {data['result_count']}\n\n")
                f.write("---\n\n")
                for i, r in enumerate(data['results'], 1):
                    # The 'year' key may be present with a None value, so a
                    # .get() default alone would print "None".
                    year = r.get('year')
                    if year is None:
                        year = 'N/A'
                    f.write(f"## {i}. {r['title']}\n\n")
                    f.write(f"**Author:** {r['author']} | **Year:** {year}\n\n")
                    f.write(f"> {r['snippet']}\n\n")

    print(f"Exported {len(batch_results['results'])} result files to {output_dir}/")


def export_combined(batch_results: Dict, output_path: Path, format: str):
    """
    Export all results to a single file.

    Formats:
      - 'json': an object with 'exported_at' (local ISO timestamp), 'stats'
        and 'queries' (the full per-query results)
      - 'csv': one row per result, prefixed with the query; snippets are cut
        to 200 characters
      - 'markdown': a report with a summary section and, per query, at most
        the first 10 results with snippets cut to 150 characters

    Args:
        batch_results: Mapping with 'results' and 'stats' keys, as produced
            by run_batch_search
        output_path: File to write; missing parent directories are created
        format: One of 'json', 'csv' or 'markdown'

    Raises:
        ValueError: If ``format`` is not one of the supported values
    """
    if format not in ('json', 'csv', 'markdown'):
        # Fail loudly instead of writing nothing and reporting success.
        raise ValueError(
            f"Unsupported format: {format!r} (expected 'json', 'csv' or 'markdown')"
        )

    output_path = Path(output_path)
    # Create the parent directory so the combined export accepts the same
    # not-yet-existing locations as the per-query export.
    output_path.parent.mkdir(parents=True, exist_ok=True)

    if format == 'json':
        with open(output_path, 'w', encoding='utf-8') as f:
            output = {
                'exported_at': datetime.now().isoformat(),
                'stats': batch_results['stats'],
                'queries': batch_results['results']
            }
            json.dump(output, f, indent=2, ensure_ascii=False)

    elif format == 'csv':
        with open(output_path, 'w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow(['Query', 'Title', 'Author', 'Year', 'Category', 'Score', 'Snippet', 'Document_ID'])
            for query, data in batch_results['results'].items():
                for r in data['results']:
                    writer.writerow([
                        query, r['title'], r['author'], r['year'],
                        r['category'], r['score'], r['snippet'][:200], r['document_id']
                    ])

    else:  # markdown
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write("# Batch Search Results\n\n")
            f.write(f"**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M')}\n\n")

            stats = batch_results['stats']
            f.write("## Summary\n\n")
            f.write(f"- **Queries Run:** {stats['queries_run']}\n")
            f.write(f"- **Total Results:** {stats['total_results']}\n")
            f.write(f"- **Unique Documents:** {stats['unique_documents']}\n")
            f.write(f"- **Queries with Results:** {stats['queries_with_results']}\n")
            f.write(f"- **Empty Queries:** {stats['queries_empty']}\n\n")
            f.write("---\n\n")

            for query, data in batch_results['results'].items():
                f.write(f"## Query: `{query}`\n\n")
                f.write(f"**Results:** {data['result_count']}\n\n")

                if data['results']:
                    for i, r in enumerate(data['results'][:10], 1):  # Top 10 per query
                        # The 'year' key may be present with a None value, so
                        # a .get() default alone would print "None".
                        year = r.get('year')
                        if year is None:
                            year = 'N/A'
                        # Mark truncation only when text was actually cut.
                        snippet = r['snippet']
                        excerpt = snippet[:150] + ('...' if len(snippet) > 150 else '')
                        f.write(f"{i}. **{r['title']}** ({r['author']}, {year})\n")
                        f.write(f"   > {excerpt}\n\n")
                else:
                    f.write("*No results found*\n\n")

                f.write("---\n\n")

    print(f"Exported combined results to {output_path}")


def export_summary(batch_results: Dict, output_path: Optional[Path] = None):
    """
    Print or save summary statistics.

    The summary lists the aggregate counters from ``batch_results['stats']``
    followed by one line per query with its result count.

    Args:
        batch_results: Mapping with 'results' and 'stats' keys, as produced
            by run_batch_search
        output_path: When given, the summary is written to this file as
            UTF-8; otherwise it is printed to stdout
    """
    stats = batch_results['stats']

    header = f"""
{'='*60}
BATCH SEARCH SUMMARY
{'='*60}

Queries Run:          {stats['queries_run']}
Queries with Results: {stats['queries_with_results']}
Empty Queries:        {stats['queries_empty']}

Total Results:        {stats['total_results']}
Unique Documents:     {stats['unique_documents']}

{'='*60}

RESULTS BY QUERY:
"""

    per_query = ''.join(
        f"\n  [{data['result_count']:3d}] {query}"
        for query, data in batch_results['results'].items()
    )
    summary = header + per_query

    if output_path:
        # Explicit UTF-8, matching the other exporters, so queries with
        # non-ASCII characters do not depend on the platform's locale.
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(summary)
    else:
        print(summary)


def main():
    """
    Command-line entry point: load queries, run them, and export results.

    Queries come from a file or from stdin (exactly one source is required).
    After the batch run, results are exported to --output unless
    --summary-only is given or no --output was provided; the summary is
    always printed to stdout.

    Exits with status 1 when the query file is missing or contains no
    queries, and with argparse's usage error (status 2) when --limit is not
    a positive integer.
    """
    parser = argparse.ArgumentParser(
        description='Run batch search queries from file',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Basic batch search from file
  python batch_search.py queries.txt --output results/

  # Combined output as markdown report
  python batch_search.py queries.txt --combine --format markdown --output report.md

  # Read queries from stdin
  echo -e "consciousness\\nfreedom" | python batch_search.py --stdin --output results/

  # Filter by category
  python batch_search.py queries.txt --category Philosophy --output philosophy_results/

  # Just show summary
  python batch_search.py queries.txt --summary-only

Query File Format:
  # This is a comment
  consciousness
  philosophy of freedom
  "exact phrase"
  alchemy AND transmutation
        """
    )

    # Input source
    input_group = parser.add_mutually_exclusive_group(required=True)
    input_group.add_argument('queries_file', nargs='?', type=Path, help='File containing queries (one per line)')
    input_group.add_argument('--stdin', action='store_true', help='Read queries from stdin')

    # Output options
    parser.add_argument('--output', '-o', type=Path, help='Output path (file or directory)')
    parser.add_argument('--format', '-f', choices=['json', 'csv', 'markdown'], default='json',
                        help='Output format (default: json)')
    parser.add_argument('--combine', '-c', action='store_true',
                        help='Combine all results into single file (default: separate files)')

    # Search options
    parser.add_argument('--limit', '-l', type=int, default=50, help='Max results per query (default: 50)')
    parser.add_argument('--category', help='Filter all queries by category')

    # Other options
    parser.add_argument('--verbose', '-v', action='store_true', help='Show progress')
    parser.add_argument('--summary-only', action='store_true', help='Only show summary, no export')

    args = parser.parse_args()

    # The limit is bound straight into the SQL LIMIT clause, so reject
    # values that cannot return any rows before touching the database.
    if args.limit < 1:
        parser.error('--limit must be a positive integer')

    # Parse queries
    if args.stdin:
        queries = parse_queries(sys.stdin)
    else:
        # is_file() also rejects directories, which open() could not read.
        if not args.queries_file.is_file():
            print(f"File not found: {args.queries_file}", file=sys.stderr)
            sys.exit(1)
        # utf-8-sig reads plain UTF-8 and also strips a leading BOM, which
        # would otherwise become part of the first query or hide a leading
        # '#' comment marker.
        with open(args.queries_file, encoding='utf-8-sig') as f:
            queries = parse_queries(f)

    if not queries:
        print("No queries found", file=sys.stderr)
        sys.exit(1)

    print(f"Loaded {len(queries)} queries")

    # Run batch search
    batch_results = run_batch_search(
        queries,
        limit_per_query=args.limit,
        category=args.category,
        verbose=args.verbose
    )

    # Export results (skipped for --summary-only or when no --output is given)
    if args.output and not args.summary_only:
        if args.combine:
            export_combined(batch_results, args.output, args.format)
        else:
            export_individual_files(batch_results, args.output, args.format)

    # The summary is printed in every mode.
    export_summary(batch_results)


# Run the CLI only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
