import os
import sys
import json
from collections import Counter
from utils.chroma_db import ChromaDBManager

def get_embedding_model_info(collection_name):
    """
    Try to determine which embedding model was used for a collection.

    One stored embedding is read back from the collection and the model is
    guessed from its dimensionality. This is only a heuristic: different
    models can share the same dimension.

    Args:
        collection_name: Name of the ChromaDB collection to inspect.

    Returns:
        A human-readable description string:
        - a known model label for 384 or 768 dimensions,
        - "Unknown model (N dimensions)" for any other dimension,
        - "Unknown embedding model" when the collection holds no embeddings,
        - "Could not determine embedding model" when anything goes wrong
          while talking to the database (best-effort; never raises).
    """
    # Dimension -> model label used for the guess.
    known_models = {
        384: "all-MiniLM-L6-v2 (384 dimensions)",
        768: "Gemini (768 dimensions)",
    }

    try:
        db_manager = ChromaDBManager()
        collection = db_manager.client.get_collection(
            name=collection_name,
            embedding_function=db_manager.embedding_function
        )

        # Read a stored vector directly instead of running a similarity
        # query: `query()` has no `include_embeddings` keyword (embeddings are
        # requested through `include=[...]`), and a plain `get()` does not
        # need to embed any query text with the currently configured model.
        results = collection.get(limit=1, include=["embeddings"])
        embeddings = results.get("embeddings") if results else None

        # Compare against None / length explicitly: the embeddings may come
        # back as an array type whose truth value is ambiguous.
        if embeddings is None or len(embeddings) == 0:
            return "Unknown embedding model"

        embedding_dim = len(embeddings[0])
        return known_models.get(
            embedding_dim,
            f"Unknown model ({embedding_dim} dimensions)",
        )
    except Exception:
        # Deliberately best-effort: this is diagnostic output only.
        return "Could not determine embedding model"

def analyze_document_sources(collection_name):
    """
    Analyze document sources in a collection and return statistics.

    All metadata records of the collection are fetched and the values of the
    "source", "content_type" and "directory" keys are tallied. Records that
    have no metadata at all are counted in the total but otherwise skipped,
    so a single bare document does not invalidate the whole analysis.

    Args:
        collection_name: Name of the ChromaDB collection to analyze.

    Returns:
        On success, a dict with:
        - "total_documents": number of metadata records returned,
        - "unique_sources": number of distinct "source" values,
        - "content_types": {content_type: count} for every content type,
        - "top_sources": {source: count} for the 5 most common sources,
        - "top_directories": {directory: count} for the 5 most common
          directories.
        On failure, a dict with a single "error" key holding the message.
    """
    try:
        db_manager = ChromaDBManager()
        collection = db_manager.client.get_collection(
            name=collection_name,
            embedding_function=db_manager.embedding_function
        )

        # Only the metadata is needed; skip documents and embeddings.
        results = collection.get(include=["metadatas"])

        if not results or "metadatas" not in results or not results["metadatas"]:
            return {"error": "No metadata found"}

        metadatas = results["metadatas"]

        # One counter per metadata key of interest.
        tallies = {
            "source": Counter(),
            "content_type": Counter(),
            "directory": Counter(),
        }

        for metadata in metadatas:
            # Entries may be None/empty for documents stored without metadata.
            if not metadata:
                continue
            for key, tally in tallies.items():
                if key in metadata:
                    tally[metadata[key]] += 1

        return {
            "total_documents": len(metadatas),
            "unique_sources": len(tallies["source"]),
            "content_types": dict(tallies["content_type"]),
            "top_sources": dict(tallies["source"].most_common(5)),
            "top_directories": dict(tallies["directory"].most_common(5))
        }
    except Exception as e:
        # Best-effort diagnostic: report the failure instead of raising.
        return {"error": str(e)}

def check_collection_info(collection_name="mangoit_docs"):
    """
    Print a report for one collection and return its info record.

    The report always states whether the collection exists. For an existing
    collection it also prints the document count, the guessed embedding
    model and, when the source analysis succeeds, the number of unique
    sources plus per-content-type and per-directory breakdowns. For a missing
    collection the error stored in the info record (or a generic message) is
    printed instead.

    Args:
        collection_name: Name of the collection to report on.

    Returns:
        The value returned by ``ChromaDBManager.get_collection_info`` for
        this collection, unchanged.
    """
    info = ChromaDBManager().get_collection_info(collection_name)

    print(f"\n--- Collection Information for '{collection_name}' ---")
    print(f"Exists: {info['exists']}")

    # Missing collection: report why and stop.
    if not info['exists']:
        print(f"Error: {info.get('error', 'Collection does not exist')}")
        return info

    print(f"Document Count: {info['count']}")
    print(f"Embedding Model: {get_embedding_model_info(collection_name)}")

    # The detailed breakdown is only shown when the analysis succeeded.
    analysis = analyze_document_sources(collection_name)
    if "error" in analysis:
        return info

    print(f"Unique Sources: {analysis['unique_sources']}")

    sections = (
        ("\nContent Types:", 'content_types'),
        ("\nTop Directories:", 'top_directories'),
    )
    for heading, key in sections:
        breakdown = analysis.get(key)
        if not breakdown:
            continue
        print(heading)
        for label, count in breakdown.items():
            print(f"  - {label}: {count} documents")

    return info

_DEFAULT_SAMPLE_QUERY = "python frameworks"


def _query_from_argv(argv):
    """Return the search query found in *argv*, or None if there is none.

    Recognizes ``--query VALUE``, ``-q VALUE`` and ``--query=VALUE`` anywhere
    on the command line, and otherwise a leading positional argument (the
    historical calling convention). Option flags are never mistaken for the
    query itself.
    """
    cli_args = list(argv[1:])
    for position, arg in enumerate(cli_args):
        if arg in ("--query", "-q"):
            following = cli_args[position + 1:position + 2]
            return following[0] if following else None
        if arg.startswith("--query="):
            return arg.split("=", 1)[1]
    if cli_args and not cli_args[0].startswith("-"):
        return cli_args[0]
    return None


def get_sample_documents(collection_name="mangoit_docs", n_samples=3, query=None):
    """
    Retrieve and display sample documents from the collection.

    Args:
        collection_name: Name of the collection to search.
        n_samples: Maximum number of documents to display.
        query: Search text. When omitted, the query is taken from the command
            line (``--query``/``-q`` or a leading positional argument) and
            falls back to a built-in default query.

    Returns:
        None. Results, or an error message if retrieval fails, are printed.
    """
    if query is None:
        # Never use a raw option flag such as "--collection" as search text.
        query = _query_from_argv(sys.argv) or _DEFAULT_SAMPLE_QUERY

    db_manager = ChromaDBManager()

    try:
        collection = db_manager.client.get_collection(
            name=collection_name,
            embedding_function=db_manager.embedding_function
        )

        # Nothing to sample from an empty collection.
        if collection.count() == 0:
            print(f"Collection '{collection_name}' exists but is empty.")
            return

        print(f"\n--- Sample Documents from '{collection_name}' (Query: '{query}') ---")
        results = db_manager.query_collection(query, collection_name, n_results=n_samples)

        # Results are nested per query text; index 0 is our single query.
        hits = zip(
            results['documents'][0],
            results['metadatas'][0],
            results['distances'][0]
        )
        for number, (doc, metadata, distance) in enumerate(hits, start=1):
            # Tolerate documents stored without metadata or without text.
            metadata = metadata or {}
            doc = doc or ""

            print(f"\nDocument {number}:")
            print(f"Source: {metadata.get('source', 'unknown')}")
            print(f"Chunk: {metadata.get('chunk', 'unknown')}")
            # NOTE(review): 1 - distance is only a true similarity for some
            # distance metrics (e.g. cosine) - confirm the collection's metric.
            print(f"Relevance Score: {1 - distance:.4f}")

            # Build the preview first so the label is printed for short
            # documents as well as truncated ones.
            preview = f"{doc[:150]}..." if len(doc) > 150 else doc
            print(f"Content Preview: {preview}")

    except Exception as e:
        print(f"Error retrieving documents: {str(e)}")

def list_all_collections():
    """
    Print a numbered list of every collection in the ChromaDB.

    Each line shows the collection's name and its document count. A notice is
    printed when there are no collections, and any failure while listing is
    reported as a printed error message rather than raised.
    """
    manager = ChromaDBManager()

    try:
        found = manager.client.list_collections()
        print("\n--- All Collections in ChromaDB ---")

        if found:
            # Number the entries starting from 1 for display.
            for position, entry in enumerate(found, start=1):
                print(f"{position}. {entry.name} (Documents: {entry.count()})")
        else:
            print("No collections found.")

    except Exception as exc:
        print(f"Error listing collections: {exc!s}")

def show_help():
    """
    Print the command-line usage summary for this script.

    The script's own file name is substituted into the usage line and into
    each example invocation.
    """
    script = os.path.basename(__file__)

    help_lines = (
        "\nUsage:",
        f"python {script} [options]",
        "\nOptions:",
        "  --help, -h              Show this help message",
        "  --collection, -c NAME   Specify collection name to check",
        "  --query, -q QUERY       Search for specific query",
        "  --all, -a              Show detailed info for all collections",
        "  --samples, -s N        Number of sample documents to show (default: 3)",
        "\nExamples:",
        f"python {script} --collection mangoit_docs_miniLM",
        f'python {script} --query "AI services" --collection mangoit_docs_gemini',
        f"python {script} --all",
    )
    for line in help_lines:
        print(line)

if __name__ == "__main__":
    # Banner.
    print("ChromaDB Content Checker")
    print("=======================")
    print("")

    # Command-line interface. The built-in help is disabled so that
    # show_help() can print the script's own usage text instead.
    import argparse
    cli = argparse.ArgumentParser(description="ChromaDB Content Checker", add_help=False)
    cli.add_argument("--help", "-h", action="store_true", help="Show help message")
    cli.add_argument("--collection", "-c", type=str, help="Collection name to check")
    cli.add_argument("--query", "-q", type=str, help="Search query")
    cli.add_argument("--all", "-a", action="store_true", help="Show all collections")
    cli.add_argument("--samples", "-s", type=int, default=3, help="Number of sample documents")

    # For backward compatibility, a bare positional argument is a query too.
    cli.add_argument("query_pos", nargs="?", help="Search query (positional)")

    # Unrecognized arguments are tolerated and ignored.
    args, _unrecognized = cli.parse_known_args()

    if args.help:
        show_help()
        sys.exit(0)

    # The named option wins over the positional form.
    query = args.query or args.query_pos

    # Overview of everything stored in the database.
    collection_names = [entry.name for entry in ChromaDBManager().client.list_collections()]

    print(f"Found {len(collection_names)} collections in ChromaDB\n")
    list_all_collections()

    # Decide which collections get the detailed report:
    #   --all         -> every collection
    #   --collection  -> just the named one
    #   otherwise     -> the first collection, or "mangoit_docs" if none exist
    if args.all:
        targets = collection_names
    elif args.collection:
        targets = [args.collection]
    else:
        targets = [collection_names[0] if collection_names else "mangoit_docs"]

    for target in targets:
        info = check_collection_info(target)
        # Only search collections that exist and actually contain documents.
        if query and info['exists'] and info['count'] > 0:
            print(f"\n--- Search Results for '{query}' in '{target}' ---")
            get_sample_documents(target, args.samples)

    print("\nTip: Run with --help for more options")

