import os
import time
from dotenv import load_dotenv
from utils.chroma_db import ChromaDBManager

# Load environment variables
load_dotenv()

# Define collection name for pages only
PAGES_COLLECTION = "mangoit_pages_only"

def reset_and_create_pages_only():
    """
    Reset and rebuild the pages-only collection in ChromaDB.

    Drops the existing `PAGES_COLLECTION` (if present), creates a fresh
    empty collection, clears the markdown index state file so every file
    is reprocessed, embeds all page markdown files, then prints a summary
    report and verifies the collection exists.
    """
    print(f"Resetting and creating pages-only collection '{PAGES_COLLECTION}'...")

    # Initialize ChromaDB manager (owns the client, embedding function,
    # and persistence directory used below)
    chroma_db = ChromaDBManager()

    _delete_collection_if_exists(chroma_db)

    # Without a collection there is nothing to embed into; bail out.
    if not _create_collection(chroma_db):
        return

    _clear_state_file(chroma_db)

    # Define directory to process - ONLY PAGES, NO POSTS
    pages_dir = "markdown-data/pages_markdown"

    # Chunking parameters: larger chunks give the retriever more context
    # per result; larger overlap reduces context lost at chunk boundaries.
    chunk_size = 1500
    chunk_overlap = 250

    start_time = time.time()

    # Process pages markdown files
    print(f"\nEmbedding pages from {pages_dir}...")
    pages_count = chroma_db.embed_markdown_files(
        markdown_dir=pages_dir,
        collection_name=PAGES_COLLECTION,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    )

    _print_report(chroma_db, pages_count, start_time)
    _verify_collection(chroma_db)

    print("\n=== Next Steps ===")
    print("1. Update test_pages_only_rag.py to use the new collection name")
    print("2. Run test_pages_only_rag.py to test the RAG system with pages-only data")


def _delete_collection_if_exists(chroma_db):
    """Best-effort delete of the target collection; errors are logged, not raised."""
    try:
        collection_names = [c.name for c in chroma_db.client.list_collections()]
        if PAGES_COLLECTION in collection_names:
            print(f"Deleting existing collection '{PAGES_COLLECTION}'...")
            chroma_db.client.delete_collection(PAGES_COLLECTION)
            print(f"Collection '{PAGES_COLLECTION}' deleted successfully")
    except Exception as e:
        print(f"Error checking/deleting collection: {str(e)}")


def _create_collection(chroma_db):
    """Create a fresh target collection. Returns True on success, False on failure."""
    try:
        # Original code kept the returned collection handle but never used it,
        # so the return value is intentionally discarded here.
        chroma_db.client.create_collection(
            name=PAGES_COLLECTION,
            embedding_function=chroma_db.embedding_function
        )
        print(f"Created new collection '{PAGES_COLLECTION}'")
        return True
    except Exception as e:
        print(f"Error creating collection: {str(e)}")
        return False


def _clear_state_file(chroma_db):
    """Remove the markdown index state file so all files are reprocessed."""
    state_path = os.path.join(chroma_db.persist_directory, ".md_index_state.json")
    if os.path.exists(state_path):
        os.remove(state_path)
        print("Cleared state file to force reprocessing all files")


def _print_report(chroma_db, pages_count, start_time):
    """Print the embedding summary: counts, model, timing, content types, sources."""
    total_count = pages_count

    collection_info = chroma_db.get_collection_info(PAGES_COLLECTION)

    print("\n=== Embedding Results ===")
    print(f"Pages embedded: {pages_count} chunks")
    print(f"Total embedded: {total_count} chunks")
    print(f"Collection count: {collection_info.get('count', 0)} documents")
    print(f"Embedding model: {collection_info.get('embedding_model', 'all-MiniLM-L6-v2')}")
    print(f"Time taken: {time.time() - start_time:.2f} seconds")

    # Print content types if available
    content_types = collection_info.get('content_types', {})
    if content_types:
        print("\n=== Content Types ===")
        for content_type, count in content_types.items():
            print(f"{content_type}: {count} documents")

    # Print unique sources count
    unique_sources = collection_info.get('unique_sources', 0)
    print(f"\nUnique sources: {unique_sources}")


def _verify_collection(chroma_db):
    """Confirm the collection exists and report its document count."""
    try:
        collection = chroma_db.client.get_collection(name=PAGES_COLLECTION)
        count = collection.count()
        print(f"\nVerification: Collection '{PAGES_COLLECTION}' exists with {count} documents")
    except Exception as e:
        print(f"\nVerification failed: {str(e)}")

# Script entry point: rebuild the pages-only collection when run directly.
if __name__ == "__main__":
    reset_and_create_pages_only()
