import os
import time
from dotenv import load_dotenv
from utils.chroma_db import ChromaDBManager

# Load environment variables from a local .env file (if present) so that
# configuration such as RAG_COLLECTION can be overridden without code changes.
load_dotenv()

# Target ChromaDB collection name; overridable via the RAG_COLLECTION env var.
RAG_COLLECTION = os.getenv("RAG_COLLECTION", "mangoit_docs_miniLM")

def embed_markdown_files(
    force_rebuild=True,
    pages_dir="markdown-data/pages_markdown",
    posts_dir="markdown-data/posts_markdown",
    chunk_size=1500,
    chunk_overlap=250,
):
    """
    Directly embed markdown files into ChromaDB using improved chunking parameters.

    Args:
        force_rebuild: Whether to delete and recreate the collection before embedding.
        pages_dir: Directory containing page markdown files.
        posts_dir: Directory containing post markdown files.
        chunk_size: Character size per chunk (larger preserves more context).
        chunk_overlap: Character overlap between consecutive chunks.

    Returns:
        int: Total number of chunks embedded across both directories.
    """
    print(f"Starting direct embedding of markdown files into collection '{RAG_COLLECTION}'...")
    print("Using improved chunking parameters for better context preservation")

    # Initialize ChromaDB manager (provides client, embedding function and persist dir)
    chroma_db = ChromaDBManager()

    # Force delete and recreate collection if requested
    if force_rebuild:
        try:
            print(f"Forcing rebuild of collection '{RAG_COLLECTION}'...")
            # Try to delete the collection if it exists; a failure here is
            # expected on first run (nothing to delete) and is non-fatal.
            try:
                chroma_db.client.delete_collection(RAG_COLLECTION)
                print(f"Deleted existing collection '{RAG_COLLECTION}'")
            except Exception as e:
                print(f"Collection doesn't exist or couldn't be deleted: {str(e)}")

            # Create a fresh, empty collection using the manager's embedding function
            chroma_db.client.create_collection(
                name=RAG_COLLECTION,
                embedding_function=chroma_db.embedding_function
            )
            print(f"Created new collection '{RAG_COLLECTION}'")

            # Remove the incremental-index state file so every markdown file
            # is reprocessed instead of being skipped as "already indexed".
            state_path = os.path.join(chroma_db.persist_directory, ".md_index_state.json")
            if os.path.exists(state_path):
                os.remove(state_path)
                print("Cleared state file to force reprocessing all files")
        except Exception as e:
            # Best-effort rebuild: report and continue so embedding can still
            # proceed against whatever collection state exists.
            print(f"Error rebuilding collection: {str(e)}")

    start_time = time.time()

    # Process pages markdown files
    print(f"\nEmbedding pages from {pages_dir}...")
    pages_count = chroma_db.embed_markdown_files(
        markdown_dir=pages_dir,
        collection_name=RAG_COLLECTION,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    )

    # Process posts markdown files
    print(f"\nEmbedding posts from {posts_dir}...")
    posts_count = chroma_db.embed_markdown_files(
        markdown_dir=posts_dir,
        collection_name=RAG_COLLECTION,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    )

    # Calculate total embedded documents
    total_count = pages_count + posts_count

    # Fetch summary statistics for the collection after embedding
    collection_info = chroma_db.get_collection_info(RAG_COLLECTION)

    # Print results
    print("\n=== Embedding Results ===")
    print(f"Pages embedded: {pages_count} chunks")
    print(f"Posts embedded: {posts_count} chunks")
    print(f"Total embedded: {total_count} chunks")
    print(f"Collection count: {collection_info.get('count', 0)} documents")
    print(f"Embedding model: {collection_info.get('embedding_model', 'all-MiniLM-L6-v2')}")
    print(f"Time taken: {time.time() - start_time:.2f} seconds")

    # Print content types if available
    content_types = collection_info.get('content_types', {})
    if content_types:
        print("\n=== Content Types ===")
        for content_type, count in content_types.items():
            print(f"{content_type}: {count} documents")

    # Print unique sources count
    unique_sources = collection_info.get('unique_sources', 0)
    print(f"\nUnique sources: {unique_sources}")

    print("\n=== Next Steps ===")
    print("1. Test the RAG system with queries")
    print("2. Monitor the logs for any errors")
    print("3. Adjust relevance thresholds if needed")

    return total_count

# Script entry point: rebuild the collection and embed all markdown files.
if __name__ == "__main__":
    embed_markdown_files()
