import os
import re
import json
from typing import Dict, List, Any, Optional
from langchain_community.document_loaders import TextLoader
from utils.logger import app_logger

class DocumentProcessor:
    """
    Process documents to extract metadata and categorize content.

    Metadata is derived by matching the document text against two keyword
    tables (``tech_categories`` and ``service_categories``), by reading the
    first level-1 markdown heading as the title, and by inspecting the
    source path.
    """

    def __init__(self):
        # Technology categories and keywords
        self.tech_categories = {
            "web_development": ["PHP", "Laravel", "CodeIgniter", "WordPress", "Magento", "Shopify", "WooCommerce", "HTML", "CSS", "JavaScript"],
            "frontend": ["React", "Angular", "Vue", "JavaScript", "TypeScript", "HTML", "CSS", "SASS", "LESS"],
            "backend": ["PHP", "Python", "Node.js", "Java", "C#", ".NET", "Ruby", "Go"],
            "mobile": ["iOS", "Android", "React Native", "Flutter", "Swift", "Kotlin", "Mobile Apps"],
            "ecommerce": ["Magento", "Shopify", "WooCommerce", "OpenCart", "PrestaShop", "eCommerce"],
            "cms": ["WordPress", "Drupal", "Joomla", "TYPO3", "CMS"],
            "database": ["MySQL", "PostgreSQL", "MongoDB", "SQL Server", "Oracle", "Redis", "ElasticSearch"],
            "devops": ["Docker", "Kubernetes", "AWS", "Azure", "GCP", "CI/CD", "Jenkins", "GitLab"],
            "ai_ml": ["AI", "ML", "Machine Learning", "Deep Learning", "NLP", "Computer Vision", "AI/ML", "Data Science"],
            "cloud": ["AWS", "Azure", "GCP", "Cloud Computing", "SaaS", "PaaS", "IaaS", "SAAS"],
            "frameworks": ["Laravel", "Django", "Flask", "Express", "Spring", "ASP.NET", "Ruby on Rails"]
        }

        # Service categories
        self.service_categories = {
            "web_design": ["Web Design", "UI/UX", "Responsive Design", "Website Design"],
            "web_development": ["Web Development", "Website Development", "Web App Development"],
            "mobile_development": ["Mobile App Development", "iOS Development", "Android Development"],
            "ecommerce_development": ["eCommerce Development", "Online Store", "Shopping Cart"],
            "custom_development": ["Custom Development", "Bespoke Software", "Custom Software"],
            "digital_marketing": ["Digital Marketing", "SEO", "PPC", "Social Media Marketing"],
            "maintenance": ["Maintenance", "Support", "Updates", "Bug Fixes"]
        }

    @staticmethod
    def _keyword_pattern(keyword: str):
        """
        Build a case-insensitive regex that matches *keyword* as a whole term.

        Args:
            keyword: Literal keyword text (escaped before use)

        Returns:
            Compiled regular expression
        """
        # ``\b`` only asserts a transition between a word character and a
        # non-word character. Anchoring a keyword that starts or ends with
        # punctuation (".NET", "C#") with ``\b`` would demand a word
        # character on the *outside* of the keyword, so "C# developer" or a
        # standalone ".NET" could never match. Anchor only the edges that
        # are themselves word characters.
        prefix = r'\b' if re.match(r'\w', keyword[:1]) else ''
        suffix = r'\b' if re.match(r'\w', keyword[-1:]) else ''
        return re.compile(prefix + re.escape(keyword) + suffix, re.IGNORECASE)

    def _find_keywords(self, content: str, keywords: List[str]) -> List[str]:
        """Return the entries of *keywords* mentioned in *content*, in list order."""
        return [
            keyword for keyword in keywords
            if self._keyword_pattern(keyword).search(content)
        ]

    def extract_metadata_from_content(self, content: str, filename: str) -> Dict[str, Any]:
        """
        Extract metadata from document content

        Args:
            content: Document content
            filename: Source filename

        Returns:
            Dictionary of metadata with the keys ``source``, ``categories``
            (matched service categories), ``keywords`` (matched technology
            keywords, unique, in first-seen order), ``technologies``
            (matched technology categories), ``document_type``, ``title``
            and ``company``.
        """
        # Initialize metadata
        metadata = {
            "source": filename,
            "categories": [],
            "keywords": [],
            "technologies": []
        }

        # Determine if it's a page or post
        if "pages_markdown" in filename:
            metadata["document_type"] = "page"
        elif "posts_markdown" in filename:
            metadata["document_type"] = "post"
        else:
            metadata["document_type"] = "other"

        # Extract title from the first level-1 markdown heading. The
        # separator after '#' excludes line breaks so that an empty heading
        # line cannot swallow the following paragraph as its title.
        title_match = re.search(r'^#[^\S\r\n]+(.+)$', content, re.MULTILINE)
        title = title_match.group(1).strip() if title_match else ""
        if not title:
            # Use filename as fallback title
            base_filename = os.path.basename(filename)
            title = os.path.splitext(base_filename)[0].replace('-', ' ').title()
        metadata["title"] = title

        # Identify technologies mentioned
        all_tech_keywords = []
        for category, keywords in self.tech_categories.items():
            found_keywords = self._find_keywords(content, keywords)
            if found_keywords:
                metadata["technologies"].append(category)
                all_tech_keywords.extend(found_keywords)

        # Identify services mentioned (one hit per category is enough)
        for category, keywords in self.service_categories.items():
            if self._find_keywords(content, keywords):
                metadata["categories"].append(category)

        # Add unique keywords; dict.fromkeys de-duplicates while keeping
        # first-seen order, so the result is deterministic across runs.
        metadata["keywords"] = list(dict.fromkeys(all_tech_keywords))

        # Add MangoIT as company
        metadata["company"] = "MangoIT"

        return metadata

    def process_document(self, file_path: str) -> Optional[Dict[str, Any]]:
        """
        Process a document and extract its content and metadata

        Args:
            file_path: Path to the document

        Returns:
            Dictionary with ``content`` and ``metadata`` keys, or ``None``
            when the loader yields no documents or any error occurs (the
            problem is logged rather than raised).
        """
        try:
            # Load the document
            loader = TextLoader(file_path)
            documents = loader.load()

            if not documents:
                app_logger.warning(f"No content found in {file_path}")
                return None

            # Only the first loaded document is used
            content = documents[0].page_content

            # Extract metadata
            metadata = self.extract_metadata_from_content(content, file_path)

            return {
                "content": content,
                "metadata": metadata
            }
        except Exception as e:
            # Best-effort boundary: callers receive None instead of an exception
            app_logger.error(f"Error processing document {file_path}: {str(e)}")
            return None