from crewai import Agent
import time
import json
import re
import asyncio
from typing import Dict, Any, Optional
from dotenv import load_dotenv
from utils.logger import Logger, app_logger
from utils.query_planner import QueryPlanner
from utils.llm_config import get_llm

load_dotenv()

class QueryUnderstandingAgent:
    """Agent responsible for understanding user queries and extracting key information."""
    
    def __init__(self, model_name: Optional[str] = None, temperature: Optional[float] = None, logger: Optional[Logger] = None, ontology_path: str = "kb/ontology.yaml"):
        """Initialize the QueryUnderstandingAgent
        
        Args:
            model_name: Optional override for the LLM model to use
            temperature: Optional override for the temperature
            logger: Optional logger instance
            ontology_path: Path to the ontology YAML file
        """
        # Get LLM from centralized configuration
        self.llm = get_llm(model_name=model_name, temperature=temperature)
        self.logger = logger or app_logger
        self.name = "QueryUnderstandingAgent"
        
        # Initialize the query planner
        self.query_planner = QueryPlanner(ontology_path=ontology_path, model_name=model_name, logger=self.logger)
    
    def create_agent(self):
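        """Create the CrewAI Agent instance used for query analysis."""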
        return Agent(
            role="Query Analyst",
            goal="Understand user queries and extract key information for effective retrieval",
            backstory="""You are an expert at understanding user questions and extracting the 
            key information needed for effective information retrieval. You can identify the 
            main topics, entities, and intent behind user queries.""",
            verbose=True,
            llm=self.llm
        )
    
    def run(self, context: Dict[str, Any]) -> Dict[str, Any]:
        """Run the agent to analyze the query
        
        Args:
            context: The context containing the query and other information
            
        Returns:
            Dict[str, Any]: The analysis results and updated context
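        
        Example (illustrative; assumes a configured LLM backend)::
        
            agent = QueryUnderstandingAgent()
            result = agent.run({"query": "What services do you offer?"})
            analysis = result["query_analysis"]  # keywords, topics, intent, ...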
        """
        start_time = time.time()
        query = context.get("query", "")
        
        if not query:
            self.logger.error(f"{self.name}: No query provided in context")
            return {**context, "error": "No query provided"}
            
        preview = f"{query[:50]}..." if len(query) > 50 else query
        self.logger.log_agent_start(self.name, f"Analyzing query: {preview}")
        
        try:
            # Analyze the query
            analysis = self.analyze_query(query)
            execution_time = time.time() - start_time
            
            self.logger.log_agent_complete(self.name, execution_time)
            self.logger.info(f"{self.name}: Extracted {len(analysis.get('keywords', []))} keywords and {len(analysis.get('topics', []))} topics")
            
            # Update context with analysis results
            return {**context, "query_analysis": analysis, "error": None}
            
        except Exception as e:
            execution_time = time.time() - start_time
            self.logger.log_agent_error(self.name, e)
            return {**context, "error": str(e)}
    
    def analyze_query(self, query: str) -> Dict[str, Any]:
        """
        Analyze the user's query to extract key information using QueryPlanner
        
        Args:
            query: The user's query
            
        Returns:
            Dict[str, Any]: Analysis results with search plan
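        
        Example: for "What technologies do you use?" the plan typically
        carries a "list" intent, so technology keywords are merged into the
        analysis and is_tech_list_request is set.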
        """
        self.logger.info(f"Analyzing query: {query}")
        
        try:
            # Use the query planner to generate a search plan
            search_plan = self.query_planner.plan_query(query)
            self.logger.info(f"Generated search plan with intent: {search_plan.get('intent')}")
            
            # Use LLM to analyze the query for additional information
            analysis = self._analyze_with_llm(query)
            
            # Enhance the analysis with the search plan
            analysis["search_plan"] = search_plan
            
            # Use multi-queries for better retrieval
            if search_plan.get("multi_queries"):
                analysis["multi_queries"] = search_plan["multi_queries"]
            
            # Add intent information
            analysis["intent"] = search_plan.get("intent", "lookup")
            
            # For technology listing requests, enhance keywords
            if search_plan.get("intent") == "list" and "technologies" in search_plan.get("facets", []):
                tech_keywords = [
                    "technologies", "programming", "languages", "frameworks", "tools", 
                    "platforms", "wordpress", "php", "javascript", "react", "laravel", 
                    "python", "ai", "development", "stack", "expertise"
                ]
                analysis["keywords"] = list(set(analysis.get("keywords", []) + tech_keywords))
                analysis["is_tech_list_request"] = True
            
            # If it's a greeting, we don't need to extract keywords for search
            if analysis["query_type"] == "greeting":
                analysis["search_query"] = ""  # No need to search for greetings
                return analysis
            
            # Create search query from keywords or use multi-queries
            if analysis["keywords"]:
                analysis["search_query"] = " ".join(analysis["keywords"])
            else:
                analysis["search_query"] = query
            
            return analysis
            
        except Exception as e:
            self.logger.error(f"Error analyzing query: {str(e)}")
            return self._simple_analysis(query)
    
    def _moderation_check(self, text: str) -> Dict[str, Any]:
        """
        Check if text contains abusive content
        
        Args:
            text: Text to check
            
        Returns:
            Dict[str, Any]: Moderation results
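                e.g. {"is_abusive": False, "category": None, "severity": None}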
        """
        # Simple keyword-based check (can be enhanced with a model)
        abuse_keywords = [
            "fuck", "shit", "ass", "bitch", "dick", "porn", "kill", "suicide",
            "hate", "racist", "nazi", "terrorism", "bomb", "murder", "rape"
        ]
        
        # Match whole words only, so innocuous words like "assist" or
        # "skills" do not trigger on embedded substrings
        words = {w.strip(".,!?") for w in text.lower().split()}
        for keyword in abuse_keywords:
            if keyword in words:
                return {"is_abusive": True, "category": "profanity", "severity": "medium"}
        
        return {"is_abusive": False, "category": None, "severity": None}
    
    def _is_smalltalk(self, text: str) -> bool:
        """
        Check if text is smalltalk/greeting
        
        Args:
            text: Text to check
            
        Returns:
            bool: True if text is smalltalk
        """
        smalltalk_patterns = [
            "hi", "hello", "hey", "good morning", "good afternoon", "good evening",
            "how are you", "what's up", "namaste", "kaise ho", "kya hal hai",
            "👋", "🙏", "😊", "thank you", "thanks"
        ]
        
        text_lower = text.lower()
        words = {w.strip(".,!?") for w in text_lower.split()}
        
        # Check for common greeting patterns. Single-word patterns must match
        # whole words to avoid false positives (e.g. "hi" inside "which");
        # phrases and emoji are matched as substrings.
        for pattern in smalltalk_patterns:
            if pattern.isalpha():
                if pattern in words:
                    return True
            elif pattern in text_lower:
                return True
        
        # Very short messages are treated as likely greetings
        if len(text_lower.split()) <= 4:
            return True
            
        return False
        
    def _detect_meeting_intent(self, text: str) -> Dict[str, Any]:
        """
        Detect if the text contains a meeting request intent
        
        Args:
            text: Text to check
            
        Returns:
            Dict[str, Any]: Meeting intent detection results
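        
        Example: "Can we schedule a call about my website?" is detected as a
        meeting request with topic "my website"; "name", "email" and "phone"
        end up in missing_fields.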
        """
        # Enhanced keywords that might indicate a meeting request
        meeting_keywords = [
            # Direct meeting terms
            "meeting", "appointment", "schedule", "book", "consult", 
            "consultation", "discuss", "talk", "call", "meet",
            # Business terms
            "pricing", "quote", "estimate", "cost", "price",
            "proposal", "project", "service", "hire", "engage",
            # Time-related terms
            "available", "availability", "time", "slot", "calendar",
            "tomorrow", "next week", "monday", "tuesday", "wednesday", 
            "thursday", "friday", "weekend", "morning", "afternoon", "evening",
            # Intent phrases
            "would like to", "want to", "interested in", "looking for", 
            "need to", "can we", "possible to"
        ]
        
        # Phrase patterns that strongly indicate meeting intent
        meeting_phrases = [
            "set up a meeting", "schedule a call", "book an appointment",
            "arrange a consultation", "get in touch", "speak with", "talk to someone",
            "discuss my project", "discuss my requirements", "get a quote",
            "learn more about your services", "would like to meet", "want to meet",
            "need consultation", "interested in your services", "when are you available",
            "what's your availability", "can we meet", "can we talk", "can we discuss"
        ]
        
        # Check for meeting intent in the message
        text_lower = text.lower()
        
        # Check for keyword matches
        found_keywords = [keyword for keyword in meeting_keywords if keyword in text_lower]
        
        # Check for phrase matches (stronger indicators)
        found_phrases = [phrase for phrase in meeting_phrases if phrase in text_lower]
        
        # Calculate confidence based on keywords and phrases found
        # Phrases are stronger indicators so they get higher weight
        keyword_score = min(len(found_keywords) * 0.15, 0.6)
        phrase_score = min(len(found_phrases) * 0.3, 0.9)
        confidence = max(keyword_score, phrase_score)
        
        # Boost confidence if multiple indicators are present
        if keyword_score > 0 and phrase_score > 0:
            confidence = min(confidence + 0.1, 0.95)
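        # Worked example: 3 keywords and 1 phrase give keyword_score = 0.45
        # and phrase_score = 0.3, so confidence = max(0.45, 0.3) = 0.45, plus
        # the 0.1 combined boost = 0.55.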
            
        # Boost confidence if message contains date/time information
        date_time_indicators = ["monday", "tuesday", "wednesday", "thursday", "friday",
                              "weekend", "tomorrow", "next week", "morning", "afternoon", 
                              "evening", "pm", "am", ":00", "o'clock"]
        if any(indicator in text_lower for indicator in date_time_indicators):
            confidence = min(confidence + 0.1, 0.95)
        
        # Determine if this is likely a meeting request
        is_meeting_request = confidence > 0.25  # Lower threshold for better recall
        
        # Extract information if it's a meeting request
        extracted_info = {}
        missing_fields = []
        
        if is_meeting_request:
            # Extract name
            name_indicators = ["name is", "i am", "i'm", "my name", "this is"]
            name_found = False
            for indicator in name_indicators:
                # Locate the indicator case-insensitively, then slice the
                # original text so the extracted name keeps its casing
                idx = text_lower.find(indicator)
                if idx != -1:
                    name_text = text[idx + len(indicator):].strip()
                    # Truncate at the first punctuation or line break
                    for end_char in ['.', ',', '!', '?', '\n']:
                        if end_char in name_text:
                            name_text = name_text.split(end_char, 1)[0]
                    extracted_info["name"] = name_text.strip()
                    name_found = True
                    break
            
            if not name_found:
                missing_fields.append("name")
            
            # Extract email
            email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
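            # e.g. matches "jane.doe@example.com" (illustrative address)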
            email_matches = re.findall(email_pattern, text)
            if email_matches:
                extracted_info["email"] = email_matches[0]
            else:
                missing_fields.append("email")
            
            # Extract phone number
            phone_pattern = r'\b(?:\+?\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}\b'
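            # e.g. finds a match in "555-123-4567" (illustrative number)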
            phone_matches = re.findall(phone_pattern, text)
            if phone_matches:
                extracted_info["phone"] = phone_matches[0]
            else:
                missing_fields.append("phone")
            
            # Extract topic
            topic_indicators = ["about", "regarding", "discuss", "talk about", "meeting about", 
                              "consultation for", "interested in", "need help with", "looking for"]
            suggested_topic = None
            
            for indicator in topic_indicators:
                # Case-insensitive match, slicing the original text so the
                # topic keeps its casing
                idx = text_lower.find(indicator)
                if idx != -1:
                    topic_text = text[idx + len(indicator):].strip()[:100]
                    for end_char in ['.', '!', '?', '\n']:
                        if end_char in topic_text:
                            topic_text = topic_text.split(end_char, 1)[0]
                    suggested_topic = topic_text.strip()
                    extracted_info["topic"] = suggested_topic
                    break
            
            # If no specific topic was found, fall back to a generic topic
            # based on words in the message (word-level checks avoid false
            # positives such as "ai" inside "email")
            if not suggested_topic:
                tokens = {w.strip(".,!?") for w in text_lower.split()}
                if any(kw in text_lower for kw in ["website", "web", "site"]):
                    suggested_topic = "Website Development"
                elif "mobile" in text_lower or tokens & {"app", "android", "ios"}:
                    suggested_topic = "Mobile App Development"
                elif tokens & {"ai", "ml"} or "machine learning" in text_lower or "artificial intelligence" in text_lower:
                    suggested_topic = "AI/ML Development"
                else:
                    suggested_topic = "General Consultation"
                extracted_info["topic"] = suggested_topic
        
        return {
            "is_meeting_request": is_meeting_request,
            "confidence": confidence,
            "keywords": found_keywords,
            "phrases": found_phrases,
            "extracted_info": extracted_info,
            "missing_fields": missing_fields,
            "suggested_topic": extracted_info.get("topic")
        }
    
    def _analyze_with_llm(self, query: str) -> Dict[str, Any]:
        """
        Use the LLM to analyze the query and extract key information
        
        Args:
            query: The user's query
            
        Returns:
            Dict[str, Any]: Analysis results with routing
        """
        # Check for abuse first
        moderation = self._moderation_check(query)
        if moderation["is_abusive"]:
            self.logger.warning(f"Detected abusive content: {moderation['category']}")
            return {
                "query_type": "other",
                "keywords": [],
                "entities": [],
                "topics": [],
                "search_query": "",
                "route": "abuse",
                "moderation": moderation,
                "original_query": query
            }
        
        # Check for meeting intent - a high-priority route checked before smalltalk and RAG
        meeting_intent = self._detect_meeting_intent(query)
        if meeting_intent["is_meeting_request"]:
            self.logger.info(f"Detected meeting request with confidence: {meeting_intent['confidence']}")
            return {
                "query_type": "request",
                "keywords": ["meeting", "appointment", "consultation"] + meeting_intent.get("keywords", []),
                "entities": meeting_intent.get("entities", []),
                "topics": ["contact", "services"],
                "search_query": "contact consultation appointment",
                "route": "meeting",
                "meeting_intent": meeting_intent,
                "original_query": query
            }
        
        # Check for smalltalk
        if self._is_smalltalk(query):
            self.logger.info("Detected smalltalk/greeting")
            return {
                "query_type": "greeting",
                "keywords": [],
                "entities": [],
                "topics": [],
                "search_query": "",
                "route": "smalltalk",
                "original_query": query
            }
        
        # Create a prompt for the LLM to analyze the query
        # Add company context to help with keyword extraction
        company_context = """
        Company Context: MangoIT Solutions is a web and mobile app development company that offers various services including:
        - Web Development (PHP, Laravel, CodeIgniter, WordPress, Magento)
        - Mobile App Development (iOS, Android, React Native)
        - eCommerce Development (Magento, Shopify, WooCommerce)
        - Custom Software Development
        - AI/ML Development
        - Digital Marketing
        - UI/UX Design
        
        They work with technologies like PHP, Python, JavaScript, React, Angular, Node.js, and various frameworks.
        """
        
        prompt = f"""
        {company_context}
        
        Analyze the following user query and extract key information:
        
        Query: "{query}"
        
        Provide the following information in your analysis:
        1. Query Type: Classify as one of [greeting, question, request, about, general_conversation]
           - greeting: Simple greetings like "hello", "hi", "good morning", etc.  
           - question: Queries asking for specific information
           - request: Commands or requests for the system to do something
           - about: Questions about the assistant itself
           - general_conversation: General statements or comments
        
        2. Keywords: Extract 3-5 most important keywords relevant for search (exclude common words). Include specific technology names or service names when relevant.
        
        3. Entities: Identify any named entities (people, companies, products, etc.)
        
        4. Topics: Identify relevant topics from [services, products, company, contact, technologies, pricing, portfolio]
        
        Format your response as a JSON object with these fields.
        """
        
        try:
            # Call the LLM to analyze the query - avoid using asyncio.run() in an event loop
            try:
                response = self.llm.generate(prompt)
            except RuntimeError as e:
                if "cannot be called from a running event loop" in str(e):
                    # We're already in an event loop, use a simpler approach
                    self.logger.warning("Using fallback analysis due to asyncio conflict")
                    return self._simple_analysis(query)
                else:
                    raise
            
            # Parse the response as JSON
            
            # Extract JSON from the response (it might be wrapped in markdown code blocks)
            json_match = re.search(r'```json\s*(.+?)\s*```', response, re.DOTALL)
            if json_match:
                json_str = json_match.group(1)
            else:
                # Try to find any JSON-like structure
                json_match = re.search(r'\{[^\{\}]*"query_type"[^\{\}]*\}', response, re.DOTALL)
                if json_match:
                    json_str = json_match.group(0)
                else:
                    json_str = response
            
            # Clean up the string to ensure it's valid JSON
            json_str = re.sub(r'[\n\r\t]', ' ', json_str)
            
            try:
                analysis = json.loads(json_str)
            except json.JSONDecodeError:
                # If JSON parsing fails, extract information using regex
                self.logger.warning("Failed to parse LLM response as JSON, falling back to regex extraction")
                analysis = self._extract_analysis_with_regex(response)
            
            # Ensure all required fields are present
            analysis.setdefault("query_type", "general_conversation")
            analysis.setdefault("keywords", [])
            analysis.setdefault("entities", [])
            analysis.setdefault("topics", [])
            analysis["original_query"] = query
            
            # Set default route to RAG
            analysis["route"] = "rag"
            
            # Ensure we have a search query for non-greeting queries
            if not analysis.get("search_query") and analysis.get("query_type") != "greeting":
                if analysis["keywords"]:
                    analysis["search_query"] = " ".join(analysis["keywords"])
                else:
                    analysis["search_query"] = query
            
            # Log the route and search query
            self.logger.info(f"Query route: {analysis['route']}")
            if analysis["route"] == "rag":
                self.logger.info(f"Search query: {analysis.get('search_query', '')}")
            
            return analysis
            
        except Exception as e:
            self.logger.error(f"Error analyzing query with LLM: {str(e)}")
            # Fall back to simple analysis if LLM fails
            simple_analysis = self._simple_analysis(query)
            simple_analysis["route"] = "rag"  # Default route for fallback
            return simple_analysis
    
    def _extract_analysis_with_regex(self, llm_response: str) -> Dict[str, Any]:
        """
        Extract analysis information from LLM response using regex
        
        Args:
            llm_response: The response from the LLM
            
        Returns:
            Dict[str, Any]: Extracted analysis
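        
        Example: given 'keywords: ["php", "laravel"]' anywhere in the raw
        response, analysis["keywords"] becomes ["php", "laravel"].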
        """
        
        analysis = {}
        
        # Extract query type
        query_type_match = re.search(r'query_type["\s]*:["\'\s]*(greeting|question|request|about|general_conversation)', llm_response, re.IGNORECASE)
        if query_type_match:
            analysis["query_type"] = query_type_match.group(1).lower()
        
        # Extract keywords
        keywords_match = re.search(r'keywords["\s]*:["\'\s]*\[(.+?)\]', llm_response, re.DOTALL)
        if keywords_match:
            keywords_str = keywords_match.group(1)
            keywords = re.findall(r'["\'](.*?)["\']', keywords_str)
            analysis["keywords"] = keywords
        
        # Extract entities
        entities_match = re.search(r'entities["\s]*:["\'\s]*\[(.+?)\]', llm_response, re.DOTALL)
        if entities_match:
            entities_str = entities_match.group(1)
            entities = re.findall(r'["\'](.*?)["\']', entities_str)
            analysis["entities"] = entities
        
        # Extract topics
        topics_match = re.search(r'topics["\s]*:["\'\s]*\[(.+?)\]', llm_response, re.DOTALL)
        if topics_match:
            topics_str = topics_match.group(1)
            topics = re.findall(r'["\'](.*?)["\']', topics_str)
            analysis["topics"] = topics
        
        return analysis
    
    async def arun(self, context: Dict[str, Any]) -> Dict[str, Any]:
        """
        Async version of run method with QueryPlanner integration
        
        Args:
            context: The context containing the query and other information
            
        Returns:
            Dict[str, Any]: The analysis results and updated context
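        
        Example (illustrative; must be awaited inside an event loop)::
        
            result = await agent.arun({"query": "List the technologies you use"})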
        """
        start_time = time.time()
        query = context.get("query", "")
        
        if not query:
            self.logger.error(f"{self.name}: No query provided in context")
            return {**context, "error": "No query provided"}
            
        preview = f"{query[:50]}..." if len(query) > 50 else query
        self.logger.log_agent_start(self.name, f"Analyzing query: {preview}")
        
        try:
            # Use the query planner to generate a search plan asynchronously
            search_plan = await self.query_planner.plan_query_async(query)
            self.logger.info(f"Generated search plan with intent: {search_plan.get('intent')}")
            
            # Use async LLM analysis
            analysis = await self._analyze_with_llm_async(query)
            
            # Enhance the analysis with the search plan
            analysis["search_plan"] = search_plan
            
            # Use multi-queries for better retrieval
            if search_plan.get("multi_queries"):
                analysis["multi_queries"] = search_plan["multi_queries"]
            
            # Add intent information
            analysis["intent"] = search_plan.get("intent", "lookup")
            
            # For technology listing requests, enhance keywords
            if search_plan.get("intent") == "list" and "technologies" in search_plan.get("facets", []):
                tech_keywords = [
                    "technologies", "programming", "languages", "frameworks", "tools", 
                    "platforms", "wordpress", "php", "javascript", "react", "laravel", 
                    "python", "ai", "development", "stack", "expertise"
                ]
                analysis["keywords"] = list(set(analysis.get("keywords", []) + tech_keywords))
                analysis["is_tech_list_request"] = True
            
            execution_time = time.time() - start_time
            self.logger.log_agent_complete(self.name, execution_time)
            self.logger.info(f"{self.name}: Extracted {len(analysis.get('keywords', []))} keywords and {len(analysis.get('topics', []))} topics")
            
            # Update context with analysis results
            return {**context, "query_analysis": analysis, "error": None}
            
        except Exception as e:
            execution_time = time.time() - start_time
            self.logger.log_agent_error(self.name, e)
            # Fallback to basic analysis with original query
            return {**context, "query_analysis": self._simple_analysis(query), "error": str(e)}
    
    async def _analyze_with_llm_async(self, query: str) -> Dict[str, Any]:
        """
        Analyze query using LLM asynchronously
        
        Args:
            query: The user's query
            
        Returns:
            Dict[str, Any]: Analysis results
        """
        prompt = self._build_prompt(query)
        # Run LLM in thread to avoid blocking
        response = await asyncio.to_thread(self.llm.generate, prompt)
        
        # Parse the response as JSON
        
        # Extract JSON from the response (it might be wrapped in markdown code blocks)
        json_match = re.search(r'```json\s*(.+?)\s*```', response, re.DOTALL)
        if json_match:
            json_str = json_match.group(1)
        else:
            # Try to find any JSON-like structure
            json_match = re.search(r'\{[^\{\}]*"query_type"[^\{\}]*\}', response, re.DOTALL)
            if json_match:
                json_str = json_match.group(0)
            else:
                json_str = response
        
        # Clean up the string to ensure it's valid JSON
        json_str = re.sub(r'[\n\r\t]', ' ', json_str)
        
        try:
            analysis = json.loads(json_str)
        except json.JSONDecodeError:
            # If JSON parsing fails, extract information using regex
            self.logger.warning("Failed to parse LLM response as JSON, falling back to regex extraction")
            analysis = self._extract_analysis_with_regex(response)
        
        # Ensure all required fields are present (mirroring the sync path,
        # including the default "rag" route)
        analysis.setdefault("query_type", "general_conversation")
        analysis.setdefault("keywords", [])
        analysis.setdefault("entities", [])
        analysis.setdefault("topics", [])
        analysis.setdefault("route", "rag")
        analysis["original_query"] = query
        
        # Ensure we have a search query
        if not analysis.get("search_query") and analysis.get("query_type") != "greeting":
            analysis["search_query"] = query
        
        return analysis
    
    def _build_prompt(self, query: str) -> str:
        """
        Build prompt for LLM analysis
        
        Args:
            query: The user's query
            
        Returns:
            str: Prompt for LLM
        """
        return f"""
        Analyze the following user query and extract key information:
        
        Query: \"{query}\"
        
        Provide the following information in your analysis:
        1. Query Type: Classify as one of [greeting, question, request, about, general_conversation]
           - greeting: Simple greetings like \"hello\", \"hi\", \"good morning\", etc.  
           - question: Queries asking for specific information
           - request: Commands or requests for the system to do something
           - about: Questions about the assistant itself
           - general_conversation: General statements or comments
        
        2. Keywords: Extract 3-5 most important keywords relevant for search (exclude common words)
        
        3. Entities: Identify any named entities (people, companies, products, etc.)
        
        4. Topics: Identify relevant topics from [services, products, company, contact, technologies, pricing, portfolio]
        
        Format your response as a JSON object with these fields.
        """
    
    def _simple_analysis(self, query: str) -> Dict[str, Any]:
        """
        Simple fallback analysis when LLM fails
        
        Args:
            query: The user's query
            
        Returns:
            Dict[str, Any]: Simple analysis results
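        
        Example: "How many employees does MangoIT have?" is classified as a
        question with keywords including "mangoit", "employees", "team".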
        """
        query_lower = query.lower().strip()
        query_words = query_lower.split()
        
        # Simple query type detection
        query_type = "general_conversation"
        
        # Check for greetings (match whole words so "hi" does not fire
        # inside words like "which")
        greeting_words = {"hi", "hello", "hey", "greetings"}
        greeting_phrases = ["good morning", "good afternoon", "good evening"]
        if len(query_words) <= 3 and (
            any(word.strip(".,!?") in greeting_words for word in query_words)
            or any(phrase in query_lower for phrase in greeting_phrases)
        ):
            query_type = "greeting"
        elif query_lower.endswith("?") or any(
            q in query_words or query_lower.startswith(q)
            for q in ("how", "what", "when", "where", "who", "why")
        ):
            query_type = "question"
        
        # Enhanced keyword extraction
        common_words = ["the", "a", "an", "in", "on", "at", "to", "for", "with", "about", "of", "is", "are", "am", 
                      "how", "many", "much", "do", "does", "can", "could", "would", "should", "will"]
        
        # Always include company name as a keyword for company-related queries
        keywords = []
        if "mangoit" in query_lower or "mango it" in query_lower:
            keywords.append("mangoit")
            
        # Extract other important keywords, stripping trailing punctuation so
        # "services?" is stored as "services"
        for word in query_words:
            word = word.strip(".,!?\"'")
            if word not in common_words and len(word) > 2 and word not in keywords:
                keywords.append(word)
                
        # If no keywords found but query is about the company, use some default keywords
        if not keywords and ("mangoit" in query_lower or "mango it" in query_lower or "company" in query_lower):
            keywords = ["mangoit", "company", "information"]
            
        # For questions about people or numbers, add specific keywords
        if "how many" in query_lower or "number of" in query_lower:
            if "employee" in query_lower or "staff" in query_lower or "people" in query_lower or "professionals" in query_lower:
                keywords.extend(["employees", "team", "size"])
                
        # For questions about specific roles or people
        if "who" in query_lower:
            if "ceo" in query_lower or "cto" in query_lower or "founder" in query_lower or "director" in query_lower:
                keywords.extend(["leadership", "management", "team"])
        
        return {
            "original_query": query,
            "query_type": query_type,
            "keywords": keywords,
            "entities": [],
            "topics": [],
            # Include a search_query so fallback results have the same shape
            # as the LLM-generated analysis
            "search_query": "" if query_type == "greeting" else (" ".join(keywords) if keywords else query)
        }
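

if __name__ == "__main__":
    # Minimal smoke test (illustrative; assumes a configured LLM backend and
    # an ontology file at kb/ontology.yaml)
    agent = QueryUnderstandingAgent()
    result = agent.run({"query": "What technologies does MangoIT work with?"})
    print(result.get("query_analysis") or result.get("error"))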
