Gradio MCP
- app.py +676 -214
- core/__init__.py +1 -0
- core/chunker.py +302 -0
- core/document_parser.py +199 -0
- core/models.py +102 -0
- core/text_preprocessor.py +186 -0
- mcp_server.py +165 -70
- mcp_tools.py +0 -592
- mcp_tools/__init__.py +1 -0
- mcp_tools/generative_tool.py +342 -0
- mcp_tools/ingestion_tool.py +330 -0
- mcp_tools/search_tool.py +423 -0
- mcp_tools/utils.py +373 -0
- services/__init__.py +1 -0
- services/document_store_service.py +349 -0
- services/embedding_service.py +204 -0
- services/llm_service.py +285 -0
- services/ocr_service.py +324 -0
- services/vector_store_service.py +285 -0
app.py
CHANGED
@@ -1,254 +1,716 @@
Removed (the old 254-line standalone app; only fragments survive in this capture): imports (gradio, asyncio, pathlib.Path, tempfile, json, typing, logging); a guarded `from mcp_server import mcp` that set an `MCP_AVAILABLE` flag and printed "⚠️ MCP server not available, running in standalone mode" on ImportError; upload, web-content (`await mcp_tools.process_web_content(url)`), and search handlers built on the old monolithic `mcp_tools` module, including a `global current_results` list of result cards; a minimal search UI (a "Search Query" textbox, a "Search" button, a "Status" textbox, and a Markdown results pane); and a `__main__` guard that either ran `asyncio.run(mcp.run())` as an MCP server or launched the demo via `demo.launch(server_name="0.0.0.0", share=False, show_error=True)`.
Added (the rewritten app.py, 716 lines):

```python
import gradio as gr
import os
import asyncio
import json
import logging
import tempfile
import uuid
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Any, Optional
import nest_asyncio

# Apply nest_asyncio to handle nested event loops in Gradio
nest_asyncio.apply()

# Import our custom modules
from mcp_tools.ingestion_tool import IngestionTool
from mcp_tools.search_tool import SearchTool
from mcp_tools.generative_tool import GenerativeTool
from services.vector_store_service import VectorStoreService
from services.document_store_service import DocumentStoreService
from services.embedding_service import EmbeddingService
from services.llm_service import LLMService
from services.ocr_service import OCRService
from core.models import SearchResult, Document
import config

# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class ContentOrganizerMCPServer:
    def __init__(self):
        # Initialize services
        logger.info("Initializing Content Organizer MCP Server...")

        self.vector_store = VectorStoreService()
        self.document_store = DocumentStoreService()
        self.embedding_service = EmbeddingService()
        self.llm_service = LLMService()
        self.ocr_service = OCRService()

        # Initialize tools
        self.ingestion_tool = IngestionTool(
            vector_store=self.vector_store,
            document_store=self.document_store,
            embedding_service=self.embedding_service,
            ocr_service=self.ocr_service
        )
        self.search_tool = SearchTool(
            vector_store=self.vector_store,
            embedding_service=self.embedding_service,
            document_store=self.document_store
        )
        self.generative_tool = GenerativeTool(
            llm_service=self.llm_service,
            search_tool=self.search_tool
        )

        # Track processing status
        self.processing_status = {}

        # Document cache for quick access
        self.document_cache = {}

        logger.info("Content Organizer MCP Server initialized successfully!")

    def run_async(self, coro):
        """Helper to run async functions in Gradio"""
        try:
            loop = asyncio.get_event_loop()
        except RuntimeError:
            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)

        if loop.is_running():
            # If a loop is already running, run the coroutine on a worker thread
            import concurrent.futures
            with concurrent.futures.ThreadPoolExecutor() as executor:
                future = executor.submit(asyncio.run, coro)
                return future.result()
        else:
            return loop.run_until_complete(coro)

    async def ingest_document_async(self, file_path: str, file_type: str) -> Dict[str, Any]:
        """MCP Tool: Ingest and process a document"""
        try:
            task_id = str(uuid.uuid4())
            self.processing_status[task_id] = {"status": "processing", "progress": 0}

            result = await self.ingestion_tool.process_document(file_path, file_type, task_id)

            if result.get("success"):
                self.processing_status[task_id] = {"status": "completed", "progress": 100}
                # Update document cache
                doc_id = result.get("document_id")
                if doc_id:
                    doc = await self.document_store.get_document(doc_id)
                    if doc:
                        self.document_cache[doc_id] = doc

                return result
            else:
                self.processing_status[task_id] = {"status": "failed", "error": result.get("error")}
                return result

        except Exception as e:
            logger.error(f"Document ingestion failed: {str(e)}")
            return {
                "success": False,
                "error": str(e),
                "message": "Failed to process document"
            }

    async def get_document_content_async(self, document_id: str) -> Optional[str]:
        """Get document content by ID"""
        try:
            # Check cache first
            if document_id in self.document_cache:
                return self.document_cache[document_id].content

            # Get from store
            doc = await self.document_store.get_document(document_id)
            if doc:
                self.document_cache[document_id] = doc
                return doc.content

            return None
        except Exception as e:
            logger.error(f"Error getting document content: {str(e)}")
            return None

    async def semantic_search_async(self, query: str, top_k: int = 5, filters: Optional[Dict] = None) -> Dict[str, Any]:
        """MCP Tool: Perform semantic search"""
        try:
            results = await self.search_tool.search(query, top_k, filters)
            return {
                "success": True,
                "query": query,
                "results": [result.to_dict() for result in results],
                "total_results": len(results)
            }
        except Exception as e:
            logger.error(f"Semantic search failed: {str(e)}")
            return {
                "success": False,
                "error": str(e),
                "query": query,
                "results": []
            }

    async def summarize_content_async(self, content: str = None, document_id: str = None, style: str = "concise") -> Dict[str, Any]:
        """MCP Tool: Summarize content or document"""
        try:
            # If document_id provided, get content from document
            if document_id and document_id != "none":
                content = await self.get_document_content_async(document_id)
                if not content:
                    return {"success": False, "error": f"Document {document_id} not found"}

            if not content or not content.strip():
                return {"success": False, "error": "No content provided for summarization"}

            # Record the true length before truncation so the stats are accurate
            original_length = len(content)

            # Truncate content if too long (for API limits)
            max_content_length = 4000
            if len(content) > max_content_length:
                content = content[:max_content_length] + "..."

            summary = await self.generative_tool.summarize(content, style)
            return {
                "success": True,
                "summary": summary,
                "original_length": original_length,
                "summary_length": len(summary),
                "style": style,
                "document_id": document_id
            }
        except Exception as e:
            logger.error(f"Summarization failed: {str(e)}")
            return {
                "success": False,
                "error": str(e)
            }

    async def generate_tags_async(self, content: str = None, document_id: str = None, max_tags: int = 5) -> Dict[str, Any]:
        """MCP Tool: Generate tags for content"""
        try:
            # If document_id provided, get content from document
            if document_id and document_id != "none":
                content = await self.get_document_content_async(document_id)
                if not content:
                    return {"success": False, "error": f"Document {document_id} not found"}

            if not content or not content.strip():
                return {"success": False, "error": "No content provided for tag generation"}

            tags = await self.generative_tool.generate_tags(content, max_tags)

            # Update document tags if document_id provided
            if document_id and document_id != "none" and tags:
                await self.document_store.update_document_metadata(document_id, {"tags": tags})

            return {
                "success": True,
                "tags": tags,
                "content_length": len(content),
                "document_id": document_id
            }
        except Exception as e:
            logger.error(f"Tag generation failed: {str(e)}")
            return {
                "success": False,
                "error": str(e)
            }

    async def answer_question_async(self, question: str, context_filter: Optional[Dict] = None) -> Dict[str, Any]:
        """MCP Tool: Answer questions using RAG"""
        try:
            # Search for relevant context
            search_results = await self.search_tool.search(question, top_k=5, filters=context_filter)

            if not search_results:
                return {
                    "success": False,
                    "error": "No relevant context found in your documents. Please make sure you have uploaded relevant documents.",
                    "question": question
                }

            # Generate answer using context
            answer = await self.generative_tool.answer_question(question, search_results)

            return {
                "success": True,
                "question": question,
                "answer": answer,
                "sources": [result.to_dict() for result in search_results],
                "confidence": "high" if len(search_results) >= 3 else "medium"
            }
        except Exception as e:
            logger.error(f"Question answering failed: {str(e)}")
            return {
                "success": False,
                "error": str(e),
                "question": question
            }

    def list_documents_sync(self, limit: int = 100, offset: int = 0) -> Dict[str, Any]:
        """List stored documents"""
        try:
            documents = self.run_async(self.document_store.list_documents(limit, offset))
            return {
                "success": True,
                "documents": [doc.to_dict() for doc in documents],
                "total": len(documents)
            }
        except Exception as e:
            return {
                "success": False,
                "error": str(e)
            }

# Initialize the MCP server
mcp_server = ContentOrganizerMCPServer()

# Helper functions
def get_document_list():
    """Get list of documents for display"""
    try:
        result = mcp_server.list_documents_sync(limit=100)
        if result["success"]:
            if result["documents"]:
                doc_list = "📚 Documents in Library:\n\n"
                for i, doc in enumerate(result["documents"], 1):
                    doc_list += f"{i}. {doc['filename']} (ID: {doc['id'][:8]}...)\n"
                    doc_list += f"   Type: {doc['doc_type']}, Size: {doc['file_size']} bytes\n"
                    if doc.get('tags'):
                        doc_list += f"   Tags: {', '.join(doc['tags'])}\n"
                    doc_list += f"   Created: {doc['created_at'][:10]}\n\n"
                return doc_list
            else:
                return "No documents in library yet. Upload some documents to get started!"
        else:
            return f"Error loading documents: {result['error']}"
    except Exception as e:
        return f"Error: {str(e)}"

def get_document_choices():
    """Get document choices for dropdown"""
    try:
        result = mcp_server.list_documents_sync(limit=100)
        if result["success"] and result["documents"]:
            choices = []
            for doc in result["documents"]:
                # Create label with filename and shortened ID
                choice_label = f"{doc['filename']} ({doc['id'][:8]}...)"
                # Use full document ID as the value
                choices.append((choice_label, doc['id']))

            logger.info(f"Generated {len(choices)} document choices")
            return choices
        return []
    except Exception as e:
        logger.error(f"Error getting document choices: {str(e)}")
        return []

# Gradio Interface Functions
def upload_and_process_file(file):
    """Gradio interface for file upload"""
    if file is None:
        # One return value per registered output (six outputs are wired to this handler)
        choices = get_document_choices()
        return (
            "No file uploaded",
            "",
            get_document_list(),
            gr.update(choices=choices),
            gr.update(choices=choices),
            gr.update(choices=choices)
        )

    try:
        # Get file path
        file_path = file.name if hasattr(file, 'name') else str(file)
        file_type = Path(file_path).suffix.lower()

        logger.info(f"Processing file: {file_path}")

        # Process document
        result = mcp_server.run_async(mcp_server.ingest_document_async(file_path, file_type))

        if result["success"]:
            # Get updated document list and choices
            doc_list = get_document_list()
            doc_choices = get_document_choices()

            return (
                f"✅ Success: {result['message']}\nDocument ID: {result['document_id']}\nChunks created: {result['chunks_created']}",
                result["document_id"],
                doc_list,
                gr.update(choices=doc_choices),
                gr.update(choices=doc_choices),
                gr.update(choices=doc_choices)
            )
        else:
            return (
                f"❌ Error: {result.get('error', 'Unknown error')}",
                "",
                get_document_list(),
                gr.update(choices=get_document_choices()),
                gr.update(choices=get_document_choices()),
                gr.update(choices=get_document_choices())
            )
    except Exception as e:
        logger.error(f"Error processing file: {str(e)}")
        return (
            f"❌ Error: {str(e)}",
            "",
            get_document_list(),
            gr.update(choices=get_document_choices()),
            gr.update(choices=get_document_choices()),
            gr.update(choices=get_document_choices())
        )

def perform_search(query, top_k):
    """Gradio interface for search"""
    if not query.strip():
        return "Please enter a search query"

    try:
        result = mcp_server.run_async(mcp_server.semantic_search_async(query, int(top_k)))

        if result["success"]:
            if result["results"]:
                output = f"🔍 Found {result['total_results']} results for: '{query}'\n\n"
                for i, res in enumerate(result["results"], 1):
                    output += f"Result {i}:\n"
                    output += f"📊 Relevance Score: {res['score']:.3f}\n"
                    output += f"📄 Content: {res['content'][:300]}...\n"
                    if 'document_filename' in res.get('metadata', {}):
                        output += f"📁 Source: {res['metadata']['document_filename']}\n"
                    output += f"🔗 Document ID: {res.get('document_id', 'Unknown')}\n"
                    output += "-" * 80 + "\n\n"
                return output
            else:
                return f"No results found for: '{query}'\n\nMake sure you have uploaded relevant documents first."
        else:
            return f"❌ Search failed: {result['error']}"
    except Exception as e:
        logger.error(f"Search error: {str(e)}")
        return f"❌ Error: {str(e)}"

def summarize_document(doc_choice, custom_text, style):
    """Gradio interface for summarization"""
    try:
        # Debug logging
        logger.info(f"Summarize called with doc_choice: {doc_choice}, type: {type(doc_choice)}")

        # Get document ID from dropdown choice
        document_id = None
        if doc_choice and doc_choice != "none" and doc_choice != "":
            # The dropdown returns the value part of the (label, value) tuple
            document_id = doc_choice
            logger.info(f"Using document ID: {document_id}")

        # Use custom text if provided, otherwise use document
        if custom_text and custom_text.strip():
            logger.info("Using custom text for summarization")
            result = mcp_server.run_async(mcp_server.summarize_content_async(content=custom_text, style=style))
        elif document_id:
            logger.info(f"Summarizing document: {document_id}")
            result = mcp_server.run_async(mcp_server.summarize_content_async(document_id=document_id, style=style))
        else:
            return "Please select a document from the dropdown or enter text to summarize"

        if result["success"]:
            output = f"📝 Summary ({style} style):\n\n{result['summary']}\n\n"
            output += f"📊 Statistics:\n"
            output += f"- Original length: {result['original_length']} characters\n"
            output += f"- Summary length: {result['summary_length']} characters\n"
            output += f"- Compression ratio: {(1 - result['summary_length']/result['original_length'])*100:.1f}%\n"
            if result.get('document_id'):
                output += f"- Document ID: {result['document_id']}\n"
            return output
        else:
            return f"❌ Summarization failed: {result['error']}"
    except Exception as e:
        logger.error(f"Summarization error: {str(e)}")
        return f"❌ Error: {str(e)}"

def generate_tags_for_document(doc_choice, custom_text, max_tags):
    """Gradio interface for tag generation"""
    try:
        # Debug logging
        logger.info(f"Generate tags called with doc_choice: {doc_choice}, type: {type(doc_choice)}")

        # Get document ID from dropdown choice
        document_id = None
        if doc_choice and doc_choice != "none" and doc_choice != "":
            # The dropdown returns the value part of the (label, value) tuple
            document_id = doc_choice
            logger.info(f"Using document ID: {document_id}")

        # Use custom text if provided, otherwise use document
        if custom_text and custom_text.strip():
            logger.info("Using custom text for tag generation")
            result = mcp_server.run_async(mcp_server.generate_tags_async(content=custom_text, max_tags=int(max_tags)))
        elif document_id:
            logger.info(f"Generating tags for document: {document_id}")
            result = mcp_server.run_async(mcp_server.generate_tags_async(document_id=document_id, max_tags=int(max_tags)))
        else:
            return "Please select a document from the dropdown or enter text to generate tags"

        if result["success"]:
            tags_str = ", ".join(result["tags"])
            output = f"🏷️ Generated Tags:\n\n{tags_str}\n\n"
            output += f"📊 Statistics:\n"
            output += f"- Content length: {result['content_length']} characters\n"
            output += f"- Number of tags: {len(result['tags'])}\n"
            if result.get('document_id'):
                output += f"- Document ID: {result['document_id']}\n"
            output += f"\n✅ Tags have been saved to the document."
            return output
        else:
            return f"❌ Tag generation failed: {result['error']}"
    except Exception as e:
        logger.error(f"Tag generation error: {str(e)}")
        return f"❌ Error: {str(e)}"

def ask_question(question):
    """Gradio interface for Q&A"""
    if not question.strip():
        return "Please enter a question"

    try:
        result = mcp_server.run_async(mcp_server.answer_question_async(question))

        if result["success"]:
            output = f"❓ Question: {result['question']}\n\n"
            output += f"💡 Answer:\n{result['answer']}\n\n"
            output += f"🎯 Confidence: {result['confidence']}\n\n"
            output += f"📚 Sources Used ({len(result['sources'])}):\n"
            for i, source in enumerate(result['sources'], 1):
                filename = source.get('metadata', {}).get('document_filename', 'Unknown')
                output += f"\n{i}. 📄 {filename}\n"
                output += f"   📝 Excerpt: {source['content'][:150]}...\n"
                output += f"   📊 Relevance: {source['score']:.3f}\n"
            return output
        else:
            return f"❌ {result.get('error', 'Failed to answer question')}"
    except Exception as e:
        return f"❌ Error: {str(e)}"

# Create Gradio Interface
def create_gradio_interface():
    with gr.Blocks(title="🧠 Intelligent Content Organizer MCP Agent", theme=gr.themes.Soft()) as interface:
        gr.Markdown("""
        # 🧠 Intelligent Content Organizer MCP Agent

        A powerful MCP (Model Context Protocol) server for intelligent content management with semantic search,
        summarization, and Q&A capabilities powered by Anthropic Claude and Mistral AI.

        ## 🚀 Quick Start:
        1. **Upload Documents** → Go to "📄 Upload Documents" tab
        2. **Search Your Content** → Use "🔍 Search Documents" to find information
        3. **Get Summaries** → Select any document in "📝 Summarize" tab
        4. **Ask Questions** → Get answers from your documents in "❓ Ask Questions" tab
        """)

        # Shared components for document selection
        doc_choices = gr.State(get_document_choices())

        with gr.Tabs():
            # Document Library Tab
            with gr.Tab("📚 Document Library"):
                with gr.Row():
                    with gr.Column():
                        gr.Markdown("### Your Document Collection")
                        document_list = gr.Textbox(
                            label="Documents in Library",
                            value=get_document_list(),
                            lines=20,
                            interactive=False
                        )
                        refresh_btn = gr.Button("🔄 Refresh Library", variant="secondary")

                        refresh_btn.click(
                            fn=get_document_list,
                            outputs=[document_list]
                        )

            # Document Ingestion Tab
            with gr.Tab("📄 Upload Documents"):
                with gr.Row():
                    with gr.Column():
                        gr.Markdown("### Add Documents to Your Library")
                        file_input = gr.File(
                            label="Select Document to Upload",
                            file_types=[".pdf", ".txt", ".docx", ".png", ".jpg", ".jpeg"],
                            type="filepath"
                        )
                        upload_btn = gr.Button("🚀 Process & Add to Library", variant="primary", size="lg")
                    with gr.Column():
                        upload_output = gr.Textbox(
                            label="Processing Result",
                            lines=6,
                            placeholder="Upload a document to see processing results..."
                        )
                        doc_id_output = gr.Textbox(
                            label="Document ID",
                            placeholder="Document ID will appear here after processing..."
                        )

                # Hidden dropdowns for updating
                doc_dropdown_sum = gr.Dropdown(label="Hidden", visible=False)
                doc_dropdown_tag = gr.Dropdown(label="Hidden", visible=False)

                upload_btn.click(
                    upload_and_process_file,
                    inputs=[file_input],
                    outputs=[upload_output, doc_id_output, document_list, doc_dropdown_sum, doc_dropdown_tag, doc_choices]
                )

            # Semantic Search Tab
            with gr.Tab("🔍 Search Documents"):
                with gr.Row():
                    with gr.Column(scale=1):
                        gr.Markdown("### Search Your Document Library")
                        search_query = gr.Textbox(
                            label="What are you looking for?",
                            placeholder="Enter your search query... (e.g., 'machine learning algorithms', 'quarterly revenue', 'project timeline')",
                            lines=2
                        )
                        search_top_k = gr.Slider(
                            label="Number of Results",
                            minimum=1,
                            maximum=20,
                            value=5,
                            step=1
                        )
                        search_btn = gr.Button("🔍 Search Library", variant="primary", size="lg")
                    with gr.Column(scale=2):
                        search_output = gr.Textbox(
                            label="Search Results",
                            lines=20,
                            placeholder="Search results will appear here..."
                        )

                search_btn.click(
                    perform_search,
                    inputs=[search_query, search_top_k],
                    outputs=[search_output]
                )

            # Summarization Tab
            with gr.Tab("📝 Summarize"):
                with gr.Row():
                    with gr.Column():
                        gr.Markdown("### Generate Document Summaries")

                        with gr.Tab("From Library"):
                            doc_dropdown_sum = gr.Dropdown(
                                label="Select Document to Summarize",
                                choices=get_document_choices(),
                                value=None,
                                interactive=True,
                                allow_custom_value=False
                            )

                        with gr.Tab("Custom Text"):
                            summary_text = gr.Textbox(
                                label="Or Paste Text to Summarize",
                                placeholder="Paste any text here to summarize...",
                                lines=8
                            )

                        summary_style = gr.Dropdown(
                            label="Summary Style",
                            choices=["concise", "detailed", "bullet_points", "executive"],
                            value="concise",
                            info="Choose how you want the summary formatted"
                        )
                        summarize_btn = gr.Button("📝 Generate Summary", variant="primary", size="lg")

                    with gr.Column():
                        summary_output = gr.Textbox(
                            label="Generated Summary",
                            lines=20,
                            placeholder="Summary will appear here..."
                        )

                summarize_btn.click(
                    summarize_document,
                    inputs=[doc_dropdown_sum, summary_text, summary_style],
                    outputs=[summary_output]
                )

            # Tag Generation Tab
            with gr.Tab("🏷️ Generate Tags"):
                with gr.Row():
                    with gr.Column():
                        gr.Markdown("### Auto-Generate Document Tags")

                        with gr.Tab("From Library"):
                            doc_dropdown_tag = gr.Dropdown(
                                label="Select Document to Tag",
                                choices=get_document_choices(),
                                value=None,
                                interactive=True,
                                allow_custom_value=False
                            )

                        with gr.Tab("Custom Text"):
                            tag_text = gr.Textbox(
                                label="Or Paste Text to Generate Tags",
                                placeholder="Paste any text here to generate tags...",
                                lines=8
                            )

                        max_tags = gr.Slider(
                            label="Number of Tags",
                            minimum=3,
                            maximum=15,
                            value=5,
                            step=1
                        )
                        tag_btn = gr.Button("🏷️ Generate Tags", variant="primary", size="lg")

                    with gr.Column():
                        tag_output = gr.Textbox(
                            label="Generated Tags",
                            lines=10,
                            placeholder="Tags will appear here..."
                        )

                tag_btn.click(
                    generate_tags_for_document,
                    inputs=[doc_dropdown_tag, tag_text, max_tags],
                    outputs=[tag_output]
                )

            # Q&A Tab
            with gr.Tab("❓ Ask Questions"):
                with gr.Row():
                    with gr.Column():
                        gr.Markdown("""
                        ### Ask Questions About Your Documents

                        The AI will search through all your uploaded documents to find relevant information
                        and provide comprehensive answers with sources.
                        """)
                        qa_question = gr.Textbox(
                            label="Your Question",
                            placeholder="Ask anything about your documents... (e.g., 'What are the key findings about renewable energy?', 'How much was spent on marketing last quarter?')",
                            lines=3
                        )
                        qa_btn = gr.Button("❓ Get Answer", variant="primary", size="lg")

                    with gr.Column():
                        qa_output = gr.Textbox(
                            label="AI Answer",
                            lines=20,
                            placeholder="Answer will appear here with sources..."
                        )

                qa_btn.click(
                    ask_question,
                    inputs=[qa_question],
                    outputs=[qa_output]
                )

        # Auto-refresh document lists when switching tabs
        # (wrap the choices in gr.update so the dropdowns' options, not values, change)
        interface.load(
            fn=lambda: (
                get_document_list(),
                gr.update(choices=get_document_choices()),
                gr.update(choices=get_document_choices()),
            ),
            outputs=[document_list, doc_dropdown_sum, doc_dropdown_tag]
        )

    return interface

# Create and launch the interface
if __name__ == "__main__":
    interface = create_gradio_interface()

    # Launch with proper configuration for Hugging Face Spaces
    interface.launch(mcp_server=True)
```
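For reference, a minimal sketch of driving the server object directly, outside the Gradio UI. This assumes the services above can initialize in the current environment (e.g. any API keys the LLM and embedding services need are configured), and `report.pdf` is a hypothetical input file:

```python
# Ingest a document, then query it, using the sync bridge around the async tools.
ingest = mcp_server.run_async(
    mcp_server.ingest_document_async("report.pdf", ".pdf")
)
print(ingest.get("message"), ingest.get("document_id"))

hits = mcp_server.run_async(
    mcp_server.semantic_search_async("quarterly revenue", top_k=3)
)
for r in hits["results"]:
    print(f"{r['score']:.3f}", r["content"][:80])
```

Note that `interface.launch(mcp_server=True)` relies on a Gradio release with MCP support (the `gradio[mcp]` extra in recent versions), which exposes the app's documented functions as MCP tools alongside the web UI.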
core/__init__.py
ADDED
@@ -0,0 +1 @@

```python
# Core module initialization
```
core/chunker.py
ADDED
@@ -0,0 +1,302 @@

```python
import logging
from typing import List, Dict, Any, Optional
import re

from .models import Chunk
from .text_preprocessor import TextPreprocessor
import config

logger = logging.getLogger(__name__)

class TextChunker:
    def __init__(self):
        self.config = config.config
        self.preprocessor = TextPreprocessor()

        self.chunk_size = self.config.CHUNK_SIZE
        self.chunk_overlap = self.config.CHUNK_OVERLAP

    def chunk_document(self, document_id: str, content: str, method: str = "recursive") -> List[Chunk]:
        """Chunk a document using the specified method"""
        if not content:
            return []

        try:
            if method == "recursive":
                return self._recursive_chunk(document_id, content)
            elif method == "sentence":
                return self._sentence_chunk(document_id, content)
            elif method == "paragraph":
                return self._paragraph_chunk(document_id, content)
            elif method == "fixed":
                return self._fixed_chunk(document_id, content)
            else:
                logger.warning(f"Unknown chunking method: {method}, using recursive")
                return self._recursive_chunk(document_id, content)
        except Exception as e:
            logger.error(f"Error chunking document: {str(e)}")
            # Fallback to simple fixed chunking
            return self._fixed_chunk(document_id, content)

    def _recursive_chunk(self, document_id: str, content: str) -> List[Chunk]:
        """Recursively split text by different separators"""
        chunks = []

        # Define separators in order of preference
        separators = [
            "\n\n",  # Paragraphs
            "\n",    # Lines
            ". ",    # Sentences
            ", ",    # Clauses
            " "      # Words
        ]

        def split_text(text: str, separators: List[str], chunk_size: int) -> List[str]:
            if len(text) <= chunk_size:
                return [text] if text.strip() else []

            if not separators:
                # If no separators left, split by character
                return [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]

            separator = separators[0]
            remaining_separators = separators[1:]

            splits = text.split(separator)
            result = []
            current_chunk = ""

            for split in splits:
                if len(current_chunk) + len(split) + len(separator) <= chunk_size:
                    if current_chunk:
                        current_chunk += separator + split
                    else:
                        current_chunk = split
                else:
                    if current_chunk:
                        result.append(current_chunk)

                    if len(split) > chunk_size:
                        # Split is too big, need to split further
                        result.extend(split_text(split, remaining_separators, chunk_size))
                        current_chunk = ""
                    else:
                        current_chunk = split

            if current_chunk:
                result.append(current_chunk)

            return result

        text_chunks = split_text(content, separators, self.chunk_size)

        # Create chunk objects with overlap
        for i, chunk_text in enumerate(text_chunks):
            if not chunk_text.strip():
                continue

            # Calculate positions
            start_pos = content.find(chunk_text)
            if start_pos == -1:
                start_pos = i * self.chunk_size
            end_pos = start_pos + len(chunk_text)

            # Add overlap from previous chunk if not the first chunk
            if i > 0 and self.chunk_overlap > 0:
                prev_chunk = text_chunks[i-1]
                overlap_text = prev_chunk[-self.chunk_overlap:] if len(prev_chunk) > self.chunk_overlap else prev_chunk
                chunk_text = overlap_text + " " + chunk_text

            chunk = Chunk(
                id=self._generate_chunk_id(document_id, i),
                document_id=document_id,
                content=chunk_text.strip(),
                chunk_index=i,
                start_pos=start_pos,
                end_pos=end_pos,
                metadata={
                    "chunk_method": "recursive",
                    "original_length": len(chunk_text),
                    "word_count": len(chunk_text.split())
                }
            )
            chunks.append(chunk)

        return chunks

    def _sentence_chunk(self, document_id: str, content: str) -> List[Chunk]:
        """Chunk text by sentences"""
        chunks = []
        sentences = self.preprocessor.extract_sentences(content)

        current_chunk = ""
        chunk_index = 0
        start_pos = 0

        for sentence in sentences:
            if len(current_chunk) + len(sentence) <= self.chunk_size:
                if current_chunk:
                    current_chunk += " " + sentence
                else:
                    current_chunk = sentence
                    start_pos = content.find(sentence)
            else:
                if current_chunk:
                    chunk = Chunk(
                        id=self._generate_chunk_id(document_id, chunk_index),
                        document_id=document_id,
                        content=current_chunk.strip(),
                        chunk_index=chunk_index,
                        start_pos=start_pos,
                        end_pos=start_pos + len(current_chunk),
                        metadata={
                            "chunk_method": "sentence",
                            "sentence_count": len(self.preprocessor.extract_sentences(current_chunk))
                        }
                    )
                    chunks.append(chunk)
                    chunk_index += 1

                current_chunk = sentence
                start_pos = content.find(sentence)

        # Add final chunk
        if current_chunk:
            chunk = Chunk(
                id=self._generate_chunk_id(document_id, chunk_index),
                document_id=document_id,
                content=current_chunk.strip(),
                chunk_index=chunk_index,
                start_pos=start_pos,
                end_pos=start_pos + len(current_chunk),
                metadata={
                    "chunk_method": "sentence",
                    "sentence_count": len(self.preprocessor.extract_sentences(current_chunk))
                }
            )
            chunks.append(chunk)

        return chunks

    def _paragraph_chunk(self, document_id: str, content: str) -> List[Chunk]:
        """Chunk text by paragraphs"""
        chunks = []
        paragraphs = [p.strip() for p in content.split('\n\n') if p.strip()]

        current_chunk = ""
        chunk_index = 0
        start_pos = 0

        for paragraph in paragraphs:
            if len(current_chunk) + len(paragraph) <= self.chunk_size:
                if current_chunk:
                    current_chunk += "\n\n" + paragraph
                else:
                    current_chunk = paragraph
                    start_pos = content.find(paragraph)
            else:
                if current_chunk:
                    chunk = Chunk(
                        id=self._generate_chunk_id(document_id, chunk_index),
                        document_id=document_id,
                        content=current_chunk.strip(),
                        chunk_index=chunk_index,
                        start_pos=start_pos,
                        end_pos=start_pos + len(current_chunk),
                        metadata={
                            "chunk_method": "paragraph",
                            "paragraph_count": len([p for p in current_chunk.split('\n\n') if p.strip()])
                        }
                    )
                    chunks.append(chunk)
                    chunk_index += 1

                # If paragraph is too long, split it further
                if len(paragraph) > self.chunk_size:
                    para_chunks = self._fixed_chunk(document_id, paragraph)
                    for pc in para_chunks:
                        pc.chunk_index = chunk_index
                        pc.id = self._generate_chunk_id(document_id, chunk_index)
                        chunks.append(pc)
                        chunk_index += 1
                else:
                    current_chunk = paragraph
                    start_pos = content.find(paragraph)

        # Add final chunk
        if current_chunk:
            chunk = Chunk(
                id=self._generate_chunk_id(document_id, chunk_index),
                document_id=document_id,
                content=current_chunk.strip(),
                chunk_index=chunk_index,
                start_pos=start_pos,
                end_pos=start_pos + len(current_chunk),
                metadata={
                    "chunk_method": "paragraph",
                    "paragraph_count": len([p for p in current_chunk.split('\n\n') if p.strip()])
                }
            )
            chunks.append(chunk)

        return chunks

    def _fixed_chunk(self, document_id: str, content: str) -> List[Chunk]:
        """Simple fixed-size chunking with overlap"""
        chunks = []

        for i in range(0, len(content), self.chunk_size - self.chunk_overlap):
            chunk_text = content[i:i + self.chunk_size]

            if not chunk_text.strip():
                continue

            chunk = Chunk(
                id=self._generate_chunk_id(document_id, len(chunks)),
                document_id=document_id,
                content=chunk_text.strip(),
                chunk_index=len(chunks),
                start_pos=i,
                end_pos=min(i + self.chunk_size, len(content)),
                metadata={
                    "chunk_method": "fixed",
                    "original_length": len(chunk_text)
                }
            )
            chunks.append(chunk)

        return chunks

    def _generate_chunk_id(self, document_id: str, chunk_index: int) -> str:
        """Generate a unique chunk ID"""
        return f"{document_id}_chunk_{chunk_index}"

    def optimize_chunks_for_embedding(self, chunks: List[Chunk]) -> List[Chunk]:
        """Optimize chunks for better embedding generation"""
        optimized_chunks = []

        for chunk in chunks:
            # Clean the content for embedding
            clean_content = self.preprocessor.prepare_for_embedding(chunk.content)

            # Skip very short chunks
            if len(clean_content.split()) < 5:
                continue

            # Update chunk with optimized content
            optimized_chunk = Chunk(
                id=chunk.id,
                document_id=chunk.document_id,
                content=clean_content,
                chunk_index=chunk.chunk_index,
                start_pos=chunk.start_pos,
                end_pos=chunk.end_pos,
                metadata={
                    **chunk.metadata,
                    "optimized_for_embedding": True,
                    "original_content_length": len(chunk.content),
                    "optimized_content_length": len(clean_content)
                }
            )
            optimized_chunks.append(optimized_chunk)

        return optimized_chunks
```
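A short usage sketch of the chunker as defined above. It assumes `config.config` supplies `CHUNK_SIZE` and `CHUNK_OVERLAP`, and `notes.txt` is a hypothetical input file:

```python
from core.chunker import TextChunker

chunker = TextChunker()
text = open("notes.txt", encoding="utf-8").read()

# Split on paragraph boundaries, then clean the chunks for embedding.
chunks = chunker.chunk_document("doc-123", text, method="paragraph")
embeddable = chunker.optimize_chunks_for_embedding(chunks)
for c in embeddable:
    print(c.id, c.chunk_index, len(c.content))
```

As written, `_recursive_chunk` prefers paragraph and line boundaries before sentence, clause, and word splits, and `optimize_chunks_for_embedding` silently drops chunks shorter than five words.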
core/document_parser.py
ADDED
@@ -0,0 +1,199 @@

```python
import logging
import tempfile
import os
from pathlib import Path
from typing import Optional, Dict, Any
import asyncio

# Document processing libraries
import PyPDF2
from docx import Document as DocxDocument
from PIL import Image
import pytesseract

from .models import Document, DocumentType
import config

logger = logging.getLogger(__name__)

class DocumentParser:
    def __init__(self):
        self.config = config.config

    async def parse_document(self, file_path: str, filename: str) -> Document:
        """Parse a document and extract its content"""
        try:
            file_ext = Path(filename).suffix.lower()
            file_size = os.path.getsize(file_path)

            # Determine document type and parse accordingly
            if file_ext == '.pdf':
                content = await self._parse_pdf(file_path)
                doc_type = DocumentType.PDF
            elif file_ext == '.txt':
                content = await self._parse_text(file_path)
                doc_type = DocumentType.TEXT
            elif file_ext == '.docx':
                content = await self._parse_docx(file_path)
                doc_type = DocumentType.DOCX
            elif file_ext in ['.png', '.jpg', '.jpeg', '.bmp', '.tiff']:
                content = await self._parse_image(file_path)
                doc_type = DocumentType.IMAGE
            else:
                raise ValueError(f"Unsupported file type: {file_ext}")

            # Create document object
            document = Document(
                id=self._generate_document_id(),
                filename=filename,
                content=content,
                doc_type=doc_type,
                file_size=file_size,
                metadata={
                    "file_extension": file_ext,
                    "content_length": len(content),
                    "word_count": len(content.split()) if content else 0
                }
            )

            logger.info(f"Successfully parsed document: {filename}")
            return document

        except Exception as e:
            logger.error(f"Error parsing document {filename}: {str(e)}")
            raise

    async def _parse_pdf(self, file_path: str) -> str:
        """Extract text from PDF file"""
        try:
            content = ""
            with open(file_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                for page_num, page in enumerate(pdf_reader.pages):
                    try:
                        page_text = page.extract_text()
                        if page_text.strip():
                            content += f"\n--- Page {page_num + 1} ---\n"
                            content += page_text + "\n"
                    except Exception as e:
                        logger.warning(f"Error extracting text from page {page_num + 1}: {str(e)}")
                        continue

            return content.strip()
        except Exception as e:
            logger.error(f"Error parsing PDF: {str(e)}")
            raise

    async def _parse_text(self, file_path: str) -> str:
        """Read plain text file"""
        try:
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
                content = file.read()
            return content.strip()
        except Exception as e:
            logger.error(f"Error parsing text file: {str(e)}")
            raise

    async def _parse_docx(self, file_path: str) -> str:
        """Extract text from DOCX file"""
        try:
            doc = DocxDocument(file_path)
            content = ""

            for paragraph in doc.paragraphs:
                if paragraph.text.strip():
                    content += paragraph.text + "\n"

            # Extract text from tables
            for table in doc.tables:
                for row in table.rows:
                    row_text = []
                    for cell in row.cells:
                        if cell.text.strip():
                            row_text.append(cell.text.strip())
                    if row_text:
                        content += " | ".join(row_text) + "\n"

            return content.strip()
        except Exception as e:
            logger.error(f"Error parsing DOCX file: {str(e)}")
            raise

    async def _parse_image(self, file_path: str) -> str:
        """Extract text from image using OCR"""
        try:
            # First try with OCR service if available
            if hasattr(self, 'ocr_service') and self.ocr_service:
                logger.info(f"Using OCR service for image: {file_path}")
                text = await self.ocr_service.extract_text_from_image(file_path)
                if text:
                    return text

            # Fallback to direct pytesseract
            logger.info(f"Using direct pytesseract for image: {file_path}")
            image = Image.open(file_path)

            # Perform OCR
            content = pytesseract.image_to_string(
                image,
                lang=self.config.OCR_LANGUAGE,
                config='--psm 6'  # Assume a single uniform block of text
            )

            return content.strip()
        except Exception as e:
            logger.error(f"Error performing OCR on image: {str(e)}")
            # Return empty string if OCR fails
            return ""

    def _generate_document_id(self) -> str:
        """Generate a unique document ID"""
        import uuid
        return str(uuid.uuid4())

    async def extract_metadata(self, file_path: str, content: str) -> Dict[str, Any]:
        """Extract additional metadata from the document"""
        try:
            metadata = {}

            # Basic statistics
            metadata["content_length"] = len(content)
            metadata["word_count"] = len(content.split()) if content else 0
            metadata["line_count"] = len(content.splitlines()) if content else 0

            # File information
            file_stat = os.stat(file_path)
            metadata["file_size"] = file_stat.st_size
            metadata["created_time"] = file_stat.st_ctime
            metadata["modified_time"] = file_stat.st_mtime

            # Content analysis
            if content:
                # Language detection (simple heuristic)
                metadata["estimated_language"] = self._detect_language(content)

                # Reading time estimation (average 200 words per minute)
                metadata["estimated_reading_time_minutes"] = max(1, metadata["word_count"] // 200)

            return metadata
        except Exception as e:
            logger.error(f"Error extracting metadata: {str(e)}")
            return {}

    def _detect_language(self, content: str) -> str:
        """Simple language detection based on character patterns"""
        # This is a very basic implementation
        # In production, you might want to use a proper language detection library
        if not content:
            return "unknown"

        # Count common English words
        english_words = ["the", "and", "or", "but", "in", "on", "at", "to", "for", "of", "with", "by", "from", "as", "is", "was", "are", "were", "be", "been", "have", "has", "had", "do", "does", "did", "will", "would", "could", "should", "may", "might", "can", "this", "that", "these", "those"]

        words = content.lower().split()
        english_count = sum(1 for word in words if word in english_words)

        if len(words) > 0 and english_count / len(words) > 0.1:
            return "en"
        else:
            return "unknown"
```
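Since `parse_document` is a coroutine, a caller outside an event loop would run it with `asyncio.run`. A sketch, assuming the Tesseract binary is installed for image OCR, `config.config.OCR_LANGUAGE` is set, and `contract.pdf` is a hypothetical file:

```python
import asyncio
from core.document_parser import DocumentParser

async def main():
    parser = DocumentParser()
    # parse_document takes (file_path, filename) separately
    doc = await parser.parse_document("contract.pdf", "contract.pdf")
    meta = await parser.extract_metadata("contract.pdf", doc.content)
    print(doc.id, doc.doc_type, meta.get("estimated_reading_time_minutes"))

asyncio.run(main())
```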
core/models.py
ADDED
@@ -0,0 +1,102 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from pydantic import BaseModel, Field
from typing import List, Optional, Dict, Any
from datetime import datetime
from enum import Enum

class DocumentType(str, Enum):
    PDF = "pdf"
    TEXT = "txt"
    DOCX = "docx"
    IMAGE = "image"
    HTML = "html"

class ProcessingStatus(str, Enum):
    PENDING = "pending"
    PROCESSING = "processing"
    COMPLETED = "completed"
    FAILED = "failed"

class Document(BaseModel):
    id: str = Field(..., description="Unique document identifier")
    filename: str = Field(..., description="Original filename")
    content: str = Field(..., description="Extracted text content")
    doc_type: DocumentType = Field(..., description="Document type")
    file_size: int = Field(..., description="File size in bytes")
    created_at: datetime = Field(default_factory=datetime.utcnow)
    metadata: Dict[str, Any] = Field(default_factory=dict)
    tags: List[str] = Field(default_factory=list)
    summary: Optional[str] = None
    category: Optional[str] = None
    language: Optional[str] = None

    def to_dict(self) -> Dict[str, Any]:
        return {
            "id": self.id,
            "filename": self.filename,
            "content": self.content[:500] + "..." if len(self.content) > 500 else self.content,
            "doc_type": self.doc_type,
            "file_size": self.file_size,
            "created_at": self.created_at.isoformat(),
            "metadata": self.metadata,
            "tags": self.tags,
            "summary": self.summary,
            "category": self.category,
            "language": self.language
        }

class Chunk(BaseModel):
    id: str = Field(..., description="Unique chunk identifier")
    document_id: str = Field(..., description="Parent document ID")
    content: str = Field(..., description="Chunk text content")
    chunk_index: int = Field(..., description="Position in document")
    start_pos: int = Field(..., description="Start position in original document")
    end_pos: int = Field(..., description="End position in original document")
    embedding: Optional[List[float]] = None
    metadata: Dict[str, Any] = Field(default_factory=dict)

class SearchResult(BaseModel):
    chunk_id: str = Field(..., description="Matching chunk ID")
    document_id: str = Field(..., description="Source document ID")
    content: str = Field(..., description="Matching content")
    score: float = Field(..., description="Similarity score")
    metadata: Dict[str, Any] = Field(default_factory=dict)

    def to_dict(self) -> Dict[str, Any]:
        return {
            "chunk_id": self.chunk_id,
            "document_id": self.document_id,
            "content": self.content,
            "score": self.score,
            "metadata": self.metadata
        }

class ProcessingTask(BaseModel):
    task_id: str = Field(..., description="Unique task identifier")
    document_id: Optional[str] = None
    status: ProcessingStatus = ProcessingStatus.PENDING
    progress: float = Field(default=0.0, ge=0.0, le=100.0)
    message: Optional[str] = None
    error: Optional[str] = None
    created_at: datetime = Field(default_factory=datetime.utcnow)
    updated_at: datetime = Field(default_factory=datetime.utcnow)

class SummaryRequest(BaseModel):
    content: Optional[str] = None
    document_id: Optional[str] = None
    style: str = Field(default="concise", description="Summary style")
    max_length: Optional[int] = None

class TagGenerationRequest(BaseModel):
    content: Optional[str] = None
    document_id: Optional[str] = None
    max_tags: int = Field(default=5, ge=1, le=20)

class QuestionAnswerRequest(BaseModel):
    question: str = Field(..., description="Question to answer")
    context_filter: Optional[Dict[str, Any]] = None
    max_context_length: int = Field(default=2000)

class CategorizationRequest(BaseModel):
    content: Optional[str] = None
    document_id: Optional[str] = None
    categories: Optional[List[str]] = None
core/text_preprocessor.py
ADDED
@@ -0,0 +1,186 @@
import re
import logging
from typing import List, Optional
import unicodedata

logger = logging.getLogger(__name__)

class TextPreprocessor:
    def __init__(self):
        # Common stop words for basic filtering
        self.stop_words = {
            'en': set([
                'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
                'of', 'with', 'by', 'from', 'up', 'about', 'into', 'through', 'during',
                'before', 'after', 'above', 'below', 'between', 'among', 'throughout',
                'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had',
                'do', 'does', 'did', 'will', 'would', 'could', 'should', 'may', 'might',
                'must', 'shall', 'can', 'this', 'that', 'these', 'those', 'i', 'me',
                'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours'
            ])
        }

    def clean_text(self, text: str, aggressive: bool = False) -> str:
        """Clean and normalize text"""
        if not text:
            return ""

        try:
            # Normalize unicode characters
            text = unicodedata.normalize('NFKD', text)

            # Remove excessive whitespace
            text = re.sub(r'\s+', ' ', text)

            # Remove or replace special characters
            if aggressive:
                # More aggressive cleaning for embedding
                text = re.sub(r'[^\w\s\-.,!?;:]', ' ', text)
                text = re.sub(r'[.,!?;:]+', '.', text)
            else:
                # Basic cleaning for readability
                text = re.sub(r'[^\w\s\-.,!?;:()\[\]{}"\']', ' ', text)

            # Remove excessive punctuation
            text = re.sub(r'\.{2,}', '.', text)
            text = re.sub(r'[!?]{2,}', '!', text)

            # Clean up whitespace again
            text = re.sub(r'\s+', ' ', text)

            # Remove leading/trailing whitespace
            text = text.strip()

            return text
        except Exception as e:
            logger.error(f"Error cleaning text: {str(e)}")
            return text

    def extract_sentences(self, text: str) -> List[str]:
        """Extract sentences from text"""
        if not text:
            return []

        try:
            # Simple sentence splitting
            sentences = re.split(r'[.!?]+', text)

            # Clean and filter sentences
            clean_sentences = []
            for sentence in sentences:
                sentence = sentence.strip()
                if len(sentence) > 10:  # Minimum sentence length
                    clean_sentences.append(sentence)

            return clean_sentences
        except Exception as e:
            logger.error(f"Error extracting sentences: {str(e)}")
            return [text]

    def extract_keywords(self, text: str, language: str = 'en', max_keywords: int = 20) -> List[str]:
        """Extract potential keywords from text"""
        if not text:
            return []

        try:
            # Convert to lowercase and split into words
            words = re.findall(r'\b[a-zA-Z]{3,}\b', text.lower())

            # Remove stop words
            stop_words = self.stop_words.get(language, set())
            keywords = [word for word in words if word not in stop_words]

            # Count word frequency
            word_freq = {}
            for word in keywords:
                word_freq[word] = word_freq.get(word, 0) + 1

            # Sort by frequency and return top keywords
            sorted_keywords = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)

            return [word for word, freq in sorted_keywords[:max_keywords]]
        except Exception as e:
            logger.error(f"Error extracting keywords: {str(e)}")
            return []

    def prepare_for_embedding(self, text: str) -> str:
        """Prepare text specifically for embedding generation"""
        if not text:
            return ""

        try:
            # Clean text aggressively for better embeddings
            clean_text = self.clean_text(text, aggressive=True)

            # Remove very short words
            words = clean_text.split()
            filtered_words = [word for word in words if len(word) >= 2]

            # Rejoin and ensure reasonable length
            result = ' '.join(filtered_words)

            # Truncate if too long (most embedding models have token limits)
            if len(result) > 5000:  # Rough character limit
                result = result[:5000] + "..."

            return result
        except Exception as e:
            logger.error(f"Error preparing text for embedding: {str(e)}")
            return text

    def extract_metadata_from_text(self, text: str) -> dict:
        """Extract metadata from text content"""
        if not text:
            return {}

        try:
            metadata = {}

            # Basic statistics
            metadata['character_count'] = len(text)
            metadata['word_count'] = len(text.split())
            metadata['sentence_count'] = len(self.extract_sentences(text))
            metadata['paragraph_count'] = len([p for p in text.split('\n\n') if p.strip()])

            # Content characteristics
            metadata['avg_word_length'] = sum(len(word) for word in text.split()) / max(1, len(text.split()))
            metadata['avg_sentence_length'] = metadata['word_count'] / max(1, metadata['sentence_count'])

            # Special content detection
            metadata['has_urls'] = bool(re.search(r'https?://\S+', text))
            metadata['has_emails'] = bool(re.search(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', text))
            metadata['has_phone_numbers'] = bool(re.search(r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b', text))
            metadata['has_dates'] = bool(re.search(r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b', text))
            metadata['has_numbers'] = bool(re.search(r'\b\d+\b', text))

            # Language indicators
            metadata['punctuation_density'] = len(re.findall(r'[.,!?;:]', text)) / max(1, len(text))
            metadata['caps_ratio'] = len(re.findall(r'[A-Z]', text)) / max(1, len(text))

            return metadata
        except Exception as e:
            logger.error(f"Error extracting text metadata: {str(e)}")
            return {}

    def normalize_for_search(self, text: str) -> str:
        """Normalize text for search queries"""
        if not text:
            return ""

        try:
            # Convert to lowercase
            text = text.lower()

            # Remove special characters but keep spaces
            text = re.sub(r'[^\w\s]', ' ', text)

            # Normalize whitespace
            text = re.sub(r'\s+', ' ', text)

            # Strip leading/trailing whitespace
            text = text.strip()

            return text
        except Exception as e:
            logger.error(f"Error normalizing text for search: {str(e)}")
            return text
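A minimal usage sketch for TextPreprocessor (not part of the commit; the sample string is invented):

pre = TextPreprocessor()
raw = "Visit   https://example.com!!!  Email me at a@b.co ... thanks!!"
print(pre.clean_text(raw))                              # whitespace collapsed, punctuation runs reduced
print(pre.extract_keywords(raw, max_keywords=3))        # top non-stop-word tokens by frequency
print(pre.extract_metadata_from_text(raw)["has_urls"])  # True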
mcp_server.py
CHANGED
@@ -1,108 +1,203 @@
(Removed in this revision: the previous process_file, process_url, semantic_search, get_document_summary, and get_server_info tool definitions; the rewritten module follows.)

import asyncio
import logging
from typing import Dict, Any, List, Optional
from pathlib import Path

from mcp.server.fastmcp import FastMCP

from services.vector_store_service import VectorStoreService
from services.document_store_service import DocumentStoreService
from services.embedding_service import EmbeddingService
from services.llm_service import LLMService
from services.ocr_service import OCRService

from mcp_tools.ingestion_tool import IngestionTool
from mcp_tools.search_tool import SearchTool
from mcp_tools.generative_tool import GenerativeTool

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

logger.info("Initializing services for FastMCP...")
vector_store_service = VectorStoreService()
document_store_service = DocumentStoreService()
embedding_service_instance = EmbeddingService()
llm_service_instance = LLMService()
ocr_service_instance = OCRService()

ingestion_tool_instance = IngestionTool(
    vector_store=vector_store_service,
    document_store=document_store_service,
    embedding_service=embedding_service_instance,
    ocr_service=ocr_service_instance
)
search_tool_instance = SearchTool(
    vector_store=vector_store_service,
    embedding_service=embedding_service_instance,
    document_store=document_store_service
)
generative_tool_instance = GenerativeTool(
    llm_service=llm_service_instance,
    search_tool=search_tool_instance
)

mcp = FastMCP("intelligent-content-organizer-fmcp")
logger.info("FastMCP server initialized.")

@mcp.tool()
async def ingest_document(file_path: str, file_type: Optional[str] = None) -> Dict[str, Any]:
    """
    Process and index a document from a local file path for searching.
    Automatically determines file_type if not provided.
    """
    logger.info(f"Tool 'ingest_document' called with file_path: {file_path}, file_type: {file_type}")
    try:
        actual_file_type = file_type
        if not actual_file_type:
            actual_file_type = Path(file_path).suffix.lower().strip('.')
            logger.info(f"Inferred file_type: {actual_file_type}")
        result = await ingestion_tool_instance.process_document(file_path, actual_file_type)
        logger.info(f"Ingestion result: {result}")
        return result
    except Exception as e:
        logger.error(f"Error in 'ingest_document' tool: {str(e)}", exc_info=True)
        return {"success": False, "error": str(e)}

@mcp.tool()
async def semantic_search(query: str, top_k: int = 5, filters: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
    """
    Search through indexed content using natural language.
    'filters' can be used to narrow down the search.
    """
    logger.info(f"Tool 'semantic_search' called with query: {query}, top_k: {top_k}, filters: {filters}")
    try:
        results = await search_tool_instance.search(query, top_k, filters)
        return {
            "success": True,
            "query": query,
            "results": [result.to_dict() for result in results],
            "total_results": len(results)
        }
    except Exception as e:
        logger.error(f"Error in 'semantic_search' tool: {str(e)}", exc_info=True)
        return {"success": False, "error": str(e), "results": []}

@mcp.tool()
async def summarize_content(
    content: Optional[str] = None,
    document_id: Optional[str] = None,
    style: str = "concise"
) -> Dict[str, Any]:
    """
    Generate a summary of provided content or a document_id.
    Available styles: concise, detailed, bullet_points, executive.
    """
    logger.info(f"Tool 'summarize_content' called. doc_id: {document_id}, style: {style}, has_content: {content is not None}")
    try:
        text_to_summarize = content
        if document_id and not text_to_summarize:
            doc = await document_store_service.get_document(document_id)
            if not doc:
                return {"success": False, "error": f"Document {document_id} not found"}
            text_to_summarize = doc.content
        if not text_to_summarize:
            return {"success": False, "error": "No content provided for summarization"}
        max_length = 10000
        if len(text_to_summarize) > max_length:
            logger.warning(f"Content for summarization is long ({len(text_to_summarize)} chars), truncating to {max_length}")
            text_to_summarize = text_to_summarize[:max_length] + "..."
        summary = await generative_tool_instance.summarize(text_to_summarize, style)
        return {
            "success": True,
            "summary": summary,
            "original_length": len(text_to_summarize),
            "summary_length": len(summary),
            "style": style
        }
    except Exception as e:
        logger.error(f"Error in 'summarize_content' tool: {str(e)}", exc_info=True)
        return {"success": False, "error": str(e)}

@mcp.tool()
async def generate_tags(
    content: Optional[str] = None,
    document_id: Optional[str] = None,
    max_tags: int = 5
) -> Dict[str, Any]:
    """
    Generate relevant tags for content or a document_id.
    Saves tags to document metadata if document_id is provided.
    """
    logger.info(f"Tool 'generate_tags' called. doc_id: {document_id}, max_tags: {max_tags}, has_content: {content is not None}")
    try:
        text_for_tags = content
        if document_id and not text_for_tags:
            doc = await document_store_service.get_document(document_id)
            if not doc:
                return {"success": False, "error": f"Document {document_id} not found"}
            text_for_tags = doc.content
        if not text_for_tags:
            return {"success": False, "error": "No content provided for tag generation"}
        tags = await generative_tool_instance.generate_tags(text_for_tags, max_tags)
        if document_id and tags:
            await document_store_service.update_document_metadata(document_id, {"tags": tags})
            logger.info(f"Tags {tags} saved for document {document_id}")
        return {
            "success": True,
            "tags": tags,
            "content_length": len(text_for_tags),
            "document_id": document_id
        }
    except Exception as e:
        logger.error(f"Error in 'generate_tags' tool: {str(e)}", exc_info=True)
        return {"success": False, "error": str(e)}

@mcp.tool()
async def answer_question(question: str, context_filter: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
    """
    Answer questions using RAG (Retrieval Augmented Generation) over indexed content.
    'context_filter' can be used to narrow down the context search.
    """
    logger.info(f"Tool 'answer_question' called with question: {question}, context_filter: {context_filter}")
    try:
        search_results = await search_tool_instance.search(question, top_k=5, filters=context_filter)
        if not search_results:
            return {
                "success": False,
                "error": "No relevant context found. Please upload relevant documents.",
                "question": question,
                "answer": "I could not find enough information in the documents to answer your question."
            }
        answer = await generative_tool_instance.answer_question(question, search_results)
        return {
            "success": True,
            "question": question,
            "answer": answer,
            "sources": [result.to_dict() for result in search_results],
            "confidence": "high" if len(search_results) >= 3 else "medium"
        }
    except Exception as e:
        logger.error(f"Error in 'answer_question' tool: {str(e)}", exc_info=True)
        return {"success": False, "error": str(e)}

@mcp.tool()
async def list_documents_for_ui(limit: int = 100, offset: int = 0) -> Dict[str, Any]:
    """
    (UI Helper) List documents from the document store.
    Not a standard processing tool, but useful for UI population.
    """
    logger.info(f"Tool 'list_documents_for_ui' called with limit: {limit}, offset: {offset}")
    try:
        documents = await document_store_service.list_documents(limit, offset)
        return {
            "success": True,
            "documents": [doc.to_dict() for doc in documents],
            "total": len(documents)
        }
    except Exception as e:
        logger.error(f"Error in 'list_documents_for_ui' tool: {str(e)}", exc_info=True)
        return {"success": False, "error": str(e), "documents": []}

if __name__ == "__main__":
    logger.info("Starting FastMCP server...")
    asyncio.run(mcp.run())
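A local smoke test for the rewritten tools (a sketch, not part of the commit): it assumes FastMCP's @mcp.tool() decorator leaves each coroutine directly callable, that the services above can initialize locally, and that samples/report.pdf is a file you supply yourself.

import asyncio

async def smoke_test():
    ingested = await ingest_document("samples/report.pdf")  # hypothetical sample path
    print(ingested)
    if ingested.get("success"):
        hits = await semantic_search("key findings", top_k=3)
        for r in hits.get("results", []):
            print(r["score"], r["content"][:80])

asyncio.run(smoke_test())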
mcp_tools.py
DELETED
@@ -1,592 +0,0 @@
import asyncio
import aiohttp
import chromadb
from chromadb.utils import embedding_functions
import json
import logging
from typing import Dict, List, Any, Optional
from datetime import datetime
import hashlib
from pathlib import Path
import requests

# Document processing libraries (all free)
import PyPDF2
import docx
from bs4 import BeautifulSoup
import pandas as pd
import markdown
import xml.etree.ElementTree as ET
from newspaper import Article
import trafilatura
from duckduckgo_search import DDGS

# AI libraries
from config import Config
from mistralai.client import MistralClient
import anthropic

# Set up logging
logger = logging.getLogger(__name__)

# Initialize AI clients
mistral_client = MistralClient(api_key=Config.MISTRAL_API_KEY) if Config.MISTRAL_API_KEY else None
anthropic_client = anthropic.Anthropic(api_key=Config.ANTHROPIC_API_KEY) if Config.ANTHROPIC_API_KEY else None

# Initialize ChromaDB
chroma_client = chromadb.PersistentClient(path=Config.CHROMA_DB_PATH)
embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name=Config.EMBEDDING_MODEL
)

# Get or create collection
try:
    collection = chroma_client.get_collection(
        name=Config.CHROMA_COLLECTION_NAME,
        embedding_function=embedding_function
    )
except:
    collection = chroma_client.create_collection(
        name=Config.CHROMA_COLLECTION_NAME,
        embedding_function=embedding_function
    )

class DocumentProcessor:
    """Free document processing without Unstructured API"""

    @staticmethod
    def extract_text_from_pdf(file_path: str) -> str:
        """Extract text from PDF files"""
        text = ""
        try:
            with open(file_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                for page_num in range(len(pdf_reader.pages)):
                    page = pdf_reader.pages[page_num]
                    text += page.extract_text() + "\n"
        except Exception as e:
            logger.error(f"Error reading PDF: {e}")
        return text

    @staticmethod
    def extract_text_from_docx(file_path: str) -> str:
        """Extract text from DOCX files"""
        try:
            doc = docx.Document(file_path)
            text = "\n".join([paragraph.text for paragraph in doc.paragraphs])
            return text
        except Exception as e:
            logger.error(f"Error reading DOCX: {e}")
            return ""

    @staticmethod
    def extract_text_from_html(file_path: str) -> str:
        """Extract text from HTML files"""
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                soup = BeautifulSoup(file.read(), 'html.parser')
                # Remove script and style elements
                for script in soup(["script", "style"]):
                    script.extract()
                text = soup.get_text()
                lines = (line.strip() for line in text.splitlines())
                chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
                text = '\n'.join(chunk for chunk in chunks if chunk)
                return text
        except Exception as e:
            logger.error(f"Error reading HTML: {e}")
            return ""

    @staticmethod
    def extract_text_from_txt(file_path: str) -> str:
        """Extract text from TXT files"""
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                return file.read()
        except Exception as e:
            logger.error(f"Error reading TXT: {e}")
            return ""

    @staticmethod
    def extract_text_from_csv(file_path: str) -> str:
        """Extract text from CSV files"""
        try:
            df = pd.read_csv(file_path)
            return df.to_string()
        except Exception as e:
            logger.error(f"Error reading CSV: {e}")
            return ""

    @staticmethod
    def extract_text_from_json(file_path: str) -> str:
        """Extract text from JSON files"""
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                data = json.load(file)
                return json.dumps(data, indent=2)
        except Exception as e:
            logger.error(f"Error reading JSON: {e}")
            return ""

    @staticmethod
    def extract_text_from_markdown(file_path: str) -> str:
        """Extract text from Markdown files"""
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                md_text = file.read()
                html = markdown.markdown(md_text)
                soup = BeautifulSoup(html, 'html.parser')
                return soup.get_text()
        except Exception as e:
            logger.error(f"Error reading Markdown: {e}")
            return ""

    @staticmethod
    def extract_text_from_xml(file_path: str) -> str:
        """Extract text from XML files"""
        try:
            tree = ET.parse(file_path)
            root = tree.getroot()

            def extract_text(element):
                text = element.text or ""
                for child in element:
                    text += " " + extract_text(child)
                return text.strip()

            return extract_text(root)
        except Exception as e:
            logger.error(f"Error reading XML: {e}")
            return ""

    @classmethod
    def extract_text(cls, file_path: str) -> str:
        """Extract text from any supported file type"""
        path = Path(file_path)
        extension = path.suffix.lower()

        extractors = {
            '.pdf': cls.extract_text_from_pdf,
            '.docx': cls.extract_text_from_docx,
            '.doc': cls.extract_text_from_docx,
            '.html': cls.extract_text_from_html,
            '.htm': cls.extract_text_from_html,
            '.txt': cls.extract_text_from_txt,
            '.csv': cls.extract_text_from_csv,
            '.json': cls.extract_text_from_json,
            '.md': cls.extract_text_from_markdown,
            '.xml': cls.extract_text_from_xml,
        }

        extractor = extractors.get(extension, cls.extract_text_from_txt)
        return extractor(file_path)

def chunk_text(text: str, chunk_size: int = 1000, overlap: int = 100) -> List[str]:
    """Split text into chunks with overlap"""
    chunks = []
    start = 0
    text_length = len(text)

    while start < text_length:
        end = start + chunk_size
        chunk = text[start:end]

        # Try to find a sentence boundary
        if end < text_length:
            last_period = chunk.rfind('.')
            last_newline = chunk.rfind('\n')
            boundary = max(last_period, last_newline)

            if boundary > chunk_size // 2:
                chunk = text[start:start + boundary + 1]
                end = start + boundary + 1

        chunks.append(chunk.strip())
        start = end - overlap

    return chunks

async def fetch_web_content_free(url: str) -> Optional[str]:
    """Fetch content from URL using multiple free methods"""

    # Method 1: Try newspaper3k (best for articles)
    try:
        article = Article(url)
        article.download()
        article.parse()

        content = f"{article.title}\n\n{article.text}"
        if len(content) > 100:  # Valid content
            return content
    except Exception as e:
        logger.debug(f"Newspaper failed: {e}")

    # Method 2: Try trafilatura (great for web scraping)
    try:
        downloaded = trafilatura.fetch_url(url)
        content = trafilatura.extract(downloaded)
        if content and len(content) > 100:
            return content
    except Exception as e:
        logger.debug(f"Trafilatura failed: {e}")

    # Method 3: Basic BeautifulSoup scraping
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        response = requests.get(url, headers=headers, timeout=10)

        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')

            # Remove unwanted elements
            for element in soup(['script', 'style', 'nav', 'footer', 'header']):
                element.decompose()

            # Try to find main content
            main_content = None

            # Common content selectors
            content_selectors = [
                'main', 'article', '[role="main"]',
                '.content', '#content', '.post', '.entry-content',
                '.article-body', '.story-body'
            ]

            for selector in content_selectors:
                main_content = soup.select_one(selector)
                if main_content:
                    break

            if not main_content:
                main_content = soup.find('body')

            if main_content:
                text = main_content.get_text(separator='\n', strip=True)

                # Get title
                title = soup.find('title')
                title_text = title.get_text() if title else "No title"

                return f"{title_text}\n\n{text}"

    except Exception as e:
        logger.error(f"BeautifulSoup failed: {e}")

    return None

async def search_web_free(query: str, num_results: int = 5) -> List[Dict[str, str]]:
    """Search the web using free methods (DuckDuckGo)"""
    try:
        results = []
        with DDGS() as ddgs:
            for r in ddgs.text(query, max_results=num_results):
                results.append({
                    'title': r.get('title', ''),
                    'url': r.get('link', ''),
                    'snippet': r.get('body', '')
                })

        return results

    except Exception as e:
        logger.error(f"Search failed: {e}")
        return []

async def generate_tags(content: str) -> List[str]:
    """Generate tags using Mistral AI or fallback to free method"""
    try:
        if mistral_client:  # This is MistralClient from mistralai.client
            prompt = f"""Analyze this content and generate 5-7 relevant tags.
Return only the tags as a comma-separated list.

Content: {content[:2000]}...

Tags:"""

            # For mistralai==0.4.2, pass messages as a list of dicts
            response = mistral_client.chat(
                model=Config.MISTRAL_MODEL,
                messages=[{"role": "user", "content": prompt}]
            )

            tags_text = response.choices[0].message.content.strip()
            tags = [tag.strip() for tag in tags_text.split(",")]
            return tags[:7]
        else:
            # Free fallback: Extract keywords using frequency analysis
            return generate_tags_free(content)

    except Exception as e:
        logger.error(f"Error generating tags: {str(e)}")
        return generate_tags_free(content)

def generate_tags_free(content: str) -> List[str]:
    """Free tag generation using keyword extraction"""
    from collections import Counter
    import re

    # Simple keyword extraction
    words = re.findall(r'\b[a-z]{4,}\b', content.lower())

    # Common stop words
    stop_words = {
        'this', 'that', 'these', 'those', 'what', 'which', 'when', 'where',
        'who', 'whom', 'whose', 'why', 'how', 'with', 'about', 'against',
        'between', 'into', 'through', 'during', 'before', 'after', 'above',
        'below', 'from', 'down', 'out', 'off', 'over', 'under', 'again',
        'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why',
        'how', 'all', 'both', 'each', 'few', 'more', 'most', 'other', 'some',
        'such', 'only', 'same', 'than', 'that', 'have', 'has', 'had',
        'been', 'being', 'does', 'doing', 'will', 'would', 'could', 'should'
    }

    # Filter and count words
    filtered_words = [w for w in words if w not in stop_words and len(w) > 4]
    word_counts = Counter(filtered_words)

    # Get top keywords
    top_keywords = [word for word, _ in word_counts.most_common(7)]

    return top_keywords if top_keywords else ["untagged"]

async def generate_summary(content: str) -> str:
    """Generate summary using Claude or fallback to free method"""
    try:
        if anthropic_client:
            message = anthropic_client.messages.create(
                model=Config.CLAUDE_MODEL,
                max_tokens=300,
                messages=[{
                    "role": "user",
                    "content": f"Summarize this content in 2-3 sentences:\n\n{content[:4000]}..."
                }]
            )

            return message.content[0].text.strip()
        else:
            # Free fallback
            return generate_summary_free(content)

    except Exception as e:
        logger.error(f"Error generating summary: {str(e)}")
        return generate_summary_free(content)

def generate_summary_free(content: str) -> str:
    """Free summary generation using simple extraction"""
    sentences = content.split('.')
    # Take first 3 sentences
    summary_sentences = sentences[:3]
    summary = '. '.join(s.strip() for s in summary_sentences if s.strip())

    if len(summary) > 300:
        summary = summary[:297] + "..."

    return summary if summary else "Content preview: " + content[:200] + "..."

async def process_local_file(file_path: str) -> Dict[str, Any]:
    """Process a local file and store it in the knowledge base"""
    try:
        # Validate file
        path = Path(file_path)
        if not path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")

        if path.suffix.lower() not in Config.SUPPORTED_FILE_TYPES:
            raise ValueError(f"Unsupported file type: {path.suffix}")

        # Extract text using free methods
        full_text = DocumentProcessor.extract_text(file_path)

        if not full_text:
            raise ValueError("No text could be extracted from the file")

        # Generate document ID
        doc_id = hashlib.md5(f"{path.name}_{datetime.now().isoformat()}".encode()).hexdigest()

        # Generate tags
        tags = await generate_tags(full_text[:3000])

        # Generate summary
        summary = await generate_summary(full_text[:5000])

        # Chunk the text
        chunks = chunk_text(full_text, chunk_size=1000, overlap=100)
        chunks = chunks[:10]  # Limit chunks for demo

        # Store in ChromaDB
        chunk_ids = [f"{doc_id}_{i}" for i in range(len(chunks))]

        metadata = {
            "source": str(path),
            "file_name": path.name,
            "file_type": path.suffix,
            "processed_at": datetime.now().isoformat(),
            "tags": ", ".join(tags),
            "summary": summary,
            "doc_id": doc_id
        }

        collection.add(
            documents=chunks,
            ids=chunk_ids,
            metadatas=[metadata for _ in chunks]
        )

        return {
            "success": True,
            "doc_id": doc_id,
            "file_name": path.name,
            "tags": tags,
            "summary": summary,
            "chunks_processed": len(chunks),
            "metadata": metadata
        }

    except Exception as e:
        logger.error(f"Error processing file: {str(e)}")
        return {
            "success": False,
            "error": str(e)
        }

async def process_web_content(url_or_query: str) -> Dict[str, Any]:
    """Process web content from URL or search query"""
    try:
        # Check if it's a URL or search query
        is_url = url_or_query.startswith(('http://', 'https://'))

        if is_url:
            content = await fetch_web_content_free(url_or_query)
            source = url_or_query
        else:
            # It's a search query
            search_results = await search_web_free(url_or_query, num_results=3)
            if not search_results:
                raise ValueError("No search results found")

            # Process the first result
            first_result = search_results[0]
            content = await fetch_web_content_free(first_result['url'])
            source = first_result['url']

            # Add search context
            content = f"Search Query: {url_or_query}\n\n{first_result['title']}\n\n{content}"

        if not content:
            raise ValueError("Failed to fetch content")

        # Generate document ID
        doc_id = hashlib.md5(f"{source}_{datetime.now().isoformat()}".encode()).hexdigest()

        # Generate tags
        tags = await generate_tags(content[:3000])

        # Generate summary
        summary = await generate_summary(content[:5000])

        # Chunk the content
        chunks = chunk_text(content, chunk_size=1000, overlap=100)
        chunks = chunks[:10]  # Limit for demo

        # Store in ChromaDB
        chunk_ids = [f"{doc_id}_{i}" for i in range(len(chunks))]

        metadata = {
            "source": source,
            "url": source if is_url else f"Search: {url_or_query}",
            "content_type": "web",
            "processed_at": datetime.now().isoformat(),
            "tags": ", ".join(tags),
            "summary": summary,
            "doc_id": doc_id
        }

        collection.add(
            documents=chunks,
            ids=chunk_ids,
            metadatas=[metadata for _ in chunks]
        )

        return {
            "success": True,
            "doc_id": doc_id,
            "url": source,
            "tags": tags,
            "summary": summary,
            "chunks_processed": len(chunks),
            "metadata": metadata,
            "search_query": url_or_query if not is_url else None
        }

    except Exception as e:
        logger.error(f"Error processing web content: {str(e)}")
        return {
            "success": False,
            "error": str(e)
        }

async def search_knowledge_base(query: str, limit: int = 5) -> List[Dict[str, Any]]:
    """Perform semantic search in the knowledge base"""
    try:
        results = collection.query(
            query_texts=[query],
            n_results=limit
        )

        if not results["ids"][0]:
            return []

        # Format results
        formatted_results = []
        seen_docs = set()

        for i, doc_id in enumerate(results["ids"][0]):
            metadata = results["metadatas"][0][i]

            # Deduplicate by document
            if metadata["doc_id"] not in seen_docs:
                seen_docs.add(metadata["doc_id"])
                formatted_results.append({
                    "doc_id": metadata["doc_id"],
                    "source": metadata.get("source", "Unknown"),
                    "tags": metadata.get("tags", "").split(", "),
                    "summary": metadata.get("summary", ""),
                    "relevance_score": 1 - results["distances"][0][i],
                    "processed_at": metadata.get("processed_at", "")
                })

        return formatted_results

    except Exception as e:
        logger.error(f"Error searching knowledge base: {str(e)}")
        return []

async def get_document_details(doc_id: str) -> Dict[str, Any]:
    """Get detailed information about a document"""
    try:
        results = collection.get(
            where={"doc_id": doc_id},
            limit=1
        )

        if not results["ids"]:
            return {"error": "Document not found"}

        metadata = results["metadatas"][0]
        return {
            "doc_id": doc_id,
            "source": metadata.get("source", "Unknown"),
            "tags": metadata.get("tags", "").split(", "),
            "summary": metadata.get("summary", ""),
            "processed_at": metadata.get("processed_at", ""),
            "file_type": metadata.get("file_type", ""),
            "content_preview": results["documents"][0][:500] + "..."
        }

    except Exception as e:
        logger.error(f"Error getting document details: {str(e)}")
        return {"error": str(e)}
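The boundary/overlap behavior of the deleted chunk_text is easy to verify in isolation (a sketch; the sample text is invented): each chunk tries to end on a sentence boundary past the midpoint of the window, and the next chunk restarts 100 characters earlier.

text = "First sentence here. " * 120            # ~2,500 characters
pieces = chunk_text(text, chunk_size=1000, overlap=100)
print(len(pieces))                               # a handful of overlapping chunks
print(all(p.endswith(".") for p in pieces))      # True: boundaries land on periods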
mcp_tools/__init__.py
ADDED
@@ -0,0 +1 @@
# MCP tools module initialization
mcp_tools/generative_tool.py
ADDED
@@ -0,0 +1,342 @@
import logging
from typing import List, Dict, Any, Optional
import asyncio

from services.llm_service import LLMService
from mcp_tools.search_tool import SearchTool
from core.models import SearchResult

logger = logging.getLogger(__name__)

class GenerativeTool:
    def __init__(self, llm_service: LLMService, search_tool: Optional[SearchTool] = None):
        self.llm_service = llm_service
        self.search_tool = search_tool

    async def summarize(self, content: str, style: str = "concise", max_length: Optional[int] = None) -> str:
        """Generate a summary of the given content"""
        try:
            if not content.strip():
                return "No content provided for summarization."

            logger.info(f"Generating {style} summary for content of length {len(content)}")

            summary = await self.llm_service.summarize(content, style, max_length)

            logger.info(f"Generated summary of length {len(summary)}")
            return summary

        except Exception as e:
            logger.error(f"Error generating summary: {str(e)}")
            return f"Error generating summary: {str(e)}"

    async def generate_tags(self, content: str, max_tags: int = 5) -> List[str]:
        """Generate relevant tags for the given content"""
        try:
            if not content.strip():
                return []

            logger.info(f"Generating up to {max_tags} tags for content")

            tags = await self.llm_service.generate_tags(content, max_tags)

            logger.info(f"Generated {len(tags)} tags")
            return tags

        except Exception as e:
            logger.error(f"Error generating tags: {str(e)}")
            return []

    async def categorize(self, content: str, categories: List[str]) -> str:
        """Categorize content into one of the provided categories"""
        try:
            if not content.strip():
                return "Uncategorized"

            if not categories:
                categories = ["Technology", "Business", "Science", "Education", "Entertainment", "News", "Research", "Other"]

            logger.info(f"Categorizing content into one of {len(categories)} categories")

            category = await self.llm_service.categorize(content, categories)

            logger.info(f"Categorized as: {category}")
            return category

        except Exception as e:
            logger.error(f"Error categorizing content: {str(e)}")
            return "Uncategorized"

    async def answer_question(self, question: str, context_results: Optional[List[SearchResult]] = None) -> str:
        """Answer a question using the provided context or RAG"""
        try:
            if not question.strip():
                return "No question provided."

            logger.info(f"Answering question: {question[:100]}...")

            # If no context provided and a search tool is available, search for relevant context
            if not context_results and self.search_tool:
                logger.info("No context provided, searching for relevant information")
                context_results = await self.search_tool.search(question, top_k=5)

            # Prepare context from search results
            if context_results:
                context_texts = []
                for result in context_results:
                    context_texts.append(f"Source: {result.document_id}\nContent: {result.content}\n")

                context = "\n---\n".join(context_texts)
                logger.info(f"Using context from {len(context_results)} sources")
            else:
                context = ""
                logger.info("No context available for answering question")

            # Generate answer
            answer = await self.llm_service.answer_question(question, context)

            logger.info(f"Generated answer of length {len(answer)}")
            return answer

        except Exception as e:
            logger.error(f"Error answering question: {str(e)}")
            return f"I encountered an error while trying to answer your question: {str(e)}"

    async def generate_outline(self, topic: str, num_sections: int = 5, detail_level: str = "medium") -> str:
        """Generate an outline for the given topic"""
        try:
            if not topic.strip():
                return "No topic provided."

            detail_descriptions = {
                "brief": "brief bullet points",
                "medium": "detailed bullet points with descriptions",
                "detailed": "comprehensive outline with sub-sections and explanations"
            }

            detail_desc = detail_descriptions.get(detail_level, "detailed bullet points")

            prompt = f"""Create a {detail_desc} outline for the topic: "{topic}"

The outline should have {num_sections} main sections and be well-structured and informative.

Format the outline clearly with proper numbering and indentation.

Topic: {topic}

Outline:"""

            outline = await self.llm_service.generate_text(prompt, max_tokens=800, temperature=0.7)

            logger.info(f"Generated outline for topic: {topic}")
            return outline

        except Exception as e:
            logger.error(f"Error generating outline: {str(e)}")
            return f"Error generating outline: {str(e)}"

    async def explain_concept(self, concept: str, audience: str = "general", length: str = "medium") -> str:
        """Explain a concept for a specific audience"""
        try:
            if not concept.strip():
                return "No concept provided."

            audience_styles = {
                "general": "a general audience using simple, clear language",
                "technical": "a technical audience with appropriate jargon and detail",
                "beginner": "beginners with no prior knowledge, using analogies and examples",
                "expert": "experts in the field with advanced terminology and depth"
            }

            length_guidance = {
                "brief": "Keep the explanation concise and to the point (2-3 paragraphs).",
                "medium": "Provide a comprehensive explanation (4-6 paragraphs).",
                "detailed": "Give a thorough, in-depth explanation with examples."
            }

            audience_desc = audience_styles.get(audience, "a general audience")
            length_desc = length_guidance.get(length, "Provide a comprehensive explanation.")

            prompt = f"""Explain the concept of "{concept}" for {audience_desc}.

{length_desc}

Make sure to:
- Use appropriate language for the audience
- Include relevant examples or analogies
- Structure the explanation logically
- Ensure clarity and accuracy

Concept to explain: {concept}

Explanation:"""

            explanation = await self.llm_service.generate_text(prompt, max_tokens=600, temperature=0.5)

            logger.info(f"Generated explanation for concept: {concept}")
            return explanation

        except Exception as e:
            logger.error(f"Error explaining concept: {str(e)}")
            return f"Error explaining concept: {str(e)}"

    async def compare_concepts(self, concept1: str, concept2: str, aspects: Optional[List[str]] = None) -> str:
        """Compare two concepts across specified aspects"""
        try:
            if not concept1.strip() or not concept2.strip():
                return "Both concepts must be provided for comparison."

            if not aspects:
                aspects = ["definition", "key features", "advantages", "disadvantages", "use cases"]

            aspects_str = ", ".join(aspects)

            prompt = f"""Compare and contrast "{concept1}" and "{concept2}" across the following aspects: {aspects_str}.

Structure your comparison clearly, addressing each aspect for both concepts.

Format:
## Comparison: {concept1} vs {concept2}

For each aspect, provide:
- **{concept1}**: [description]
- **{concept2}**: [description]
- **Key Difference**: [summary]

Concepts to compare:
1. {concept1}
2. {concept2}

Comparison:"""

            comparison = await self.llm_service.generate_text(prompt, max_tokens=800, temperature=0.6)

            logger.info(f"Generated comparison between {concept1} and {concept2}")
            return comparison

        except Exception as e:
            logger.error(f"Error comparing concepts: {str(e)}")
            return f"Error comparing concepts: {str(e)}"

    async def generate_questions(self, content: str, question_type: str = "comprehension", num_questions: int = 5) -> List[str]:
        """Generate questions based on the provided content"""
        try:
            if not content.strip():
                return []

            question_types = {
                "comprehension": "comprehension questions that test understanding of key concepts",
                "analysis": "analytical questions that require deeper thinking and evaluation",
                "application": "application questions that ask how to use the concepts in practice",
                "creative": "creative questions that encourage original thinking and exploration",
                "factual": "factual questions about specific details and information"
            }

            question_desc = question_types.get(question_type, "comprehension questions")

            # Content is truncated to 2000 characters to keep the prompt bounded.
            prompt = f"""Based on the following content, generate {num_questions} {question_desc}.

The questions should be:
- Clear and well-formulated
- Relevant to the content
- Appropriate for the specified type
- Engaging and thought-provoking

Content:
{content[:2000]}

Questions:"""

            response = await self.llm_service.generate_text(prompt, max_tokens=400, temperature=0.7)

            # Parse questions from response
            questions = []
            lines = response.split('\n')

            for line in lines:
                line = line.strip()
                if line and ('?' in line or line.startswith(('1.', '2.', '3.', '4.', '5.', '-', '*'))):
                    # Clean up the question
                    question = line.lstrip('0123456789.-* ').strip()
                    if question and '?' in question:
                        questions.append(question)

            logger.info(f"Generated {len(questions)} {question_type} questions")
            return questions[:num_questions]

        except Exception as e:
            logger.error(f"Error generating questions: {str(e)}")
            return []

    async def paraphrase_text(self, text: str, style: str = "formal", preserve_meaning: bool = True) -> str:
        """Paraphrase text in a different style while preserving meaning"""
        try:
            if not text.strip():
                return "No text provided for paraphrasing."

            style_instructions = {
                "formal": "formal, professional language",
                "casual": "casual, conversational language",
                "academic": "academic, scholarly language",
                "simple": "simple, easy-to-understand language",
                "technical": "technical, precise language"
            }

            style_desc = style_instructions.get(style, "clear, appropriate language")
            meaning_instruction = "while preserving the exact meaning and key information" if preserve_meaning else "while maintaining the general intent"

            prompt = f"""Paraphrase the following text using {style_desc} {meaning_instruction}.

Original text:
{text}

Paraphrased text:"""

            paraphrase = await self.llm_service.generate_text(prompt, max_tokens=len(text.split()) * 2, temperature=0.6)

            logger.info(f"Paraphrased text in {style} style")
            return paraphrase.strip()

        except Exception as e:
            logger.error(f"Error paraphrasing text: {str(e)}")
            return f"Error paraphrasing text: {str(e)}"

    async def extract_key_insights(self, content: str, num_insights: int = 5) -> List[str]:
        """Extract key insights from the provided content"""
        try:
            if not content.strip():
                return []

            prompt = f"""Analyze the following content and extract {num_insights} key insights or takeaways.
+
|
312 |
+
Each insight should be:
|
313 |
+
- A clear, concise statement
|
314 |
+
- Significant and meaningful
|
315 |
+
- Based on the content provided
|
316 |
+
- Actionable or thought-provoking when possible
|
317 |
+
|
318 |
+
Content:
|
319 |
+
{content[:3000]} # Limit content length
|
320 |
+
|
321 |
+
Key Insights:"""
|
322 |
+
|
323 |
+
response = await self.llm_service.generate_text(prompt, max_tokens=400, temperature=0.6)
|
324 |
+
|
325 |
+
# Parse insights from response
|
326 |
+
insights = []
|
327 |
+
lines = response.split('\n')
|
328 |
+
|
329 |
+
for line in lines:
|
330 |
+
line = line.strip()
|
331 |
+
if line and (line.startswith(('1.', '2.', '3.', '4.', '5.', '-', '*')) or len(insights) == 0):
|
332 |
+
# Clean up the insight
|
333 |
+
insight = line.lstrip('0123456789.-* ').strip()
|
334 |
+
if insight and len(insight) > 10: # Minimum insight length
|
335 |
+
insights.append(insight)
|
336 |
+
|
337 |
+
logger.info(f"Extracted {len(insights)} key insights")
|
338 |
+
return insights[:num_insights]
|
339 |
+
|
340 |
+
except Exception as e:
|
341 |
+
logger.error(f"Error extracting insights: {str(e)}")
|
342 |
+
return []
|
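A minimal usage sketch for the generation helpers above. The class name GenerativeTool, the llm_service constructor argument, and the LLMService class are assumptions inferred from the method bodies (each calls self.llm_service.generate_text) and the file list; adjust to the actual definitions earlier in this file and in services/llm_service.py.

import asyncio

from mcp_tools.generative_tool import GenerativeTool  # assumed class name
from services.llm_service import LLMService           # assumed constructor

async def demo() -> None:
    tool = GenerativeTool(llm_service=LLMService())

    # explain_concept builds the audience/length prompt shown above.
    explanation = await tool.explain_concept("vector embeddings", audience="beginner", length="brief")
    print(explanation)

    # generate_questions parses numbered/bulleted lines containing '?'.
    for q in await tool.generate_questions(explanation, question_type="comprehension", num_questions=3):
        print("-", q)

asyncio.run(demo())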
mcp_tools/ingestion_tool.py
ADDED
@@ -0,0 +1,330 @@
import logging
import asyncio
from typing import Dict, Any, Optional
import tempfile
import os
from pathlib import Path
import uuid

from core.document_parser import DocumentParser
from core.chunker import TextChunker
from core.text_preprocessor import TextPreprocessor
from services.vector_store_service import VectorStoreService
from services.document_store_service import DocumentStoreService
from services.embedding_service import EmbeddingService
from services.ocr_service import OCRService

logger = logging.getLogger(__name__)

class IngestionTool:
    def __init__(self, vector_store: VectorStoreService, document_store: DocumentStoreService,
                 embedding_service: EmbeddingService, ocr_service: OCRService):
        self.vector_store = vector_store
        self.document_store = document_store
        self.embedding_service = embedding_service
        self.ocr_service = ocr_service

        self.document_parser = DocumentParser()
        # Pass OCR service to document parser
        self.document_parser.ocr_service = ocr_service

        self.text_chunker = TextChunker()
        self.text_preprocessor = TextPreprocessor()

    async def process_document(self, file_path: str, file_type: str, task_id: Optional[str] = None) -> Dict[str, Any]:
        """Process a document through the full ingestion pipeline"""
        if task_id is None:
            task_id = str(uuid.uuid4())

        try:
            logger.info(f"Starting document processing for {file_path}")

            # Step 1: Parse the document
            filename = Path(file_path).name
            document = await self.document_parser.parse_document(file_path, filename)

            if not document.content:
                logger.warning(f"No content extracted from document {filename}")
                return {
                    "success": False,
                    "error": "No content could be extracted from the document",
                    "task_id": task_id
                }

            # Step 2: Store the document
            await self.document_store.store_document(document)

            # Step 3: Process content for embeddings
            chunks = await self._create_and_embed_chunks(document)

            if not chunks:
                logger.warning(f"No chunks created for document {document.id}")
                return {
                    "success": False,
                    "error": "Failed to create text chunks",
                    "task_id": task_id,
                    "document_id": document.id
                }

            # Step 4: Store embeddings
            success = await self.vector_store.add_chunks(chunks)

            if not success:
                logger.error(f"Failed to store embeddings for document {document.id}")
                return {
                    "success": False,
                    "error": "Failed to store embeddings",
                    "task_id": task_id,
                    "document_id": document.id
                }

            logger.info(f"Successfully processed document {document.id} with {len(chunks)} chunks")

            return {
                "success": True,
                "task_id": task_id,
                "document_id": document.id,
                "filename": document.filename,
                "chunks_created": len(chunks),
                "content_length": len(document.content),
                "doc_type": document.doc_type.value,
                "message": f"Successfully processed {filename}"
            }

        except Exception as e:
            logger.error(f"Error processing document {file_path}: {str(e)}")
            return {
                "success": False,
                "error": str(e),
                "task_id": task_id,
                "message": f"Failed to process document: {str(e)}"
            }

    async def _create_and_embed_chunks(self, document) -> list:
        """Create chunks and generate embeddings"""
        try:
            # Step 1: Create chunks
            chunks = self.text_chunker.chunk_document(
                document.id,
                document.content,
                method="recursive"
            )

            if not chunks:
                return []

            # Step 2: Optimize chunks for embedding
            optimized_chunks = self.text_chunker.optimize_chunks_for_embedding(chunks)

            # Step 3: Generate embeddings
            texts = [chunk.content for chunk in optimized_chunks]
            embeddings = await self.embedding_service.generate_embeddings(texts)

            # Step 4: Add embeddings to chunks
            embedded_chunks = []
            for i, chunk in enumerate(optimized_chunks):
                if i < len(embeddings):
                    chunk.embedding = embeddings[i]
                    embedded_chunks.append(chunk)

            return embedded_chunks

        except Exception as e:
            logger.error(f"Error creating and embedding chunks: {str(e)}")
            return []

    async def process_url(self, url: str, task_id: Optional[str] = None) -> Dict[str, Any]:
        """Process a document from a URL"""
        try:
            import requests
            from urllib.parse import urlparse

            # Download the file
            response = requests.get(url, timeout=30)
            response.raise_for_status()

            # Determine file type from URL or content-type
            parsed_url = urlparse(url)
            filename = Path(parsed_url.path).name or "downloaded_file"

            # Create temporary file
            with tempfile.NamedTemporaryFile(delete=False, suffix=f"_{filename}") as tmp_file:
                tmp_file.write(response.content)
                tmp_file_path = tmp_file.name

            try:
                # Process the downloaded file
                result = await self.process_document(tmp_file_path, "", task_id)
                result["source_url"] = url
                return result
            finally:
                # Clean up temporary file
                if os.path.exists(tmp_file_path):
                    os.unlink(tmp_file_path)

        except Exception as e:
            logger.error(f"Error processing URL {url}: {str(e)}")
            return {
                "success": False,
                "error": str(e),
                "task_id": task_id or str(uuid.uuid4()),
                "source_url": url
            }

    async def process_text_content(self, content: str, filename: str = "text_content.txt",
                                   task_id: Optional[str] = None) -> Dict[str, Any]:
        """Process raw text content directly"""
        try:
            from core.models import Document, DocumentType
            from datetime import datetime

            # Create document object
            document = Document(
                id=str(uuid.uuid4()),
                filename=filename,
                content=content,
                doc_type=DocumentType.TEXT,
                file_size=len(content.encode('utf-8')),
                created_at=datetime.utcnow(),
                metadata={
                    "source": "direct_text_input",
                    "content_length": len(content),
                    "word_count": len(content.split())
                }
            )

            # Store the document
            await self.document_store.store_document(document)

            # Process content for embeddings
            chunks = await self._create_and_embed_chunks(document)

            if chunks:
                await self.vector_store.add_chunks(chunks)

            return {
                "success": True,
                "task_id": task_id or str(uuid.uuid4()),
                "document_id": document.id,
                "filename": filename,
                "chunks_created": len(chunks),
                "content_length": len(content),
                "message": "Successfully processed text content"
            }

        except Exception as e:
            logger.error(f"Error processing text content: {str(e)}")
            return {
                "success": False,
                "error": str(e),
                "task_id": task_id or str(uuid.uuid4())
            }

    async def reprocess_document(self, document_id: str, task_id: Optional[str] = None) -> Dict[str, Any]:
        """Reprocess an existing document (useful for updating embeddings)"""
        try:
            # Get the document
            document = await self.document_store.get_document(document_id)

            if not document:
                return {
                    "success": False,
                    "error": f"Document {document_id} not found",
                    "task_id": task_id or str(uuid.uuid4())
                }

            # Remove existing chunks from vector store
            await self.vector_store.delete_document(document_id)

            # Recreate and embed chunks
            chunks = await self._create_and_embed_chunks(document)

            if chunks:
                await self.vector_store.add_chunks(chunks)

            return {
                "success": True,
                "task_id": task_id or str(uuid.uuid4()),
                "document_id": document_id,
                "filename": document.filename,
                "chunks_created": len(chunks),
                "message": f"Successfully reprocessed {document.filename}"
            }

        except Exception as e:
            logger.error(f"Error reprocessing document {document_id}: {str(e)}")
            return {
                "success": False,
                "error": str(e),
                "task_id": task_id or str(uuid.uuid4()),
                "document_id": document_id
            }

    async def batch_process_directory(self, directory_path: str, task_id: Optional[str] = None) -> Dict[str, Any]:
        """Process multiple documents from a directory"""
        try:
            directory = Path(directory_path)
            if not directory.exists() or not directory.is_dir():
                return {
                    "success": False,
                    "error": f"Directory {directory_path} does not exist",
                    "task_id": task_id or str(uuid.uuid4())
                }

            # Supported file extensions
            supported_extensions = {'.txt', '.pdf', '.docx', '.png', '.jpg', '.jpeg', '.bmp', '.tiff'}

            # Find all supported files
            files_to_process = []
            for ext in supported_extensions:
                files_to_process.extend(directory.glob(f"*{ext}"))
                files_to_process.extend(directory.glob(f"*{ext.upper()}"))

            if not files_to_process:
                return {
                    "success": False,
                    "error": "No supported files found in directory",
                    "task_id": task_id or str(uuid.uuid4())
                }

            # Process files
            results = []
            successful = 0
            failed = 0

            for file_path in files_to_process:
                try:
                    result = await self.process_document(str(file_path), file_path.suffix)
                    results.append(result)

                    if result.get("success"):
                        successful += 1
                    else:
                        failed += 1

                except Exception as e:
                    failed += 1
                    results.append({
                        "success": False,
                        "error": str(e),
                        "filename": file_path.name
                    })

            return {
                "success": True,
                "task_id": task_id or str(uuid.uuid4()),
                "directory": str(directory),
                "total_files": len(files_to_process),
                "successful": successful,
                "failed": failed,
                "results": results,
                "message": f"Processed {successful}/{len(files_to_process)} files successfully"
            }

        except Exception as e:
            logger.error(f"Error batch processing directory {directory_path}: {str(e)}")
            return {
                "success": False,
                "error": str(e),
                "task_id": task_id or str(uuid.uuid4())
            }
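A sketch of driving the ingestion pipeline end to end. The no-argument constructors for VectorStoreService, EmbeddingService, and OCRService are assumptions (their actual signatures live in the services/ modules added below); DocumentStoreService's no-argument __init__ does appear later in this commit.

import asyncio

from mcp_tools.ingestion_tool import IngestionTool
from services.vector_store_service import VectorStoreService
from services.document_store_service import DocumentStoreService
from services.embedding_service import EmbeddingService
from services.ocr_service import OCRService

async def ingest(path: str) -> None:
    tool = IngestionTool(
        vector_store=VectorStoreService(),      # constructor args assumed
        document_store=DocumentStoreService(),
        embedding_service=EmbeddingService(),   # constructor args assumed
        ocr_service=OCRService(),               # constructor args assumed
    )
    # file_type is accepted but unused above; the parser infers the type
    # from the filename.
    result = await tool.process_document(path, file_type="")
    print(result["message"] if result.get("success") else result["error"])

asyncio.run(ingest("docs/example.pdf"))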
mcp_tools/search_tool.py
ADDED
@@ -0,0 +1,423 @@
import logging
from typing import List, Dict, Any, Optional
import asyncio

from core.models import SearchResult
from services.vector_store_service import VectorStoreService
from services.embedding_service import EmbeddingService
from services.document_store_service import DocumentStoreService
import config

logger = logging.getLogger(__name__)

class SearchTool:
    def __init__(self, vector_store: VectorStoreService, embedding_service: EmbeddingService,
                 document_store: Optional[DocumentStoreService] = None):
        self.vector_store = vector_store
        self.embedding_service = embedding_service
        self.document_store = document_store
        self.config = config.config

    async def search(self, query: str, top_k: int = 5, filters: Optional[Dict[str, Any]] = None,
                     similarity_threshold: Optional[float] = None) -> List[SearchResult]:
        """Perform semantic search"""
        try:
            if not query.strip():
                logger.warning("Empty search query provided")
                return []

            # Use default threshold if not provided
            if similarity_threshold is None:
                similarity_threshold = self.config.SIMILARITY_THRESHOLD

            logger.info(f"Performing semantic search for: '{query}' (top_k={top_k})")

            # Generate query embedding
            query_embedding = await self.embedding_service.generate_single_embedding(query)

            if not query_embedding:
                logger.error("Failed to generate query embedding")
                return []

            # Perform vector search
            results = await self.vector_store.search(
                query_embedding=query_embedding,
                top_k=top_k,
                filters=filters
            )

            # Filter by similarity threshold
            filtered_results = [
                result for result in results
                if result.score >= similarity_threshold
            ]

            logger.info(f"Found {len(filtered_results)} results above threshold {similarity_threshold}")

            # Enhance results with additional metadata if document store is available
            if self.document_store:
                enhanced_results = await self._enhance_results_with_metadata(filtered_results)
                return enhanced_results

            return filtered_results

        except Exception as e:
            logger.error(f"Error performing semantic search: {str(e)}")
            return []

    async def _enhance_results_with_metadata(self, results: List[SearchResult]) -> List[SearchResult]:
        """Enhance search results with document metadata"""
        try:
            enhanced_results = []

            for result in results:
                try:
                    # Get document metadata
                    document = await self.document_store.get_document(result.document_id)

                    if document:
                        # Add document metadata to result
                        enhanced_metadata = {
                            **result.metadata,
                            "document_filename": document.filename,
                            "document_type": document.doc_type.value,
                            "document_tags": document.tags,
                            "document_category": document.category,
                            "document_created_at": document.created_at.isoformat(),
                            "document_summary": document.summary
                        }

                        enhanced_result = SearchResult(
                            chunk_id=result.chunk_id,
                            document_id=result.document_id,
                            content=result.content,
                            score=result.score,
                            metadata=enhanced_metadata
                        )

                        enhanced_results.append(enhanced_result)
                    else:
                        # Document not found, use original result
                        enhanced_results.append(result)

                except Exception as e:
                    logger.warning(f"Error enhancing result {result.chunk_id}: {str(e)}")
                    enhanced_results.append(result)

            return enhanced_results

        except Exception as e:
            logger.error(f"Error enhancing results: {str(e)}")
            return results

    async def multi_query_search(self, queries: List[str], top_k: int = 5,
                                 aggregate_method: str = "merge") -> List[SearchResult]:
        """Perform search with multiple queries and aggregate results"""
        try:
            all_results = []

            # Perform search for each query
            for query in queries:
                if query.strip():
                    query_results = await self.search(query, top_k)
                    all_results.extend(query_results)

            if not all_results:
                return []

            # Aggregate results
            if aggregate_method == "merge":
                return await self._merge_results(all_results, top_k)
            elif aggregate_method == "intersect":
                return await self._intersect_results(all_results, top_k)
            elif aggregate_method == "average":
                return await self._average_results(all_results, top_k)
            else:
                # Default to merge
                return await self._merge_results(all_results, top_k)

        except Exception as e:
            logger.error(f"Error in multi-query search: {str(e)}")
            return []

    async def _merge_results(self, results: List[SearchResult], top_k: int) -> List[SearchResult]:
        """Merge results and remove duplicates, keeping highest scores"""
        try:
            # Group by chunk_id and keep highest score
            chunk_scores = {}
            chunk_results = {}

            for result in results:
                chunk_id = result.chunk_id
                if chunk_id not in chunk_scores or result.score > chunk_scores[chunk_id]:
                    chunk_scores[chunk_id] = result.score
                    chunk_results[chunk_id] = result

            # Sort by score and return top_k
            merged_results = list(chunk_results.values())
            merged_results.sort(key=lambda x: x.score, reverse=True)

            return merged_results[:top_k]

        except Exception as e:
            logger.error(f"Error merging results: {str(e)}")
            return results[:top_k]

    async def _intersect_results(self, results: List[SearchResult], top_k: int) -> List[SearchResult]:
        """Find chunks that appear in multiple queries"""
        try:
            # Count occurrences of each chunk
            chunk_counts = {}
            chunk_results = {}

            for result in results:
                chunk_id = result.chunk_id
                chunk_counts[chunk_id] = chunk_counts.get(chunk_id, 0) + 1

                if chunk_id not in chunk_results or result.score > chunk_results[chunk_id].score:
                    chunk_results[chunk_id] = result

            # Filter chunks that appear more than once
            intersect_results = [
                result for chunk_id, result in chunk_results.items()
                if chunk_counts[chunk_id] > 1
            ]

            # Sort by score
            intersect_results.sort(key=lambda x: x.score, reverse=True)

            return intersect_results[:top_k]

        except Exception as e:
            logger.error(f"Error intersecting results: {str(e)}")
            return []

    async def _average_results(self, results: List[SearchResult], top_k: int) -> List[SearchResult]:
        """Average scores for chunks that appear multiple times"""
        try:
            # Group by chunk_id and calculate average scores
            chunk_groups = {}

            for result in results:
                chunk_id = result.chunk_id
                if chunk_id not in chunk_groups:
                    chunk_groups[chunk_id] = []
                chunk_groups[chunk_id].append(result)

            # Calculate average scores
            averaged_results = []
            for chunk_id, group in chunk_groups.items():
                avg_score = sum(r.score for r in group) / len(group)

                # Use the result with the highest individual score but update the score to average
                best_result = max(group, key=lambda x: x.score)
                averaged_result = SearchResult(
                    chunk_id=best_result.chunk_id,
                    document_id=best_result.document_id,
                    content=best_result.content,
                    score=avg_score,
                    metadata={
                        **best_result.metadata,
                        "query_count": len(group),
                        "score_range": f"{min(r.score for r in group):.3f}-{max(r.score for r in group):.3f}"
                    }
                )
                averaged_results.append(averaged_result)

            # Sort by average score
            averaged_results.sort(key=lambda x: x.score, reverse=True)

            return averaged_results[:top_k]

        except Exception as e:
            logger.error(f"Error averaging results: {str(e)}")
            return results[:top_k]

    async def search_by_document(self, document_id: str, query: str, top_k: int = 5) -> List[SearchResult]:
        """Search within a specific document"""
        try:
            filters = {"document_id": document_id}
            return await self.search(query, top_k, filters)

        except Exception as e:
            logger.error(f"Error searching within document {document_id}: {str(e)}")
            return []

    async def search_by_category(self, category: str, query: str, top_k: int = 5) -> List[SearchResult]:
        """Search within documents of a specific category"""
        try:
            if not self.document_store:
                logger.warning("Document store not available for category search")
                return await self.search(query, top_k)

            # Get documents in the category
            documents = await self.document_store.list_documents(
                limit=1000,  # Adjust as needed
                filters={"category": category}
            )

            if not documents:
                logger.info(f"No documents found in category '{category}'")
                return []

            # Extract document IDs
            document_ids = [doc.id for doc in documents]

            # Search with document ID filter
            filters = {"document_ids": document_ids}
            return await self.search(query, top_k, filters)

        except Exception as e:
            logger.error(f"Error searching by category {category}: {str(e)}")
            return []

    async def search_with_date_range(self, query: str, start_date, end_date, top_k: int = 5) -> List[SearchResult]:
        """Search documents within a date range"""
        try:
            if not self.document_store:
                logger.warning("Document store not available for date range search")
                return await self.search(query, top_k)

            # Get documents in the date range
            documents = await self.document_store.list_documents(
                limit=1000,  # Adjust as needed
                filters={
                    "created_after": start_date,
                    "created_before": end_date
                }
            )

            if not documents:
                logger.info("No documents found in date range")
                return []

            # Extract document IDs
            document_ids = [doc.id for doc in documents]

            # Search with document ID filter
            filters = {"document_ids": document_ids}
            return await self.search(query, top_k, filters)

        except Exception as e:
            logger.error(f"Error searching with date range: {str(e)}")
            return []

    async def get_search_suggestions(self, partial_query: str, limit: int = 5) -> List[str]:
        """Get search suggestions based on partial query"""
        try:
            # This is a simple implementation
            # In a production system, you might want to use a more sophisticated approach

            if len(partial_query) < 2:
                return []

            # Search for the partial query
            results = await self.search(partial_query, top_k=20)

            # Extract potential query expansions from content
            suggestions = set()

            for result in results:
                content_words = result.content.lower().split()
                for i, word in enumerate(content_words):
                    if partial_query.lower() in word:
                        # Add the word itself
                        suggestions.add(word.strip('.,!?;:'))

                        # Add phrases that include this word
                        if i > 0:
                            phrase = f"{content_words[i-1]} {word}".strip('.,!?;:')
                            suggestions.add(phrase)
                        if i < len(content_words) - 1:
                            phrase = f"{word} {content_words[i+1]}".strip('.,!?;:')
                            suggestions.add(phrase)

            # Filter and sort suggestions
            filtered_suggestions = [
                s for s in suggestions
                if len(s) > len(partial_query) and s.startswith(partial_query.lower())
            ]

            return sorted(filtered_suggestions)[:limit]

        except Exception as e:
            logger.error(f"Error getting search suggestions: {str(e)}")
            return []

    async def explain_search(self, query: str, top_k: int = 3) -> Dict[str, Any]:
        """Provide detailed explanation of search process and results"""
        try:
            explanation = {
                "query": query,
                "steps": [],
                "results_analysis": {},
                "performance_metrics": {}
            }

            # Step 1: Query processing
            explanation["steps"].append({
                "step": "query_processing",
                "description": "Processing and normalizing the search query",
                "details": {
                    "original_query": query,
                    "cleaned_query": query.strip(),
                    "query_length": len(query)
                }
            })

            # Step 2: Embedding generation
            import time
            start_time = time.time()

            query_embedding = await self.embedding_service.generate_single_embedding(query)

            embedding_time = time.time() - start_time

            explanation["steps"].append({
                "step": "embedding_generation",
                "description": "Converting query to vector embedding",
                "details": {
                    "embedding_dimension": len(query_embedding) if query_embedding else 0,
                    "generation_time_ms": round(embedding_time * 1000, 2)
                }
            })

            # Step 3: Vector search
            start_time = time.time()

            results = await self.vector_store.search(query_embedding, top_k)

            search_time = time.time() - start_time

            explanation["steps"].append({
                "step": "vector_search",
                "description": "Searching vector database for similar content",
                "details": {
                    "search_time_ms": round(search_time * 1000, 2),
                    "results_found": len(results),
                    "top_score": results[0].score if results else 0,
                    "score_range": f"{min(r.score for r in results):.3f}-{max(r.score for r in results):.3f}" if results else "N/A"
                }
            })

            # Results analysis
            if results:
                explanation["results_analysis"] = {
                    "total_results": len(results),
                    "average_score": sum(r.score for r in results) / len(results),
                    "unique_documents": len(set(r.document_id for r in results)),
                    "content_lengths": [len(r.content) for r in results]
                }

            # Performance metrics
            explanation["performance_metrics"] = {
                "total_time_ms": round((embedding_time + search_time) * 1000, 2),
                "embedding_time_ms": round(embedding_time * 1000, 2),
                "search_time_ms": round(search_time * 1000, 2)
            }

            return explanation

        except Exception as e:
            logger.error(f"Error explaining search: {str(e)}")
            return {"error": str(e)}
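The aggregation modes in multi_query_search reward different things: "merge" keeps each chunk's single best score, "intersect" keeps only chunks surfaced by more than one query, and "average" smooths scores across queries. A minimal sketch of exercising two of the methods above, assuming search_tool is an already-constructed SearchTool:

import json

async def demo_search(search_tool) -> None:
    # With "intersect", a chunk must rank for both phrasings to survive.
    hits = await search_tool.multi_query_search(
        ["transformer attention", "self-attention mechanism"],
        top_k=5,
        aggregate_method="intersect",
    )
    for hit in hits:
        print(f"{hit.score:.3f}  {hit.document_id}  {hit.content[:80]}")

    # explain_search reports per-stage latency and the observed score range,
    # which is handy when tuning SIMILARITY_THRESHOLD.
    report = await search_tool.explain_search("transformer attention", top_k=3)
    print(json.dumps(report.get("performance_metrics", {}), indent=2))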
mcp_tools/utils.py
ADDED
@@ -0,0 +1,373 @@
import logging
import asyncio
import functools
from typing import Any, Callable, Dict, List, Optional
import time
import json
from pathlib import Path

logger = logging.getLogger(__name__)

def async_timer(func: Callable) -> Callable:
    """Decorator to time async function execution"""
    @functools.wraps(func)
    async def wrapper(*args, **kwargs):
        start_time = time.time()
        try:
            result = await func(*args, **kwargs)
            end_time = time.time()
            logger.debug(f"{func.__name__} completed in {end_time - start_time:.3f}s")
            return result
        except Exception as e:
            end_time = time.time()
            logger.error(f"{func.__name__} failed after {end_time - start_time:.3f}s: {str(e)}")
            raise
    return wrapper

def retry_async(max_attempts: int = 3, delay: float = 1.0, backoff: float = 2.0):
    """Decorator to retry async functions with exponential backoff"""
    def decorator(func: Callable) -> Callable:
        @functools.wraps(func)
        async def wrapper(*args, **kwargs):
            attempt = 1
            current_delay = delay

            while attempt <= max_attempts:
                try:
                    return await func(*args, **kwargs)
                except Exception as e:
                    if attempt == max_attempts:
                        logger.error(f"{func.__name__} failed after {max_attempts} attempts: {str(e)}")
                        raise

                    logger.warning(f"{func.__name__} attempt {attempt} failed: {str(e)}")
                    logger.info(f"Retrying in {current_delay}s...")

                    await asyncio.sleep(current_delay)
                    attempt += 1
                    current_delay *= backoff

        return wrapper
    return decorator

class MCPToolResponse:
    """Standardized response format for MCP tools"""

    def __init__(self, success: bool, data: Any = None, error: str = None,
                 metadata: Dict[str, Any] = None):
        self.success = success
        self.data = data
        self.error = error
        self.metadata = metadata or {}
        self.timestamp = time.time()

    def to_dict(self) -> Dict[str, Any]:
        """Convert response to dictionary"""
        result = {
            "success": self.success,
            "timestamp": self.timestamp
        }

        if self.success:
            result["data"] = self.data
        else:
            result["error"] = self.error

        if self.metadata:
            result["metadata"] = self.metadata

        return result

    @classmethod
    def success_response(cls, data: Any, metadata: Dict[str, Any] = None):
        """Create a success response"""
        return cls(success=True, data=data, metadata=metadata)

    @classmethod
    def error_response(cls, error: str, metadata: Dict[str, Any] = None):
        """Create an error response"""
        return cls(success=False, error=error, metadata=metadata)

def validate_required_params(params: Dict[str, Any], required: List[str]) -> Optional[str]:
    """Validate that required parameters are present"""
    missing = []
    for param in required:
        if param not in params or params[param] is None:
            missing.append(param)

    if missing:
        return f"Missing required parameters: {', '.join(missing)}"

    return None

def sanitize_filename(filename: str) -> str:
    """Sanitize filename for safe storage"""
    import re

    # Remove or replace invalid characters
    filename = re.sub(r'[<>:"/\\|?*]', '_', filename)

    # Remove leading/trailing dots and spaces
    filename = filename.strip('. ')

    # Limit length
    if len(filename) > 255:
        name, ext = Path(filename).stem, Path(filename).suffix
        max_name_len = 255 - len(ext)
        filename = name[:max_name_len] + ext

    # Ensure not empty
    if not filename:
        filename = "unnamed_file"

    return filename

def truncate_text(text: str, max_length: int, add_ellipsis: bool = True) -> str:
    """Truncate text to specified length"""
    if len(text) <= max_length:
        return text

    if add_ellipsis and max_length > 3:
        return text[:max_length - 3] + "..."
    else:
        return text[:max_length]

def extract_file_info(file_path: str) -> Dict[str, Any]:
    """Extract information about a file"""
    try:
        path = Path(file_path)
        stat = path.stat()

        return {
            "filename": path.name,
            "extension": path.suffix.lower(),
            "size_bytes": stat.st_size,
            "size_mb": round(stat.st_size / (1024 * 1024), 2),
            "created_time": stat.st_ctime,
            "modified_time": stat.st_mtime,
            "exists": path.exists(),
            "is_file": path.is_file(),
            "is_dir": path.is_dir()
        }
    except Exception as e:
        return {"error": str(e)}

async def batch_process(items: List[Any], processor: Callable, batch_size: int = 10,
                        max_concurrent: int = 5) -> List[Any]:
    """Process items in batches with concurrency control"""
    results = []
    semaphore = asyncio.Semaphore(max_concurrent)

    async def process_item(item):
        async with semaphore:
            return await processor(item)

    # Process in batches
    for i in range(0, len(items), batch_size):
        batch = items[i:i + batch_size]
        batch_tasks = [process_item(item) for item in batch]
        batch_results = await asyncio.gather(*batch_tasks, return_exceptions=True)
        results.extend(batch_results)

    return results

def format_file_size(size_bytes: int) -> str:
    """Format file size in human-readable format"""
    for unit in ['B', 'KB', 'MB', 'GB', 'TB']:
        if size_bytes < 1024.0:
            return f"{size_bytes:.1f} {unit}"
        size_bytes /= 1024.0
    return f"{size_bytes:.1f} PB"

def calculate_reading_time(text: str, words_per_minute: int = 200) -> int:
    """Calculate estimated reading time in minutes"""
    word_count = len(text.split())
    return max(1, round(word_count / words_per_minute))

class ProgressTracker:
    """Track progress of long-running operations"""

    def __init__(self, total_items: int, description: str = "Processing"):
        self.total_items = total_items
        self.completed_items = 0
        self.description = description
        self.start_time = time.time()
        self.errors = []

    def update(self, completed: int = 1, error: str = None):
        """Update progress"""
        self.completed_items += completed
        if error:
            self.errors.append(error)

    def get_progress(self) -> Dict[str, Any]:
        """Get current progress information"""
        elapsed_time = time.time() - self.start_time
        progress_percent = (self.completed_items / self.total_items) * 100 if self.total_items > 0 else 0

        # Estimate remaining time
        if self.completed_items > 0:
            avg_time_per_item = elapsed_time / self.completed_items
            remaining_items = self.total_items - self.completed_items
            estimated_remaining_time = avg_time_per_item * remaining_items
        else:
            estimated_remaining_time = 0

        return {
            "description": self.description,
            "total_items": self.total_items,
            "completed_items": self.completed_items,
            "progress_percent": round(progress_percent, 1),
            "elapsed_time_seconds": round(elapsed_time, 1),
            "estimated_remaining_seconds": round(estimated_remaining_time, 1),
            "errors_count": len(self.errors),
            "errors": self.errors[-5:] if self.errors else []  # Last 5 errors
        }

    def is_complete(self) -> bool:
        """Check if processing is complete"""
        return self.completed_items >= self.total_items

def load_json_config(config_path: str, default_config: Dict[str, Any] = None) -> Dict[str, Any]:
    """Load configuration from JSON file with fallback to defaults"""
    try:
        with open(config_path, 'r') as f:
            config = json.load(f)
        logger.info(f"Loaded configuration from {config_path}")
        return config
    except FileNotFoundError:
        logger.warning(f"Configuration file {config_path} not found, using defaults")
        return default_config or {}
    except json.JSONDecodeError as e:
        logger.error(f"Invalid JSON in configuration file {config_path}: {str(e)}")
        return default_config or {}

def save_json_config(config: Dict[str, Any], config_path: str) -> bool:
    """Save configuration to JSON file"""
    try:
        # Create directory if it doesn't exist
        Path(config_path).parent.mkdir(parents=True, exist_ok=True)

        with open(config_path, 'w') as f:
            json.dump(config, f, indent=2)

        logger.info(f"Saved configuration to {config_path}")
        return True
    except Exception as e:
        logger.error(f"Failed to save configuration to {config_path}: {str(e)}")
        return False

class RateLimiter:
    """Simple rate limiter for API calls"""

    def __init__(self, max_calls: int, time_window: float):
        self.max_calls = max_calls
        self.time_window = time_window
        self.calls = []

    async def acquire(self):
        """Acquire permission to make a call"""
        now = time.time()

        # Remove old calls outside the time window
        self.calls = [call_time for call_time in self.calls if now - call_time < self.time_window]

        # Check if we can make a new call
        if len(self.calls) >= self.max_calls:
            # Wait until we can make a call
            oldest_call = min(self.calls)
            wait_time = self.time_window - (now - oldest_call)
            if wait_time > 0:
                await asyncio.sleep(wait_time)
            return await self.acquire()  # Recursive call after waiting

        # Record this call
        self.calls.append(now)

def escape_markdown(text: str) -> str:
    """Escape markdown special characters"""
    import re

    # Characters that need escaping in markdown
    markdown_chars = r'([*_`\[\]()#+\-!\\])'
    return re.sub(markdown_chars, r'\\\1', text)

def create_error_summary(errors: List[Exception]) -> str:
    """Create a summary of multiple errors"""
    if not errors:
        return "No errors"

    error_counts = {}
    for error in errors:
        error_type = type(error).__name__
        error_counts[error_type] = error_counts.get(error_type, 0) + 1

    summary_parts = []
    for error_type, count in error_counts.items():
        if count == 1:
            summary_parts.append(f"1 {error_type}")
        else:
            summary_parts.append(f"{count} {error_type}s")

    return f"Encountered {len(errors)} total errors: " + ", ".join(summary_parts)

async def safe_execute(func: Callable, *args, default_return=None, **kwargs):
    """Safely execute a function and return default on error"""
    try:
        if asyncio.iscoroutinefunction(func):
            return await func(*args, **kwargs)
        else:
            return func(*args, **kwargs)
    except Exception as e:
        logger.error(f"Error executing {func.__name__}: {str(e)}")
        return default_return

def get_content_preview(content: str, max_length: int = 200) -> str:
    """Get a preview of content for display"""
    if not content:
        return "No content"

    # Clean up whitespace
    content = ' '.join(content.split())

    if len(content) <= max_length:
        return content

    # Try to break at sentence boundary
    preview = content[:max_length]
    last_sentence_end = max(preview.rfind('.'), preview.rfind('!'), preview.rfind('?'))

    if last_sentence_end > max_length * 0.7:  # If we found a good breaking point
        return preview[:last_sentence_end + 1]
    else:
        # Break at word boundary
        last_space = preview.rfind(' ')
        if last_space > max_length * 0.7:
            return preview[:last_space] + "..."
        else:
            return preview + "..."

class MemoryUsageTracker:
    """Track memory usage of operations"""

    def __init__(self):
        self.start_memory = self._get_memory_usage()

    def _get_memory_usage(self) -> float:
        """Get current memory usage in MB"""
        try:
            import psutil
            process = psutil.Process()
            return process.memory_info().rss / 1024 / 1024  # Convert to MB
        except ImportError:
            return 0.0

    def get_usage_delta(self) -> float:
        """Get memory usage change since initialization"""
        current_memory = self._get_memory_usage()
        return current_memory - self.start_memory

    def log_usage(self, operation_name: str):
        """Log current memory usage for an operation"""
        delta = self.get_usage_delta()
        logger.info(f"{operation_name} memory delta: {delta:.1f} MB")
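A short sketch of composing the utilities above: retry_async re-invokes the wrapped coroutine with exponentially growing delays, async_timer logs wall time per call, and RateLimiter caps call frequency with a sliding window. The flaky_fetch function is a stand-in; everything else uses only the helpers defined in this file.

import asyncio
import random

from mcp_tools.utils import RateLimiter, async_timer, retry_async

limiter = RateLimiter(max_calls=5, time_window=1.0)  # at most 5 calls per second

@async_timer
@retry_async(max_attempts=3, delay=0.5, backoff=2.0)
async def flaky_fetch(i: int) -> str:
    await limiter.acquire()  # cooperative rate limiting before each attempt
    if random.random() < 0.3:  # stand-in for a transient network failure
        raise ConnectionError(f"transient failure on call {i}")
    return f"payload {i}"

async def main() -> None:
    results = await asyncio.gather(*(flaky_fetch(i) for i in range(8)))
    print(results)

asyncio.run(main())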
services/__init__.py
ADDED
@@ -0,0 +1 @@
# Services module initialization
services/document_store_service.py
ADDED
@@ -0,0 +1,349 @@
import logging
import json
import os
from typing import List, Dict, Any, Optional
from pathlib import Path
import pickle
from datetime import datetime
import asyncio

from core.models import Document, DocumentType
import config

logger = logging.getLogger(__name__)

class DocumentStoreService:
    def __init__(self):
        self.config = config.config
        self.store_path = Path(self.config.DOCUMENT_STORE_PATH)
        self.store_path.mkdir(parents=True, exist_ok=True)

        # Separate paths for metadata and content
        self.metadata_path = self.store_path / "metadata"
        self.content_path = self.store_path / "content"

        self.metadata_path.mkdir(exist_ok=True)
        self.content_path.mkdir(exist_ok=True)

        # In-memory cache for frequently accessed documents
        self._cache = {}
        self._cache_size_limit = 100

    async def store_document(self, document: Document) -> bool:
        """Store a document and its metadata"""
        try:
            # Store metadata
            metadata_file = self.metadata_path / f"{document.id}.json"
            metadata = {
                "id": document.id,
                "filename": document.filename,
                "doc_type": document.doc_type.value,
                "file_size": document.file_size,
                "created_at": document.created_at.isoformat(),
                "metadata": document.metadata,
                "tags": document.tags,
                "summary": document.summary,
                "category": document.category,
                "language": document.language,
                "content_length": len(document.content)
            }

            with open(metadata_file, 'w', encoding='utf-8') as f:
                json.dump(metadata, f, indent=2, ensure_ascii=False)

            # Store content separately (can be large)
            content_file = self.content_path / f"{document.id}.txt"
            with open(content_file, 'w', encoding='utf-8') as f:
                f.write(document.content)

            # Cache the document
            self._add_to_cache(document.id, document)

            logger.info(f"Stored document {document.id} ({document.filename})")
            return True

        except Exception as e:
            logger.error(f"Error storing document {document.id}: {str(e)}")
            return False

    async def get_document(self, document_id: str) -> Optional[Document]:
        """Retrieve a document by ID"""
        try:
            # Check cache first
            if document_id in self._cache:
                return self._cache[document_id]

            # Load from disk
            metadata_file = self.metadata_path / f"{document_id}.json"
            content_file = self.content_path / f"{document_id}.txt"

            if not metadata_file.exists() or not content_file.exists():
                return None

            # Load metadata
            with open(metadata_file, 'r', encoding='utf-8') as f:
                metadata = json.load(f)

            # Load content
            with open(content_file, 'r', encoding='utf-8') as f:
                content = f.read()

            # Create document object
            document = Document(
                id=metadata["id"],
                filename=metadata["filename"],
                content=content,
                doc_type=DocumentType(metadata["doc_type"]),
                file_size=metadata["file_size"],
                created_at=datetime.fromisoformat(metadata["created_at"]),
                metadata=metadata.get("metadata", {}),
                tags=metadata.get("tags", []),
                summary=metadata.get("summary"),
                category=metadata.get("category"),
                language=metadata.get("language")
            )

            # Add to cache
            self._add_to_cache(document_id, document)

            return document

        except Exception as e:
            logger.error(f"Error retrieving document {document_id}: {str(e)}")
            return None

    async def list_documents(self, limit: int = 50, offset: int = 0,
                             filters: Optional[Dict[str, Any]] = None) -> List[Document]:
        """List documents with pagination and filtering"""
        try:
            documents = []
            metadata_files = list(self.metadata_path.glob("*.json"))

            # Sort by creation time (newest first)
            metadata_files.sort(key=lambda x: x.stat().st_mtime, reverse=True)

            # Apply pagination
            start_idx = offset
            end_idx = offset + limit

            for metadata_file in metadata_files[start_idx:end_idx]:
                try:
                    with open(metadata_file, 'r', encoding='utf-8') as f:
                        metadata = json.load(f)

                    # Apply filters
                    if filters and not self._apply_filters(metadata, filters):
                        continue

                    # Load content if needed (for small documents)
                    content_file = self.content_path / f"{metadata['id']}.txt"
                    if content_file.exists():
                        with open(content_file, 'r', encoding='utf-8') as f:
                            content = f.read()
                    else:
                        content = ""

                    document = Document(
                        id=metadata["id"],
                        filename=metadata["filename"],
                        content=content,
                        doc_type=DocumentType(metadata["doc_type"]),
                        file_size=metadata["file_size"],
                        created_at=datetime.fromisoformat(metadata["created_at"]),
                        metadata=metadata.get("metadata", {}),
                        tags=metadata.get("tags", []),
                        summary=metadata.get("summary"),
                        category=metadata.get("category"),
                        language=metadata.get("language")
                    )

                    documents.append(document)

                except Exception as e:
                    logger.warning(f"Error loading document metadata from {metadata_file}: {str(e)}")
                    continue

            return documents

        except Exception as e:
            logger.error(f"Error listing documents: {str(e)}")
            return []

    def _apply_filters(self, metadata: Dict[str, Any], filters: Dict[str, Any]) -> bool:
        """Apply filters to document metadata"""
        try:
            for key, value in filters.items():
                if key == "doc_type":
                    if metadata.get("doc_type") != value:
                        return False
                elif key == "filename_contains":
                    if value.lower() not in metadata.get("filename", "").lower():
                        return False
                elif key == "created_after":
                    doc_date = datetime.fromisoformat(metadata.get("created_at", ""))
                    if doc_date < value:
                        return False
                elif key == "created_before":
                    doc_date = datetime.fromisoformat(metadata.get("created_at", ""))
                    if doc_date > value:
                        return False
                elif key == "tags":
                    doc_tags = set(metadata.get("tags", []))
                    required_tags = set(value) if isinstance(value, list) else {value}
                    if not required_tags.intersection(doc_tags):
                        return False
                elif key == "category":
                    if metadata.get("category") != value:
                        return False
                elif key == "language":
                    if metadata.get("language") != value:
                        return False

            return True
        except Exception as e:
|
204 |
+
logger.error(f"Error applying filters: {str(e)}")
|
205 |
+
return True
|
206 |
+
|
207 |
+
async def update_document_metadata(self, document_id: str, updates: Dict[str, Any]) -> bool:
|
208 |
+
"""Update document metadata"""
|
209 |
+
try:
|
210 |
+
metadata_file = self.metadata_path / f"{document_id}.json"
|
211 |
+
|
212 |
+
if not metadata_file.exists():
|
213 |
+
logger.warning(f"Document {document_id} not found")
|
214 |
+
return False
|
215 |
+
|
216 |
+
# Load existing metadata
|
217 |
+
with open(metadata_file, 'r', encoding='utf-8') as f:
|
218 |
+
metadata = json.load(f)
|
219 |
+
|
220 |
+
# Apply updates
|
221 |
+
for key, value in updates.items():
|
222 |
+
if key in ["tags", "summary", "category", "language", "metadata"]:
|
223 |
+
metadata[key] = value
|
224 |
+
|
225 |
+
# Save updated metadata
|
226 |
+
with open(metadata_file, 'w', encoding='utf-8') as f:
|
227 |
+
json.dump(metadata, f, indent=2, ensure_ascii=False)
|
228 |
+
|
229 |
+
# Update cache if document is cached
|
230 |
+
if document_id in self._cache:
|
231 |
+
document = self._cache[document_id]
|
232 |
+
for key, value in updates.items():
|
233 |
+
if hasattr(document, key):
|
234 |
+
setattr(document, key, value)
|
235 |
+
|
236 |
+
logger.info(f"Updated metadata for document {document_id}")
|
237 |
+
return True
|
238 |
+
|
239 |
+
except Exception as e:
|
240 |
+
logger.error(f"Error updating document metadata: {str(e)}")
|
241 |
+
return False
|
242 |
+
|
243 |
+
async def delete_document(self, document_id: str) -> bool:
|
244 |
+
"""Delete a document and its metadata"""
|
245 |
+
try:
|
246 |
+
metadata_file = self.metadata_path / f"{document_id}.json"
|
247 |
+
content_file = self.content_path / f"{document_id}.txt"
|
248 |
+
|
249 |
+
# Remove files
|
250 |
+
if metadata_file.exists():
|
251 |
+
metadata_file.unlink()
|
252 |
+
if content_file.exists():
|
253 |
+
content_file.unlink()
|
254 |
+
|
255 |
+
# Remove from cache
|
256 |
+
if document_id in self._cache:
|
257 |
+
del self._cache[document_id]
|
258 |
+
|
259 |
+
logger.info(f"Deleted document {document_id}")
|
260 |
+
return True
|
261 |
+
|
262 |
+
except Exception as e:
|
263 |
+
logger.error(f"Error deleting document {document_id}: {str(e)}")
|
264 |
+
return False
|
265 |
+
|
266 |
+
async def search_documents(self, query: str, fields: List[str] = None) -> List[Document]:
|
267 |
+
"""Simple text search across documents"""
|
268 |
+
if not fields:
|
269 |
+
fields = ["filename", "content", "tags", "summary"]
|
270 |
+
|
271 |
+
try:
|
272 |
+
matching_documents = []
|
273 |
+
query_lower = query.lower()
|
274 |
+
|
275 |
+
# Get all documents
|
276 |
+
all_documents = await self.list_documents(limit=1000) # Adjust limit as needed
|
277 |
+
|
278 |
+
for document in all_documents:
|
279 |
+
match_found = False
|
280 |
+
|
281 |
+
for field in fields:
|
282 |
+
field_value = getattr(document, field, "")
|
283 |
+
if isinstance(field_value, list):
|
284 |
+
field_value = " ".join(field_value)
|
285 |
+
elif field_value is None:
|
286 |
+
field_value = ""
|
287 |
+
|
288 |
+
if query_lower in str(field_value).lower():
|
289 |
+
match_found = True
|
290 |
+
break
|
291 |
+
|
292 |
+
if match_found:
|
293 |
+
matching_documents.append(document)
|
294 |
+
|
295 |
+
logger.info(f"Found {len(matching_documents)} documents matching '{query}'")
|
296 |
+
return matching_documents
|
297 |
+
|
298 |
+
except Exception as e:
|
299 |
+
logger.error(f"Error searching documents: {str(e)}")
|
300 |
+
return []
|
301 |
+
|
302 |
+
def _add_to_cache(self, document_id: str, document: Document):
|
303 |
+
"""Add document to cache with size limit"""
|
304 |
+
try:
|
305 |
+
# Remove oldest items if cache is full
|
306 |
+
if len(self._cache) >= self._cache_size_limit:
|
307 |
+
# Remove first item (FIFO)
|
308 |
+
oldest_key = next(iter(self._cache))
|
309 |
+
del self._cache[oldest_key]
|
310 |
+
|
311 |
+
self._cache[document_id] = document
|
312 |
+
except Exception as e:
|
313 |
+
logger.error(f"Error adding to cache: {str(e)}")
|
314 |
+
|
315 |
+
async def get_stats(self) -> Dict[str, Any]:
|
316 |
+
"""Get statistics about the document store"""
|
317 |
+
try:
|
318 |
+
metadata_files = list(self.metadata_path.glob("*.json"))
|
319 |
+
content_files = list(self.content_path.glob("*.txt"))
|
320 |
+
|
321 |
+
# Calculate total storage size
|
322 |
+
total_size = 0
|
323 |
+
for file_path in metadata_files + content_files:
|
324 |
+
total_size += file_path.stat().st_size
|
325 |
+
|
326 |
+
# Count by document type
|
327 |
+
type_counts = {}
|
328 |
+
for metadata_file in metadata_files:
|
329 |
+
try:
|
330 |
+
with open(metadata_file, 'r') as f:
|
331 |
+
metadata = json.load(f)
|
332 |
+
doc_type = metadata.get("doc_type", "unknown")
|
333 |
+
type_counts[doc_type] = type_counts.get(doc_type, 0) + 1
|
334 |
+
except:
|
335 |
+
continue
|
336 |
+
|
337 |
+
return {
|
338 |
+
"total_documents": len(metadata_files),
|
339 |
+
"total_size_bytes": total_size,
|
340 |
+
"total_size_mb": round(total_size / (1024 * 1024), 2),
|
341 |
+
"cache_size": len(self._cache),
|
342 |
+
"document_types": type_counts,
|
343 |
+
"storage_path": str(self.store_path),
|
344 |
+
"metadata_files": len(metadata_files),
|
345 |
+
"content_files": len(content_files)
|
346 |
+
}
|
347 |
+
except Exception as e:
|
348 |
+
logger.error(f"Error getting document store stats: {str(e)}")
|
349 |
+
return {"error": str(e)}
|
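A minimal usage sketch (not part of this commit) for the store above. It assumes core.models defines Document and DocumentType with the keyword fields used in get_document; the DocumentType.TEXT member and the "doc-001" id are hypothetical.

# Hypothetical usage sketch; DocumentType.TEXT is an assumed enum member.
import asyncio
from datetime import datetime

from core.models import Document, DocumentType
from services.document_store_service import DocumentStoreService

async def demo():
    store = DocumentStoreService()
    content = "Hello, document store."
    doc = Document(
        id="doc-001",
        filename="notes.txt",
        content=content,
        doc_type=DocumentType.TEXT,   # assumed member
        file_size=len(content),
        created_at=datetime.now(),
        metadata={},
        tags=["demo"],
        summary=None,
        category=None,
        language="en",
    )
    assert await store.store_document(doc)
    loaded = await store.get_document("doc-001")
    print(loaded.filename, len(loaded.content))

asyncio.run(demo())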
services/embedding_service.py
ADDED
@@ -0,0 +1,204 @@
import logging
import asyncio
from typing import List, Optional, Dict, Any
import numpy as np
from sentence_transformers import SentenceTransformer
import torch
import config

logger = logging.getLogger(__name__)

class EmbeddingService:
    def __init__(self):
        self.config = config.config
        self.model_name = self.config.EMBEDDING_MODEL
        self.model = None
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'

        # Load model eagerly at construction time
        self._load_model()

    def _load_model(self):
        """Load the embedding model"""
        try:
            logger.info(f"Loading embedding model: {self.model_name}")
            self.model = SentenceTransformer(self.model_name, device=self.device)
            logger.info(f"Embedding model loaded successfully on {self.device}")
        except Exception as e:
            logger.error(f"Failed to load embedding model: {str(e)}")
            # Fallback to a smaller model
            try:
                self.model_name = "all-MiniLM-L6-v2"
                self.model = SentenceTransformer(self.model_name, device=self.device)
                logger.info(f"Loaded fallback embedding model: {self.model_name}")
            except Exception as fallback_error:
                logger.error(f"Failed to load fallback model: {str(fallback_error)}")
                raise

    async def generate_embeddings(self, texts: List[str], batch_size: int = 32) -> List[List[float]]:
        """Generate embeddings for a list of texts"""
        if not texts:
            return []

        if self.model is None:
            raise RuntimeError("Embedding model not loaded")

        try:
            # Filter out empty texts
            non_empty_texts = [text for text in texts if text and text.strip()]
            if not non_empty_texts:
                logger.warning("No non-empty texts provided for embedding")
                return []

            logger.info(f"Generating embeddings for {len(non_empty_texts)} texts")

            # Process in batches to manage memory
            all_embeddings = []
            for i in range(0, len(non_empty_texts), batch_size):
                batch = non_empty_texts[i:i + batch_size]

                # Run embedding generation in thread pool to avoid blocking
                loop = asyncio.get_event_loop()
                batch_embeddings = await loop.run_in_executor(
                    None,
                    self._generate_batch_embeddings,
                    batch
                )
                all_embeddings.extend(batch_embeddings)

            logger.info(f"Generated {len(all_embeddings)} embeddings")
            return all_embeddings

        except Exception as e:
            logger.error(f"Error generating embeddings: {str(e)}")
            raise

    def _generate_batch_embeddings(self, texts: List[str]) -> List[List[float]]:
        """Generate embeddings for a batch of texts (synchronous)"""
        try:
            # Generate embeddings
            embeddings = self.model.encode(
                texts,
                convert_to_numpy=True,
                normalize_embeddings=True,
                batch_size=len(texts)
            )

            # Convert to list of lists
            return embeddings.tolist()
        except Exception as e:
            logger.error(f"Error in batch embedding generation: {str(e)}")
            raise

    async def generate_single_embedding(self, text: str) -> Optional[List[float]]:
        """Generate embedding for a single text"""
        if not text or not text.strip():
            return None

        try:
            embeddings = await self.generate_embeddings([text])
            return embeddings[0] if embeddings else None
        except Exception as e:
            logger.error(f"Error generating single embedding: {str(e)}")
            return None

    def get_embedding_dimension(self) -> int:
        """Get the dimension of embeddings produced by the model"""
        if self.model is None:
            raise RuntimeError("Embedding model not loaded")

        return self.model.get_sentence_embedding_dimension()

    def compute_similarity(self, embedding1: List[float], embedding2: List[float]) -> float:
        """Compute cosine similarity between two embeddings"""
        try:
            # Convert to numpy arrays
            emb1 = np.array(embedding1)
            emb2 = np.array(embedding2)

            # Compute cosine similarity
            similarity = np.dot(emb1, emb2) / (np.linalg.norm(emb1) * np.linalg.norm(emb2))

            return float(similarity)
        except Exception as e:
            logger.error(f"Error computing similarity: {str(e)}")
            return 0.0

    def compute_similarities(self, query_embedding: List[float], embeddings: List[List[float]]) -> List[float]:
        """Compute similarities between a query embedding and multiple embeddings"""
        try:
            query_emb = np.array(query_embedding)
            emb_matrix = np.array(embeddings)

            # Compute cosine similarities
            similarities = np.dot(emb_matrix, query_emb) / (
                np.linalg.norm(emb_matrix, axis=1) * np.linalg.norm(query_emb)
            )

            return similarities.tolist()
        except Exception as e:
            logger.error(f"Error computing similarities: {str(e)}")
            return [0.0] * len(embeddings)

    async def embed_chunks(self, chunks: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Embed a list of chunks and add embeddings to them"""
        if not chunks:
            return []

        try:
            # Extract texts
            texts = [chunk.get('content', '') for chunk in chunks]

            # Generate embeddings
            embeddings = await self.generate_embeddings(texts)

            # Add embeddings to chunks
            embedded_chunks = []
            for i, chunk in enumerate(chunks):
                if i < len(embeddings):
                    chunk_copy = chunk.copy()
                    chunk_copy['embedding'] = embeddings[i]
                    embedded_chunks.append(chunk_copy)
                else:
                    logger.warning(f"No embedding generated for chunk {i}")
                    embedded_chunks.append(chunk)

            return embedded_chunks
        except Exception as e:
            logger.error(f"Error embedding chunks: {str(e)}")
            raise

    def validate_embedding(self, embedding: List[float]) -> bool:
        """Validate that an embedding is properly formatted"""
        try:
            if not embedding:
                return False

            if not isinstance(embedding, list):
                return False

            if len(embedding) != self.get_embedding_dimension():
                return False

            # Check for NaN or infinite values
            emb_array = np.array(embedding)
            if np.isnan(emb_array).any() or np.isinf(emb_array).any():
                return False

            return True
        except Exception:
            return False

    async def get_model_info(self) -> Dict[str, Any]:
        """Get information about the loaded model"""
        try:
            return {
                "model_name": self.model_name,
                "device": self.device,
                "embedding_dimension": self.get_embedding_dimension(),
                "max_sequence_length": getattr(self.model, 'max_seq_length', 'unknown'),
                "model_loaded": self.model is not None
            }
        except Exception as e:
            logger.error(f"Error getting model info: {str(e)}")
            return {"error": str(e)}
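A minimal usage sketch (not part of this commit) exercising the service above. It downloads the configured sentence-transformers model on first run; all calls shown exist in the file.

# Hypothetical usage sketch for EmbeddingService.
import asyncio
from services.embedding_service import EmbeddingService

async def demo():
    svc = EmbeddingService()  # loads the model eagerly, may download weights
    embs = await svc.generate_embeddings(["vector search", "semantic retrieval"])
    print("dimension:", svc.get_embedding_dimension())
    # Embeddings are normalized, so cosine similarity equals the dot product
    print("similarity:", svc.compute_similarity(embs[0], embs[1]))

asyncio.run(demo())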
services/llm_service.py
ADDED
@@ -0,0 +1,285 @@
import logging
import asyncio
from typing import List, Dict, Any, Optional
import anthropic
from mistralai.client import MistralClient
import config

logger = logging.getLogger(__name__)

class LLMService:
    def __init__(self):
        self.config = config.config

        # Initialize clients
        self.anthropic_client = None
        self.mistral_client = None

        self._initialize_clients()

    def _initialize_clients(self):
        """Initialize LLM clients"""
        try:
            if self.config.ANTHROPIC_API_KEY:
                self.anthropic_client = anthropic.Anthropic(
                    api_key=self.config.ANTHROPIC_API_KEY
                )
                logger.info("Anthropic client initialized")

            if self.config.MISTRAL_API_KEY:
                self.mistral_client = MistralClient(
                    api_key=self.config.MISTRAL_API_KEY
                )
                logger.info("Mistral client initialized")

            if not self.anthropic_client and not self.mistral_client:
                raise ValueError("No LLM clients could be initialized. Check API keys.")

        except Exception as e:
            logger.error(f"Error initializing LLM clients: {str(e)}")
            raise

    async def generate_text(self, prompt: str, model: str = "auto", max_tokens: int = 1000, temperature: float = 0.7) -> str:
        """Generate text using the specified model"""
        try:
            if model == "auto":
                # Use Claude if available, otherwise Mistral
                if self.anthropic_client:
                    return await self._generate_with_claude(prompt, max_tokens, temperature)
                elif self.mistral_client:
                    return await self._generate_with_mistral(prompt, max_tokens, temperature)
                else:
                    raise ValueError("No LLM clients available")
            elif model.startswith("claude"):
                if not self.anthropic_client:
                    raise ValueError("Anthropic client not available")
                return await self._generate_with_claude(prompt, max_tokens, temperature)
            elif model.startswith("mistral"):
                if not self.mistral_client:
                    raise ValueError("Mistral client not available")
                return await self._generate_with_mistral(prompt, max_tokens, temperature)
            else:
                raise ValueError(f"Unsupported model: {model}")
        except Exception as e:
            logger.error(f"Error generating text: {str(e)}")
            raise

    async def _generate_with_claude(self, prompt: str, max_tokens: int, temperature: float) -> str:
        """Generate text using Claude"""
        try:
            loop = asyncio.get_event_loop()
            response = await loop.run_in_executor(
                None,
                lambda: self.anthropic_client.messages.create(
                    model=self.config.ANTHROPIC_MODEL,
                    max_tokens=max_tokens,
                    temperature=temperature,
                    messages=[
                        {"role": "user", "content": prompt}
                    ]
                )
            )

            return response.content[0].text
        except Exception as e:
            logger.error(f"Error with Claude generation: {str(e)}")
            raise

    async def _generate_with_mistral(self, prompt: str, max_tokens: int, temperature: float) -> str:
        """Generate text using Mistral"""
        try:
            loop = asyncio.get_event_loop()
            response = await loop.run_in_executor(
                None,
                lambda: self.mistral_client.chat(
                    model=self.config.MISTRAL_MODEL,
                    messages=[{"role": "user", "content": prompt}],
                    max_tokens=max_tokens,
                    temperature=temperature
                )
            )

            return response.choices[0].message.content
        except Exception as e:
            logger.error(f"Error with Mistral generation: {str(e)}")
            raise

    async def summarize(self, text: str, style: str = "concise", max_length: Optional[int] = None) -> str:
        """Generate a summary of the given text"""
        if not text.strip():
            return ""

        # Create style-specific prompts
        style_prompts = {
            "concise": "Provide a concise summary of the following text, focusing on the main points:",
            "detailed": "Provide a detailed summary of the following text, including key details and supporting information:",
            "bullet_points": "Summarize the following text as a list of bullet points highlighting the main ideas:",
            "executive": "Provide an executive summary of the following text, focusing on key findings and actionable insights:"
        }

        prompt_template = style_prompts.get(style, style_prompts["concise"])

        if max_length:
            prompt_template += f" Keep the summary under {max_length} words."

        prompt = f"{prompt_template}\n\nText to summarize:\n{text}\n\nSummary:"

        try:
            summary = await self.generate_text(prompt, max_tokens=500, temperature=0.3)
            return summary.strip()
        except Exception as e:
            logger.error(f"Error generating summary: {str(e)}")
            return "Error generating summary"

    async def generate_tags(self, text: str, max_tags: int = 5) -> List[str]:
        """Generate relevant tags for the given text"""
        if not text.strip():
            return []

        prompt = f"""Generate {max_tags} relevant tags for the following text.
Tags should be concise, descriptive keywords or phrases that capture the main topics, themes, or concepts.
Return only the tags, separated by commas.

Text:
{text}

Tags:"""

        try:
            response = await self.generate_text(prompt, max_tokens=100, temperature=0.5)

            # Parse tags from response
            tags = [tag.strip() for tag in response.split(',')]
            tags = [tag for tag in tags if tag and len(tag) > 1]

            return tags[:max_tags]
        except Exception as e:
            logger.error(f"Error generating tags: {str(e)}")
            return []

    async def categorize(self, text: str, categories: List[str]) -> str:
        """Categorize text into one of the provided categories"""
        if not text.strip() or not categories:
            return "Uncategorized"

        categories_str = ", ".join(categories)

        prompt = f"""Classify the following text into one of these categories: {categories_str}

Choose the most appropriate category based on the content and main theme of the text.
Return only the category name, nothing else.

Text to classify:
{text}

Category:"""

        try:
            response = await self.generate_text(prompt, max_tokens=50, temperature=0.1)
            category = response.strip()

            # Validate that the response is one of the provided categories
            if category in categories:
                return category
            else:
                # Try to find a close match
                category_lower = category.lower()
                for cat in categories:
                    if cat.lower() in category_lower or category_lower in cat.lower():
                        return cat

                return categories[0] if categories else "Uncategorized"
        except Exception as e:
            logger.error(f"Error categorizing text: {str(e)}")
            return "Uncategorized"

    async def answer_question(self, question: str, context: str, max_context_length: int = 2000) -> str:
        """Answer a question based on the provided context"""
        if not question.strip():
            return "No question provided"

        if not context.strip():
            return "I don't have enough context to answer this question. Please provide more relevant information."

        # Truncate context if too long
        if len(context) > max_context_length:
            context = context[:max_context_length] + "..."

        prompt = f"""Based on the following context, answer the question. If the context doesn't contain enough information to answer the question completely, say so and provide what information you can.

Context:
{context}

Question: {question}

Answer:"""

        try:
            answer = await self.generate_text(prompt, max_tokens=300, temperature=0.3)
            return answer.strip()
        except Exception as e:
            logger.error(f"Error answering question: {str(e)}")
            return "I encountered an error while trying to answer your question."

    async def extract_key_information(self, text: str) -> Dict[str, Any]:
        """Extract key information from text"""
        if not text.strip():
            return {}

        prompt = f"""Analyze the following text and extract key information. Provide the response in the following format:

Main Topic: [main topic or subject]
Key Points: [list 3-5 key points]
Entities: [important people, places, organizations mentioned]
Sentiment: [positive/neutral/negative]
Content Type: [article/document/email/report/etc.]

Text to analyze:
{text}

Analysis:"""

        try:
            response = await self.generate_text(prompt, max_tokens=400, temperature=0.4)

            # Parse the structured response
            info = {}
            lines = response.split('\n')

            for line in lines:
                if ':' in line:
                    key, value = line.split(':', 1)
                    key = key.strip().lower().replace(' ', '_')
                    value = value.strip()
                    if value:
                        info[key] = value

            return info
        except Exception as e:
            logger.error(f"Error extracting key information: {str(e)}")
            return {}

    async def check_availability(self) -> Dict[str, bool]:
        """Check which LLM services are available"""
        availability = {
            "anthropic": False,
            "mistral": False
        }

        try:
            if self.anthropic_client:
                # Test Claude availability with a simple request
                test_response = await self._generate_with_claude("Hello", 10, 0.1)
                availability["anthropic"] = bool(test_response)
        except Exception:
            pass

        try:
            if self.mistral_client:
                # Test Mistral availability with a simple request
                test_response = await self._generate_with_mistral("Hello", 10, 0.1)
                availability["mistral"] = bool(test_response)
        except Exception:
            pass

        return availability
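A minimal usage sketch (not part of this commit) for the service above. It assumes ANTHROPIC_API_KEY or MISTRAL_API_KEY is set in config; otherwise the constructor raises, as the code shows.

# Hypothetical usage sketch for LLMService; requires at least one API key in config.
import asyncio
from services.llm_service import LLMService

async def demo():
    llm = LLMService()
    print(await llm.check_availability())
    text = "FAISS is a library for efficient similarity search over dense vectors."
    # "auto" prefers Claude when configured, otherwise falls back to Mistral
    print(await llm.summarize(text, style="bullet_points"))
    print(await llm.generate_tags(text, max_tags=3))

asyncio.run(demo())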
services/ocr_service.py
ADDED
@@ -0,0 +1,324 @@
import logging
from typing import Optional, List, Dict, Any
import asyncio
from pathlib import Path
import tempfile
import os

from PIL import Image
import pytesseract
import config

logger = logging.getLogger(__name__)

class OCRService:
    def __init__(self):
        self.config = config.config

        # Configure Tesseract path if specified
        if self.config.TESSERACT_PATH:
            pytesseract.pytesseract.tesseract_cmd = self.config.TESSERACT_PATH

        self.language = self.config.OCR_LANGUAGE

        # Test OCR availability
        self._test_ocr_availability()

    def _test_ocr_availability(self):
        """Test if OCR is available and working"""
        try:
            # Create a simple test image
            test_image = Image.new('RGB', (100, 30), color='white')
            pytesseract.image_to_string(test_image)
            logger.info("OCR service initialized successfully")
        except Exception as e:
            logger.warning(f"OCR may not be available: {str(e)}")

    async def extract_text_from_image(self, image_path: str, language: Optional[str] = None) -> str:
        """Extract text from an image file"""
        try:
            # Use specified language or default
            lang = language or self.language

            # Load image
            image = Image.open(image_path)

            # Perform OCR in thread pool to avoid blocking
            loop = asyncio.get_event_loop()
            text = await loop.run_in_executor(
                None,
                self._extract_text_sync,
                image,
                lang
            )

            return text.strip()

        except Exception as e:
            logger.error(f"Error extracting text from image {image_path}: {str(e)}")
            return ""

    def _extract_text_sync(self, image: Image.Image, language: str) -> str:
        """Synchronous text extraction"""
        try:
            # Optimize image for OCR
            processed_image = self._preprocess_image(image)

            # Configure OCR
            config_string = '--psm 6'  # Assume a single uniform block of text

            # Extract text
            text = pytesseract.image_to_string(
                processed_image,
                lang=language,
                config=config_string
            )

            return text
        except Exception as e:
            logger.error(f"Error in synchronous OCR: {str(e)}")
            return ""

    def _preprocess_image(self, image: Image.Image) -> Image.Image:
        """Preprocess image to improve OCR accuracy"""
        try:
            # Convert to grayscale if not already
            if image.mode != 'L':
                image = image.convert('L')

            # Resize image if too small (OCR works better on larger images)
            width, height = image.size
            if width < 300 or height < 300:
                scale_factor = max(300 / width, 300 / height)
                new_width = int(width * scale_factor)
                new_height = int(height * scale_factor)
                image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)

            return image
        except Exception as e:
            logger.error(f"Error preprocessing image: {str(e)}")
            return image

    async def extract_text_from_pdf_images(self, pdf_path: str) -> List[str]:
        """Extract text from PDF by converting pages to images and running OCR"""
        try:
            import fitz  # PyMuPDF

            texts = []

            # Open PDF
            pdf_document = fitz.open(pdf_path)

            for page_num in range(len(pdf_document)):
                try:
                    # Get page
                    page = pdf_document[page_num]

                    # Convert page to image
                    mat = fitz.Matrix(2.0, 2.0)  # Scale factor for better quality
                    pix = page.get_pixmap(matrix=mat)
                    img_data = pix.tobytes("ppm")

                    # Create PIL image from bytes
                    with tempfile.NamedTemporaryFile(suffix='.ppm', delete=False) as tmp_file:
                        tmp_file.write(img_data)
                        tmp_file.flush()

                        # Extract text from image
                        page_text = await self.extract_text_from_image(tmp_file.name)
                        texts.append(page_text)

                        # Clean up temporary file
                        os.unlink(tmp_file.name)

                except Exception as e:
                    logger.warning(f"Error processing PDF page {page_num}: {str(e)}")
                    texts.append("")

            pdf_document.close()
            return texts

        except ImportError:
            logger.error("PyMuPDF not available for PDF OCR")
            return []
        except Exception as e:
            logger.error(f"Error extracting text from PDF images: {str(e)}")
            return []

    async def extract_text_with_confidence(self, image_path: str, min_confidence: float = 0.5) -> Dict[str, Any]:
        """Extract text with confidence scores"""
        try:
            image = Image.open(image_path)

            # Get detailed OCR data with confidence scores
            loop = asyncio.get_event_loop()
            ocr_data = await loop.run_in_executor(
                None,
                self._extract_detailed_data,
                image
            )

            # Filter by confidence
            filtered_text = []
            word_confidences = []

            for i, confidence in enumerate(ocr_data.get('conf', [])):
                if confidence > min_confidence * 100:  # Tesseract uses a 0-100 scale
                    text = ocr_data.get('text', [])[i]
                    if text.strip():
                        filtered_text.append(text)
                        word_confidences.append(confidence / 100.0)  # Convert to 0-1 scale

            return {
                "text": " ".join(filtered_text),
                "confidence": sum(word_confidences) / len(word_confidences) if word_confidences else 0.0,
                "word_count": len(filtered_text),
                "raw_data": ocr_data
            }

        except Exception as e:
            logger.error(f"Error extracting text with confidence: {str(e)}")
            return {
                "text": "",
                "confidence": 0.0,
                "word_count": 0,
                "error": str(e)
            }

    def _extract_detailed_data(self, image: Image.Image) -> Dict[str, Any]:
        """Extract detailed OCR data with positions and confidence"""
        try:
            processed_image = self._preprocess_image(image)

            # Get detailed data
            data = pytesseract.image_to_data(
                processed_image,
                lang=self.language,
                config='--psm 6',
                output_type=pytesseract.Output.DICT
            )

            return data
        except Exception as e:
            logger.error(f"Error extracting detailed OCR data: {str(e)}")
            return {}

    async def detect_language(self, image_path: str) -> str:
        """Detect the language of text in an image"""
        try:
            image = Image.open(image_path)

            # Run script detection (OSD)
            loop = asyncio.get_event_loop()
            languages = await loop.run_in_executor(
                None,
                pytesseract.image_to_osd,
                image
            )

            # Parse the OSD output to get the detected script
            for line in languages.split('\n'):
                if 'Script:' in line:
                    script = line.split(':')[1].strip()
                    # Map script to language code
                    script_to_lang = {
                        'Latin': 'eng',
                        'Arabic': 'ara',
                        'Chinese': 'chi_sim',
                        'Japanese': 'jpn',
                        'Korean': 'kor'
                    }
                    return script_to_lang.get(script, 'eng')

            return 'eng'  # Default to English

        except Exception as e:
            logger.error(f"Error detecting language: {str(e)}")
            return 'eng'

    async def extract_tables_from_image(self, image_path: str) -> List[List[str]]:
        """Extract table data from an image"""
        try:
            # This is a basic implementation.
            # For better table extraction, consider using specialized libraries like table-transformer.

            image = Image.open(image_path)

            # Use a PSM setting that preserves inter-word spacing for tables
            loop = asyncio.get_event_loop()
            text = await loop.run_in_executor(
                None,
                lambda: pytesseract.image_to_string(
                    image,
                    lang=self.language,
                    config='--psm 6 -c preserve_interword_spaces=1'
                )
            )

            # Simple table parsing (assumes whitespace-separated cells)
            lines = text.split('\n')
            table_data = []

            for line in lines:
                if line.strip():
                    # Split on whitespace
                    cells = [cell.strip() for cell in line.split() if cell.strip()]
                    if cells:
                        table_data.append(cells)

            return table_data

        except Exception as e:
            logger.error(f"Error extracting tables from image: {str(e)}")
            return []

    async def get_supported_languages(self) -> List[str]:
        """Get list of supported OCR languages"""
        try:
            languages = pytesseract.get_languages()
            return sorted(languages)
        except Exception as e:
            logger.error(f"Error getting supported languages: {str(e)}")
            return ['eng']  # Default to English only

    async def validate_ocr_setup(self) -> Dict[str, Any]:
        """Validate OCR setup and return status"""
        try:
            # Test basic functionality
            test_image = Image.new('RGB', (200, 50), color='white')

            from PIL import ImageDraw, ImageFont
            draw = ImageDraw.Draw(test_image)

            try:
                # Try to render sample text with the default font
                draw.text((10, 10), "Test OCR", fill='black')
            except Exception:
                # Fall back to shorter text
                draw.text((10, 10), "Test", fill='black')

            # Test OCR
            result = pytesseract.image_to_string(test_image)

            # Get available languages
            languages = await self.get_supported_languages()

            return {
                "status": "operational",
                "tesseract_version": pytesseract.get_tesseract_version(),
                "available_languages": languages,
                "current_language": self.language,
                "test_result": result.strip(),
                "tesseract_path": pytesseract.pytesseract.tesseract_cmd
            }

        except Exception as e:
            return {
                "status": "error",
                "error": str(e),
                "tesseract_path": pytesseract.pytesseract.tesseract_cmd
            }

    def extract_text(self, file_path):
        # Dummy implementation for OCR
        return "OCR functionality not implemented yet."
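A minimal usage sketch (not part of this commit) for the service above. It requires a local Tesseract install, and "scan.png" is a hypothetical input file.

# Hypothetical usage sketch for OCRService; "scan.png" is a placeholder path.
import asyncio
from services.ocr_service import OCRService

async def demo():
    ocr = OCRService()
    print(await ocr.validate_ocr_setup())
    text = await ocr.extract_text_from_image("scan.png")
    print(text[:200])

asyncio.run(demo())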
services/vector_store_service.py
ADDED
@@ -0,0 +1,285 @@
import logging
import os
import pickle
import numpy as np
from typing import List, Dict, Any, Optional, Tuple
import faiss
from pathlib import Path
import asyncio
import json

from core.models import SearchResult, Chunk
import config

logger = logging.getLogger(__name__)

class VectorStoreService:
    def __init__(self):
        self.config = config.config
        self.index = None
        self.chunks_metadata = {}  # Maps index position to chunk metadata
        self.dimension = None

        # Paths
        self.store_path = Path(self.config.VECTOR_STORE_PATH)
        self.store_path.mkdir(parents=True, exist_ok=True)

        self.index_path = self.store_path / f"{self.config.INDEX_NAME}.index"
        self.metadata_path = self.store_path / f"{self.config.INDEX_NAME}_metadata.json"

        # Load existing index if available
        self._load_index()

    def _load_index(self):
        """Load existing FAISS index and metadata"""
        try:
            if self.index_path.exists() and self.metadata_path.exists():
                logger.info("Loading existing FAISS index...")

                # Load FAISS index
                self.index = faiss.read_index(str(self.index_path))
                self.dimension = self.index.d

                # Load metadata
                with open(self.metadata_path, 'r') as f:
                    self.chunks_metadata = json.load(f)

                logger.info(f"Loaded index with {self.index.ntotal} vectors, dimension {self.dimension}")
            else:
                logger.info("No existing index found, will create new one")
        except Exception as e:
            logger.error(f"Error loading index: {str(e)}")

    def _initialize_index(self, dimension: int):
        """Initialize a new FAISS index"""
        try:
            # Use IndexFlatIP for cosine similarity (since embeddings are normalized)
            self.index = faiss.IndexFlatIP(dimension)
            self.dimension = dimension
            self.chunks_metadata = {}
            logger.info(f"Initialized new FAISS index with dimension {dimension}")
        except Exception as e:
            logger.error(f"Error initializing index: {str(e)}")
            raise

    async def add_chunks(self, chunks: List[Chunk]) -> bool:
        """Add chunks to the vector store"""
        if not chunks:
            return True

        try:
            # Extract embeddings and metadata
            embeddings = []
            new_metadata = {}

            for chunk in chunks:
                if chunk.embedding and len(chunk.embedding) > 0:
                    embeddings.append(chunk.embedding)
                    # Store metadata using the current index position
                    current_index = len(self.chunks_metadata) + len(embeddings) - 1
                    new_metadata[str(current_index)] = {
                        "chunk_id": chunk.id,
                        "document_id": chunk.document_id,
                        "content": chunk.content,
                        "chunk_index": chunk.chunk_index,
                        "start_pos": chunk.start_pos,
                        "end_pos": chunk.end_pos,
                        "metadata": chunk.metadata
                    }

            if not embeddings:
                logger.warning("No valid embeddings found in chunks")
                return False

            # Initialize index if needed
            if self.index is None:
                self._initialize_index(len(embeddings[0]))

            # Convert to numpy array
            embeddings_array = np.array(embeddings, dtype=np.float32)

            # Add to FAISS index
            self.index.add(embeddings_array)

            # Update metadata
            self.chunks_metadata.update(new_metadata)

            # Save index and metadata
            await self._save_index()

            logger.info(f"Added {len(embeddings)} chunks to vector store")
            return True

        except Exception as e:
            logger.error(f"Error adding chunks to vector store: {str(e)}")
            return False

    async def search(self, query_embedding: List[float], top_k: int = 5,
                     filters: Optional[Dict[str, Any]] = None) -> List[SearchResult]:
        """Search for similar chunks"""
        if self.index is None or self.index.ntotal == 0:
            logger.warning("No index available or index is empty")
            return []

        try:
            # Convert query embedding to numpy array
            query_array = np.array([query_embedding], dtype=np.float32)

            # Perform search
            scores, indices = self.index.search(query_array, min(top_k, self.index.ntotal))

            # Convert results to SearchResult objects
            results = []
            for score, idx in zip(scores[0], indices[0]):
                if idx == -1:  # FAISS returns -1 for empty slots
                    continue

                chunk_metadata = self.chunks_metadata.get(str(idx))
                if chunk_metadata:
                    # Apply filters if specified
                    if filters and not self._apply_filters(chunk_metadata, filters):
                        continue

                    result = SearchResult(
                        chunk_id=chunk_metadata["chunk_id"],
                        document_id=chunk_metadata["document_id"],
                        content=chunk_metadata["content"],
                        score=float(score),
                        metadata=chunk_metadata.get("metadata", {})
                    )
                    results.append(result)

            # Sort by score (descending)
            results.sort(key=lambda x: x.score, reverse=True)

            logger.info(f"Found {len(results)} search results")
            return results

        except Exception as e:
            logger.error(f"Error searching vector store: {str(e)}")
            return []

    def _apply_filters(self, chunk_metadata: Dict[str, Any], filters: Dict[str, Any]) -> bool:
        """Apply filters to chunk metadata"""
        try:
            for key, value in filters.items():
                if key == "document_id":
                    if chunk_metadata.get("document_id") != value:
                        return False
                elif key == "document_ids":
                    if chunk_metadata.get("document_id") not in value:
                        return False
                elif key == "content_length_min":
                    if len(chunk_metadata.get("content", "")) < value:
                        return False
                elif key == "content_length_max":
                    if len(chunk_metadata.get("content", "")) > value:
                        return False
                # Add more filter types as needed

            return True
        except Exception as e:
            logger.error(f"Error applying filters: {str(e)}")
            return True

    async def _save_index(self):
        """Save the FAISS index and metadata to disk"""
        try:
            if self.index is not None:
                # Save FAISS index
                faiss.write_index(self.index, str(self.index_path))

                # Save metadata
                with open(self.metadata_path, 'w') as f:
                    json.dump(self.chunks_metadata, f, indent=2)

                logger.debug("Saved index and metadata to disk")
        except Exception as e:
            logger.error(f"Error saving index: {str(e)}")

    async def get_stats(self) -> Dict[str, Any]:
        """Get statistics about the vector store"""
        try:
            return {
                "total_vectors": self.index.ntotal if self.index else 0,
                "dimension": self.dimension,
                "index_type": type(self.index).__name__ if self.index else None,
                "metadata_entries": len(self.chunks_metadata),
                "index_file_exists": self.index_path.exists(),
                "metadata_file_exists": self.metadata_path.exists()
            }
        except Exception as e:
            logger.error(f"Error getting stats: {str(e)}")
            return {"error": str(e)}

    async def delete_document(self, document_id: str) -> bool:
        """Delete all chunks for a specific document"""
        try:
            # Find indices to remove
            indices_to_remove = []
            for idx, metadata in self.chunks_metadata.items():
                if metadata.get("document_id") == document_id:
                    indices_to_remove.append(int(idx))

            if not indices_to_remove:
                logger.warning(f"No chunks found for document {document_id}")
                return False

            # FAISS doesn't support removing individual vectors efficiently
            # We need to rebuild the index without the removed vectors
            if self.index and self.index.ntotal > 0:
                # Get all embeddings except the ones to remove
                all_embeddings = []
                new_metadata = {}
                new_index = 0

                for old_idx in range(self.index.ntotal):
                    if old_idx not in indices_to_remove:
                        # Get the embedding from FAISS
                        embedding = self.index.reconstruct(old_idx)
                        all_embeddings.append(embedding)

                        # Update metadata with new index
                        old_metadata = self.chunks_metadata.get(str(old_idx))
                        if old_metadata:
                            new_metadata[str(new_index)] = old_metadata
                        new_index += 1

                # Rebuild index
                if all_embeddings:
                    self._initialize_index(self.dimension)
                    embeddings_array = np.array(all_embeddings, dtype=np.float32)
                    self.index.add(embeddings_array)
                    self.chunks_metadata = new_metadata
                else:
                    # No embeddings left, create empty index
                    self._initialize_index(self.dimension)

            # Save updated index
            await self._save_index()

            logger.info(f"Deleted {len(indices_to_remove)} chunks for document {document_id}")
            return True

        except Exception as e:
            logger.error(f"Error deleting document chunks: {str(e)}")
            return False

    async def clear_all(self) -> bool:
        """Clear all data from the vector store"""
        try:
            self.index = None
            self.chunks_metadata = {}
            self.dimension = None

            # Remove files
            if self.index_path.exists():
                self.index_path.unlink()
            if self.metadata_path.exists():
                self.metadata_path.unlink()

            logger.info("Cleared all data from vector store")
            return True
        except Exception as e:
            logger.error(f"Error clearing vector store: {str(e)}")
            return False
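A minimal end-to-end sketch (not part of this commit) tying EmbeddingService to the vector store above. It assumes core.models.Chunk accepts the keyword fields referenced in add_chunks; the ids and text are hypothetical.

# Hypothetical end-to-end sketch: embed one chunk, index it, then search.
import asyncio
from core.models import Chunk
from services.embedding_service import EmbeddingService
from services.vector_store_service import VectorStoreService

async def demo():
    embedder = EmbeddingService()
    store = VectorStoreService()

    text = "FAISS indexes normalized embeddings for cosine search."
    chunk = Chunk(
        id="chunk-0",
        document_id="doc-001",
        content=text,
        chunk_index=0,
        start_pos=0,
        end_pos=len(text),
        metadata={},
        embedding=await embedder.generate_single_embedding(text),
    )
    await store.add_chunks([chunk])

    # With IndexFlatIP over normalized vectors, scores are cosine similarities
    query = await embedder.generate_single_embedding("cosine similarity search")
    for hit in await store.search(query, top_k=3):
        print(round(hit.score, 3), hit.content[:60])

asyncio.run(demo())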