Nihal2000 committed on
Commit
8ba2581
·
1 Parent(s): 3e772ec

Server initialization

Browse files
.gitignore CHANGED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Ignore environment variables
2
+ .env
3
+
4
+ # Ignore Python cache files
5
+ __pycache__/
6
+ *.py[cod]
7
+ *.pyo
8
+ *.pyd
9
+
10
+ # Ignore Jupyter notebook checkpoints
11
+ .ipynb_checkpoints/
12
+
13
+ # Ignore virtual environment folders
14
+ env/
15
+ venv/
16
+ ENV/
17
+ VENV/
18
+
19
+ # Ignore VSCode-specific files
20
+ .vscode/
21
+
22
+ # Ignore OS-specific files
23
+ .DS_Store
24
+ Thumbs.db
25
+
26
+ # Ignore database or app data
27
+ db/
28
+ *.sqlite3
29
+
30
+ # Ignore Gradio temp files
31
+ gradio_cached_examples/
32
+ tmp/
33
+ *.log
app.py CHANGED
@@ -1,157 +1,254 @@
1
- import os
2
- import uuid
3
  import gradio as gr
4
- from gradio import components
5
- from fastmcp import FastMCP
6
- # from core.parser import parse_document, parse_url
7
- from core.parser import parse_document, parse_url
8
- from core.summarizer import summarize_content, tag_content
9
- from core.storage import add_document, search_documents
10
- from core.agent import answer_question
11
- # from core.components import DocumentViewer
12
- import plotly.graph_objects as go
13
-
14
- # Initialize the FastMCP server (for agentic tools)
15
- mcp = FastMCP("IntelligentContentOrganizer")
16
-
17
- # Gradio UI functions
18
- def process_content(file_obj, url, tags_input):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  """
20
- Handle file upload or URL input: parse content, summarize, tag, store.
21
- """
22
- content_text = ""
23
- source = ""
24
- if file_obj is not None:
25
- # Save uploaded file to temp path
26
- file_path = file_obj.name
27
- content_text = parse_document(file_path)
28
- source = file_obj.name
29
- elif url:
30
- content_text = parse_url(url)
31
- source = url
32
- else:
33
- return "No document provided.", "", "", ""
34
-
35
- # Summarize and tag (simulated)
36
- summary = summarize_content(content_text)
37
- tags = tag_content(content_text)
38
-
39
- # Allow user to override or confirm tags via input
40
- if tags_input:
41
- # If user entered new tags, split by comma
42
- tags = [t.strip() for t in tags_input.split(",") if t.strip() != ""]
43
-
44
- # Store in ChromaDB with a unique ID
45
- doc_id = str(uuid.uuid4())
46
- metadata = {"source": source, "tags": tags}
47
- add_document(doc_id, content_text, metadata)
48
-
49
- return content_text, summary, ", ".join(tags), f"Document stored with ID: {doc_id}"
50
-
51
- def generate_graph():
52
- """
53
- Create a simple Plotly graph of documents.
54
- Nodes = documents, edges = shared tags.
55
- """
56
- # Fetch all documents from ChromaDB
57
- from core.storage import get_all_documents
58
- docs = get_all_documents()
59
- if not docs:
60
- return go.Figure() # empty
61
-
62
- # Build graph connections: if two docs share a tag, connect them
63
- nodes = {doc["id"]: doc for doc in docs}
64
- edges = []
65
- for i, doc1 in enumerate(docs):
66
- for doc2 in docs[i+1:]:
67
- shared_tags = set(doc1["metadata"]["tags"]) & set(doc2["metadata"]["tags"])
68
- if shared_tags:
69
- edges.append((doc1["id"], doc2["id"]))
70
-
71
- # Use networkx to compute layout (or simple fixed positions)
72
- import networkx as nx
73
- G = nx.Graph()
74
- G.add_nodes_from(nodes.keys())
75
- G.add_edges_from(edges)
76
- pos = nx.spring_layout(G, seed=42)
77
-
78
- # Create Plotly traces
79
- edge_x = []
80
- edge_y = []
81
- for (src, dst) in edges:
82
- x0, y0 = pos[src]
83
- x1, y1 = pos[dst]
84
- edge_x += [x0, x1, None]
85
- edge_y += [y0, y1, None]
86
- edge_trace = go.Scatter(
87
- x=edge_x, y=edge_y,
88
- line=dict(width=1, color='#888'),
89
- hoverinfo='none',
90
- mode='lines')
91
-
92
- node_x = []
93
- node_y = []
94
- node_text = []
95
- for node_id in G.nodes():
96
- x, y = pos[node_id]
97
- node_x.append(x)
98
- node_y.append(y)
99
- text = nodes[node_id]["metadata"].get("source", "")
100
- node_text.append(f"{text}\nTags: {nodes[node_id]['metadata']['tags']}")
101
-
102
- node_trace = go.Scatter(
103
- x=node_x, y=node_y,
104
- mode='markers+text',
105
- marker=dict(size=10, color='skyblue'),
106
- text=node_text, hoverinfo='text', textposition="bottom center")
107
-
108
- fig = go.Figure(data=[edge_trace, node_trace],
109
- layout=go.Layout(title="Document Knowledge Graph",
110
- showlegend=False,
111
- margin=dict(l=20, r=20, b=20, t=30)))
112
- return fig
113
-
114
- def handle_query(question):
115
- """
116
- Answer a user question by retrieving relevant documents and summarizing them.
117
- """
118
- if not question:
119
- return "Please enter a question."
120
-
121
- answer = answer_question(question)
122
- return answer
123
-
124
- # Build Gradio interface with Blocks
125
- with gr.Blocks(title="Intelligent Content Organizer") as demo:
126
- gr.Markdown("# Intelligent Content Organizer")
127
- with gr.Tab("Upload / Fetch Content"):
128
- gr.Markdown("**Add a document:** Upload a file or enter a URL.")
129
- with gr.Row():
130
- file_in = gr.File(label="Upload Document (PDF, TXT, etc.)")
131
- url_in = gr.Textbox(label="Document URL", placeholder="https://example.com/article")
132
- tags_in = gr.Textbox(label="Tags (comma-separated)", placeholder="Enter tags or leave blank")
133
- process_btn = gr.Button("Parse & Add Document")
134
- doc_view = gr.Textbox(label="Document Preview", lines=10, interactive=False)
135
- summary_out = gr.Textbox(label="Summary", interactive=False)
136
- tags_out = gr.Textbox(label="Detected Tags", interactive=False)
137
- status_out = gr.Textbox(label="Status/Info", interactive=False)
138
- process_btn.click(fn=process_content, inputs=[file_in, url_in, tags_in],
139
- outputs=[doc_view, summary_out, tags_out, status_out])
140
-
141
- with gr.Tab("Knowledge Graph"):
142
- gr.Markdown("**Document relationships:** Shared tags indicate edges.")
143
- graph_plot = gr.Plot(label="Knowledge Graph")
144
- refresh_btn = gr.Button("Refresh Graph")
145
- refresh_btn.click(fn=generate_graph, inputs=None, outputs=graph_plot)
146
-
147
- with gr.Tab("Ask a Question"):
148
- gr.Markdown("**AI Q&A:** Ask a question about your documents.")
149
- question_in = gr.Textbox(label="Your Question")
150
- answer_out = gr.Textbox(label="Answer", interactive=False)
151
- ask_btn = gr.Button("Get Answer")
152
- ask_btn.click(fn=handle_query, inputs=question_in, outputs=answer_out)
153
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
154
  if __name__ == "__main__":
155
- # Launch Gradio app (Hugging Face Spaces will auto-launch this)
156
- # demo.queue().launch(server_name="0.0.0.0", server_port=int(os.getenv("PORT", 7860)))
157
- demo.launch(mcp_server=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
2
+ import asyncio
3
+ from pathlib import Path
4
+ import tempfile
5
+ import json
6
+ from typing import List, Dict, Any
7
+ import logging
8
+
9
+ from config import Config
10
# Import the MCP server defensively so the Gradio UI can still start in
# standalone mode when mcp_server cannot be imported. The import must live
# only inside the try block: an unguarded import ahead of it would raise
# before the fallback below ever gets a chance to run.
try:
    from mcp_server import mcp
    MCP_AVAILABLE = True
except ImportError:
    # Keep the name bound so later references fail predictably, not with NameError
    mcp = None
    MCP_AVAILABLE = False
    print("⚠️ MCP server not available, running in standalone mode")
18
+
19
+ import mcp_tools
20
+
21
+ # Set up logging
22
+ logging.basicConfig(level=logging.INFO)
23
+ logger = logging.getLogger(__name__)
24
+
25
+ # Validate configuration on startup
26
+ try:
27
+ Config.validate()
28
+ except ValueError as e:
29
+ logger.error(f"Configuration error: {e}")
30
+ print(f"⚠️ Configuration error: {e}")
31
+ print("Please set the required API keys in your environment variables or .env file")
32
+
33
+ # Global state for search results
34
+ current_results = []
35
+
36
async def process_file_handler(file):
    """Process an uploaded file and build the values for the file tab outputs.

    Args:
        file: Value supplied by the upload component. ``None`` means nothing
            was uploaded. Either a plain path string or an object carrying
            the path in its ``name`` attribute is accepted.

    Returns:
        A 4-tuple ``(status, summary, tags, result_card)``. On success the
        last item is a ``gr.update`` that reveals the Markdown card; on any
        failure the summary and tags are empty strings and the card is
        ``None``.
    """
    if file is None:
        return "Please upload a file", "", "", None

    # Fall back to the value itself when it has no ``name`` attribute
    file_path = getattr(file, "name", file)

    try:
        result = await mcp_tools.process_local_file(file_path)

        if not result.get("success"):
            return f"❌ Error: {result.get('error', 'Unknown error')}", "", "", None

        # Optional keys are read with defaults so that a successful result
        # missing one of them is not reported to the user as a failure.
        return (
            f"✅ Successfully processed: {result.get('file_name', file_path)}",
            result.get("summary", ""),
            ", ".join(result.get("tags") or []),
            gr.update(visible=True, value=create_result_card(result)),
        )
    except Exception as e:
        # UI boundary: show the failure in the status box instead of raising,
        # but keep the traceback in the log.
        logger.exception("Error in file handler: %s", e)
        return f"❌ Error: {str(e)}", "", "", None
59
+
60
async def process_url_handler(url):
    """Process a web page and build the values for the URL tab outputs.

    Args:
        url: Text entered in the URL box. ``None``, an empty string, or a
            whitespace-only string is treated as "no URL given". Surrounding
            whitespace is removed before the URL is processed.

    Returns:
        A 4-tuple ``(status, summary, tags, result_card)``. On success the
        last item is a ``gr.update`` that reveals the Markdown card; on any
        failure the summary and tags are empty strings and the card is
        ``None``.
    """
    url = (url or "").strip()
    if not url:
        return "Please enter a URL", "", "", None

    try:
        result = await mcp_tools.process_web_content(url)

        if not result.get("success"):
            return f"❌ Error: {result.get('error', 'Unknown error')}", "", "", None

        # Optional keys are read with defaults so that a successful result
        # missing one of them is not reported to the user as a failure.
        return (
            f"✅ Successfully processed: {url}",
            result.get("summary", ""),
            ", ".join(result.get("tags") or []),
            gr.update(visible=True, value=create_result_card(result)),
        )
    except Exception as e:
        # UI boundary: show the failure in the status box instead of raising,
        # but keep the traceback in the log.
        logger.exception("Error in URL handler: %s", e)
        return f"❌ Error: {str(e)}", "", "", None
83
+
84
async def search_handler(query):
    """Run a semantic search and render the hits as Markdown.

    Args:
        query: Search text from the UI. ``None``, an empty string, or a
            whitespace-only string is treated as "no query".

    Returns:
        A 2-tuple ``(markdown, status)``. ``markdown`` is a single string
        (one card per hit, empty when there is nothing to show) because the
        value is bound to a Markdown output, which takes text rather than a
        list. ``status`` is a short human-readable message.

    Side effects:
        On a non-empty result set, stores the raw results in the module-level
        ``current_results``.
    """
    global current_results

    if not query or not query.strip():
        return "", "Please enter a search query"

    try:
        results = await mcp_tools.search_knowledge_base(query, limit=10)

        if not results:
            return "", "No results found"

        cards = []
        for result in results:
            # ``or`` guards against keys that are present but set to None
            tags = ", ".join(result.get("tags") or [])
            score = result.get("relevance_score") or 0
            cards.append(
                f"### 📄 {result.get('source', 'Unknown Source')}\n"
                f"**Tags:** {tags}\n"
                "\n"
                f"**Summary:** {result.get('summary', 'No summary available')}\n"
                "\n"
                f"**Relevance:** {score:.2%}\n"
                "\n"
                "---\n"
            )

        current_results = results

        return "\n".join(cards), f"Found {len(results)} results"
    except Exception as e:
        # UI boundary: show the failure in the status box instead of raising,
        # but keep the traceback in the log.
        logger.exception("Error in search: %s", e)
        return "", f"Error: {str(e)}"
119
+
120
def create_result_card(result: Dict[str, Any]) -> str:
    """Render a processing result as a Markdown summary card.

    Args:
        result: Result mapping. The keys read are ``doc_id``, ``file_name``
            (falling back to ``url``), ``tags``, ``summary`` and
            ``chunks_processed``; every one of them is optional.

    Returns:
        Markdown text with one bold-labelled line per field. Missing fields
        are shown with placeholder values.
    """
    source = result.get('file_name', result.get('url', 'Unknown'))
    # ``or []`` covers a ``tags`` key that is present but None, which would
    # otherwise make str.join raise TypeError.
    tags = ', '.join(result.get('tags') or [])
    return f"""
### 📋 Processing Complete

**Document ID:** {result.get('doc_id', 'N/A')}

**Source:** {source}

**Tags:** {tags}

**Summary:** {result.get('summary', 'No summary available')}

**Chunks Processed:** {result.get('chunks_processed', 0)}
"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
135
 
136
+ # Create Gradio interface
137
+ with gr.Blocks(title="Intelligent Content Organizer - MCP Agent") as demo:
138
+ gr.Markdown("""
139
+ # 🧠 Intelligent Content Organizer
140
+ ### MCP-Powered Knowledge Management System
141
+
142
+ This AI-driven system automatically organizes, enriches, and retrieves your digital content.
143
+ Upload files or provide URLs to build your personal knowledge base with automatic tagging and semantic search.
144
+
145
+ ---
146
+ """)
147
+
148
+ with gr.Tabs():
149
+ # File Processing Tab
150
+ with gr.TabItem("📁 Process Files"):
151
+ with gr.Row():
152
+ with gr.Column():
153
+ file_input = gr.File(
154
+ label="Upload Document",
155
+ file_types=[".pdf", ".txt", ".docx", ".doc", ".html", ".md", ".csv", ".json"]
156
+ )
157
+ file_process_btn = gr.Button("Process File", variant="primary")
158
+
159
+ with gr.Column():
160
+ file_status = gr.Textbox(label="Status", lines=1)
161
+ file_summary = gr.Textbox(label="Generated Summary", lines=3)
162
+ file_tags = gr.Textbox(label="Generated Tags", lines=1)
163
+
164
+ file_result = gr.Markdown(visible=False)
165
+
166
+ # URL Processing Tab
167
+ with gr.TabItem("🌐 Process URLs"):
168
+ with gr.Row():
169
+ with gr.Column():
170
+ url_input = gr.Textbox(
171
+ label="Enter URL",
172
+ placeholder="https://example.com/article"
173
+ )
174
+ url_process_btn = gr.Button("Process URL", variant="primary")
175
+
176
+ with gr.Column():
177
+ url_status = gr.Textbox(label="Status", lines=1)
178
+ url_summary = gr.Textbox(label="Generated Summary", lines=3)
179
+ url_tags = gr.Textbox(label="Generated Tags", lines=1)
180
+
181
+ url_result = gr.Markdown(visible=False)
182
+
183
+ # Search Tab
184
+ with gr.TabItem("🔍 Semantic Search"):
185
+ search_input = gr.Textbox(
186
+ label="Search Query",
187
+ placeholder="Enter your search query...",
188
+ lines=1
189
+ )
190
+ search_btn = gr.Button("Search", variant="primary")
191
+ search_status = gr.Textbox(label="Status", lines=1)
192
+
193
+ search_results = gr.Markdown(label="Search Results")
194
+
195
+ # MCP Server Info Tab
196
+ with gr.TabItem("ℹ️ MCP Server Info"):
197
+ gr.Markdown("""
198
+ ### MCP Server Configuration
199
+
200
+ This Gradio app also functions as an MCP (Model Context Protocol) server, allowing integration with:
201
+ - Claude Desktop
202
+ - Cursor
203
+ - Other MCP-compatible clients
204
+
205
+ **Server Name:** intelligent-content-organizer
206
+
207
+ **Available Tools:**
208
+ - `process_file`: Process local files and extract content
209
+ - `process_url`: Fetch and process web content
210
+ - `semantic_search`: Search across stored documents
211
+ - `get_document_summary`: Get detailed document information
212
+
213
+ **To use as MCP server:**
214
+ 1. Add this server to your MCP client configuration
215
+ 2. Use the tools listed above to interact with your knowledge base
216
+ 3. All processed content is automatically indexed for semantic search
217
+
218
+ **Tags:** mcp-server-track
219
+ """)
220
+
221
+ # Event handlers: each lambda drives its async handler to completion with asyncio.run
222
+ file_process_btn.click(
223
+ fn=lambda x: asyncio.run(process_file_handler(x)),
224
+ inputs=[file_input],
225
+ outputs=[file_status, file_summary, file_tags, file_result]
226
+ )
227
+
228
+ url_process_btn.click(
229
+ fn=lambda x: asyncio.run(process_url_handler(x)),
230
+ inputs=[url_input],
231
+ outputs=[url_status, url_summary, url_tags, url_result]
232
+ )
233
+
234
+ search_btn.click(
235
+ fn=lambda x: asyncio.run(search_handler(x)),
236
+ inputs=[search_input],
237
+ outputs=[search_results, search_status]
238
+ )
239
+
240
+ # Launch configuration
241
if __name__ == "__main__":
    import sys

    if "--mcp" in sys.argv:
        # Run as MCP server
        if not MCP_AVAILABLE:
            sys.exit("MCP server not available: mcp_server could not be imported")

        # NOTE(review): the definition of mcp lives in mcp_server.py and is
        # not visible from here. If run() is a blocking synchronous call it
        # returns None once the server stops, and handing None to
        # asyncio.run() raises. Only drive the result when it really is a
        # coroutine - confirm against the server implementation.
        outcome = mcp.run()
        if asyncio.iscoroutine(outcome):
            asyncio.run(outcome)
    else:
        # Run as Gradio app
        demo.launch(
            server_name="0.0.0.0",
            share=False,
            show_error=True,
        )
config.py CHANGED
@@ -1,7 +1,124 @@
1
- # config.py
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  import os
3
  from dotenv import load_dotenv
4
- load_dotenv() # loads from .env if present
5
- MISTRAL_API_KEY = os.environ.get("MISTRAL_API_KEY")
6
- CLAUDE_API_KEY = os.environ.get("CLAUDE_API_KEY")
7
- BRAVE_API_KEY = os.environ.get("BRAVE_API_KEY")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # import os
2
+ # from dotenv import load_dotenv
3
+
4
+ # # Load environment variables
5
+ # load_dotenv()
6
+
7
+ # class Config:
8
+ # """Configuration management for API keys and settings"""
9
+
10
+ # # API Keys (from environment variables)
11
+ # MISTRAL_API_KEY = os.getenv("MISTRAL_API_KEY", "")
12
+ # BRAVE_API_KEY = os.getenv("BRAVE_API_KEY", "")
13
+ # UNSTRUCTURED_API_KEY = os.getenv("UNSTRUCTURED_API_KEY", "")
14
+ # ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY", "")
15
+
16
+ # # ChromaDB Settings
17
+ # CHROMA_DB_PATH = os.getenv("CHROMA_DB_PATH", "./chroma_db")
18
+ # CHROMA_COLLECTION_NAME = "knowledge_base"
19
+
20
+ # # MCP Server Settings
21
+ # MCP_SERVER_NAME = "intelligent-content-organizer"
22
+ # MCP_SERVER_VERSION = "1.0.0"
23
+
24
+ # # Processing Settings
25
+ # MAX_FILE_SIZE_MB = 50
26
+ # SUPPORTED_FILE_TYPES = [
27
+ # ".pdf", ".txt", ".docx", ".doc", ".html", ".md",
28
+ # ".csv", ".json", ".xml", ".epub", ".rtf"
29
+ # ]
30
+
31
+ # # Model Settings
32
+ # MISTRAL_MODEL = "mistral-small-latest"
33
+ # CLAUDE_MODEL = "claude-3-haiku-20240307"
34
+ # EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
35
+
36
+ # @classmethod
37
+ # def validate(cls):
38
+ # """Validate that all required API keys are set"""
39
+ # missing_keys = []
40
+ # if not cls.MISTRAL_API_KEY:
41
+ # missing_keys.append("MISTRAL_API_KEY")
42
+ # if not cls.BRAVE_API_KEY:
43
+ # missing_keys.append("BRAVE_API_KEY")
44
+ # if not cls.UNSTRUCTURED_API_KEY:
45
+ # missing_keys.append("UNSTRUCTURED_API_KEY")
46
+
47
+ # if missing_keys:
48
+ # raise ValueError(f"Missing required API keys: {', '.join(missing_keys)}")
49
+
50
+ # return True
51
+
52
+
53
  import os
54
  from dotenv import load_dotenv
55
+
56
+ # Load environment variables
57
+ load_dotenv()
58
+
59
class Config:
    """Configuration management for API keys and settings.

    All values are class attributes resolved once, when the class body is
    executed; environment changes made afterwards are not picked up.
    """

    # API keys. Both are optional: validate() only warns when one is missing.
    # Values are stripped so a whitespace-only entry counts as "not set".
    MISTRAL_API_KEY = os.getenv("MISTRAL_API_KEY", "").strip()
    ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY", "").strip()

    # ChromaDB settings
    CHROMA_DB_PATH = os.getenv("CHROMA_DB_PATH", "./chroma_db")
    CHROMA_COLLECTION_NAME = "knowledge_base"

    # MCP server settings
    MCP_SERVER_NAME = "intelligent-content-organizer"
    MCP_SERVER_VERSION = "1.0.0"
    MCP_SERVER_DESCRIPTION = "AI-powered knowledge management with automatic tagging and semantic search"

    # Processing settings
    MAX_FILE_SIZE_MB = 50
    SUPPORTED_FILE_TYPES = [
        ".pdf", ".txt", ".docx", ".doc", ".html", ".htm",
        ".md", ".csv", ".json", ".xml", ".rtf",
    ]

    # Model settings
    MISTRAL_MODEL = "mistral-small-latest"
    CLAUDE_MODEL = "claude-3-haiku-20240307"
    EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"

    # Feature flags - each is on only when the matching API key is set
    USE_MISTRAL_FOR_TAGS = bool(MISTRAL_API_KEY)
    USE_CLAUDE_FOR_SUMMARY = bool(ANTHROPIC_API_KEY)

    # Free alternatives settings
    ENABLE_FREE_FALLBACKS = True

    @classmethod
    def validate(cls) -> bool:
        """Print a report on the API key configuration.

        Missing keys are reported as warnings on stdout, never as errors.

        Returns:
            Always ``True``.
        """
        warnings = []

        if not cls.MISTRAL_API_KEY:
            warnings.append("MISTRAL_API_KEY not set - will use free tag generation")

        if not cls.ANTHROPIC_API_KEY:
            warnings.append("ANTHROPIC_API_KEY not set - will use free summarization")

        if warnings:
            print("⚠️ Configuration warnings:")
            for warning in warnings:
                print(f" - {warning}")
            print("\n✅ The app will still work using free alternatives!")
        else:
            print("✅ All API keys configured")

        return True

    @classmethod
    def get_status(cls) -> dict:
        """Return a snapshot of the configuration for display.

        Returns:
            A dict with the keys ``mistral_configured``,
            ``anthropic_configured``, ``free_fallbacks_enabled``,
            ``supported_formats`` and ``embedding_model``.
        """
        return {
            "mistral_configured": bool(cls.MISTRAL_API_KEY),
            "anthropic_configured": bool(cls.ANTHROPIC_API_KEY),
            "free_fallbacks_enabled": cls.ENABLE_FREE_FALLBACKS,
            # Copy so callers cannot mutate the shared class-level list
            "supported_formats": list(cls.SUPPORTED_FILE_TYPES),
            "embedding_model": cls.EMBEDDING_MODEL,
        }
core/__init__.py DELETED
File without changes
core/agent.py DELETED
@@ -1,17 +0,0 @@
1
- import json
2
- from core.storage import search_documents
3
- # For Q&A we can use a simple retrieval + QA pipeline (stubbed here)
4
- # In a real app, you might use LangChain or a HuggingFace question-answering model.
5
-
6
- def answer_question(question: str) -> str:
7
- """
8
- Agent: retrieve relevant docs and answer the question.
9
- """
10
- # Retrieve top documents
11
- results = search_documents(question, top_k=3)
12
- doc_texts = results.get("documents", [[]])[0]
13
- combined = " ".join(doc_texts)
14
- # Stub: just echo the question and number of docs
15
- if not combined.strip():
16
- return "No relevant documents found."
17
- return f"Answered question: '{question}' (based on {len(doc_texts)} documents)."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
core/ai_enrichment.py DELETED
@@ -1,41 +0,0 @@
1
- # core/ai_enrichment.py
2
-
3
- from mistralai import Mistral
4
- import config
5
-
6
- def generate_tags(text: str) -> list[str]:
7
- """
8
- Use Mistral AI to generate 5-7 relevant tags for the text.
9
- """
10
- with Mistral(api_key=config.MISTRAL_API_KEY) as client:
11
- response = client.chat.complete(
12
- model="mistral-small-latest",
13
- messages=[{
14
- "role": "user",
15
- "content": f"Generate 5-7 relevant tags (comma-separated) for the following text:\n\n{text}"
16
- }]
17
- )
18
- try:
19
- content = response["choices"][0]["message"]["content"]
20
- except (KeyError, IndexError):
21
- return []
22
- tags = [tag.strip() for tag in content.split(",") if tag.strip()]
23
- return tags
24
-
25
- def summarize_text(text: str) -> str:
26
- """
27
- Use Mistral AI to generate a concise summary of the text.
28
- """
29
- with Mistral(api_key=config.MISTRAL_API_KEY) as client:
30
- response = client.chat.complete(
31
- model="mistral-small-latest",
32
- messages=[{
33
- "role": "user",
34
- "content": f"Summarize the following text in a concise manner:\n\n{text}"
35
- }]
36
- )
37
- try:
38
- summary = response["choices"][0]["message"]["content"].strip()
39
- except (KeyError, IndexError):
40
- return ""
41
- return summary
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
core/components.py DELETED
@@ -1,23 +0,0 @@
1
- import gradio as gr
2
-
3
- class DocumentViewer(gr.components.Component):
4
- """
5
- Custom Gradio component for document preview and tag editing.
6
- (Stub implementation)
7
- """
8
- def __init__(self, label=None):
9
- super().__init__(label=label, value=None)
10
- self.visible = True
11
- self.interactive = False
12
-
13
- def preprocess(self, x):
14
- # Input is a file path (or object); just return as-is
15
- return x
16
-
17
- def postprocess(self, x):
18
- # x is the raw document text; display first few lines as preview
19
- if not x:
20
- return ""
21
- lines = x.splitlines()
22
- preview = "\n".join(lines[:10])
23
- return preview
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
core/components.pyi DELETED
@@ -1,29 +0,0 @@
1
- import gradio as gr
2
- from gradio.events import Dependency
3
-
4
- class DocumentViewer(gr.components.Component):
5
- """
6
- Custom Gradio component for document preview and tag editing.
7
- (Stub implementation)
8
- """
9
- def __init__(self, label=None):
10
- super().__init__(label=label, value=None)
11
- self.visible = True
12
- self.interactive = False
13
-
14
- def preprocess(self, x):
15
- # Input is a file path (or object); just return as-is
16
- return x
17
-
18
- def postprocess(self, x):
19
- # x is the raw document text; display first few lines as preview
20
- if not x:
21
- return ""
22
- lines = x.splitlines()
23
- preview = "\n".join(lines[:10])
24
- return preview
25
- from typing import Callable, Literal, Sequence, Any, TYPE_CHECKING
26
- from gradio.blocks import Block
27
- if TYPE_CHECKING:
28
- from gradio.components import Timer
29
- from gradio.components.base import Component
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
core/database.py DELETED
@@ -1,81 +0,0 @@
1
- # core/database.py
2
-
3
- import chromadb
4
- from chromadb.config import Settings
5
- from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction
6
- import config
7
-
8
- def init_chroma():
9
- """
10
- Initialize a ChromaDB client and collection with an embedding function.
11
- Uses OpenAI embeddings if API key is available, otherwise a dummy embedding.
12
- """
13
- # Initialize Chroma client (in-memory by default)
14
- client = chromadb.Client(Settings())
15
-
16
- # Determine embedding function
17
- embedding_fn = None
18
- try:
19
- openai_key = config.OPENAI_API_KEY
20
- except AttributeError:
21
- openai_key = None
22
-
23
- if openai_key:
24
- embedding_fn = OpenAIEmbeddingFunction(
25
- api_key=openai_key,
26
- model_name="text-embedding-ada-002"
27
- )
28
- else:
29
- # Dummy embedding: one-dimensional embedding based on text length
30
- class DummyEmbedding:
31
- def __call__(self, texts):
32
- return [[float(len(text))] for text in texts]
33
- embedding_fn = DummyEmbedding()
34
-
35
- # Create or get collection named "documents"
36
- collection = client.get_or_create_collection(
37
- name="documents",
38
- embedding_function=embedding_fn
39
- )
40
- return collection
41
-
42
- def add_document(collection, doc_id: str, text: str, tags: list[str], summary: str, source: str):
43
- """
44
- Add a document to the ChromaDB collection with metadata.
45
- """
46
- metadata = {"tags": tags, "summary": summary, "source": source}
47
- # Add document (Chroma will generate embeddings using the collection's embedding function)
48
- collection.add(
49
- ids=[doc_id],
50
- documents=[text],
51
- metadatas=[metadata]
52
- )
53
-
54
- def search_documents(collection, query: str, top_n: int = 5) -> list[dict]:
55
- """
56
- Search for semantically similar documents in the collection.
57
- Returns top N results with their metadata.
58
- """
59
- results = collection.query(
60
- query_texts=[query],
61
- n_results=top_n,
62
- include=["metadatas", "documents", "distances"]
63
- )
64
- hits = []
65
- # Extract the results from the Chroma query response
66
- ids = results.get("ids", [[]])[0]
67
- documents = results.get("documents", [[]])[0]
68
- metadatas = results.get("metadatas", [[]])[0]
69
- distances = results.get("distances", [[]])[0]
70
-
71
- for i, doc_id in enumerate(ids):
72
- hit = {
73
- "id": doc_id,
74
- "score": distances[i] if i < len(distances) else None,
75
- "source": metadatas[i].get("source") if i < len(metadatas) else None,
76
- "tags": metadatas[i].get("tags") if i < len(metadatas) else None,
77
- "summary": metadatas[i].get("summary") if i < len(metadatas) else None,
78
- "document": documents[i] if i < len(documents) else None
79
- }
80
- hits.append(hit)
81
- return hits
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
core/parser.py DELETED
@@ -1,30 +0,0 @@
1
- import requests
2
- from bs4 import BeautifulSoup
3
- from unstructured.partition.auto import partition
4
-
5
- def parse_document(file_path: str) -> str:
6
- """
7
- Parse a document file (PDF, DOCX, TXT, etc.) into text using Unstructured.
8
- """
9
- try:
10
- elements = partition(file_path)
11
- # Combine text elements into a single string
12
- text = "\n".join([elem.text for elem in elements if elem.text])
13
- return text
14
- except Exception as e:
15
- return f"Error parsing document: {e}"
16
-
17
- def parse_url(url: str) -> str:
18
- """
19
- Fetch and parse webpage content at the given URL.
20
- """
21
- try:
22
- headers = {"User-Agent": "Mozilla/5.0"}
23
- response = requests.get(url, headers=headers, timeout=10)
24
- soup = BeautifulSoup(response.text, 'html.parser')
25
- # Extract visible text from paragraphs
26
- paragraphs = soup.find_all(['p', 'h1', 'h2', 'h3', 'li'])
27
- text = "\n".join([p.get_text() for p in paragraphs])
28
- return text
29
- except Exception as e:
30
- return f"Error fetching URL: {e}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
core/processing.py DELETED
@@ -1,42 +0,0 @@
1
- # core/processing.py
2
-
3
- import requests
4
- from unstructured.partition.html import partition_html
5
- from unstructured.partition.auto import partition
6
- import config
7
-
8
- def fetch_web_content(url: str) -> str:
9
- """
10
- Fetch and parse web content from the given URL into structured text.
11
- """
12
- try:
13
- # Use Unstructured to fetch and parse HTML content directly from the URL
14
- elements = partition_html(url=url)
15
- text = "\n\n".join([elem.text for elem in elements if hasattr(elem, 'text') and elem.text])
16
- return text
17
- except Exception:
18
- # If Unstructured parsing fails, attempt a simple HTTP GET as a fallback
19
- try:
20
- response = requests.get(url)
21
- response.raise_for_status()
22
- html_text = response.text
23
- # Attempt parsing the fetched HTML text
24
- elements = partition(filename=None, file=html_text)
25
- text = "\n\n".join([elem.text for elem in elements if hasattr(elem, 'text') and elem.text])
26
- return text
27
- except Exception:
28
- # On failure, return empty string
29
- return ""
30
-
31
- def parse_local_file(file_path: str) -> str:
32
- """
33
- Parse a local file into structured text using the Unstructured library.
34
- Supports various file formats (e.g., PDF, DOCX, TXT).
35
- """
36
- try:
37
- elements = partition(filename=file_path)
38
- text = "\n\n".join([elem.text for elem in elements if hasattr(elem, 'text') and elem.text])
39
- return text
40
- except Exception:
41
- # Return empty string on failure
42
- return ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
core/summarizer.py DELETED
@@ -1,25 +0,0 @@
1
- def summarize_content(text: str) -> str:
2
- """
3
- Generate a summary of the text. (This is a stub simulating a Claude 3 Haiku call.)
4
- """
5
- # In a real app, you might call the Anthropic Claude 3 API here.
6
- # We'll return the first 100 characters as a "summary".
7
- summary = text.strip().replace("\n", " ")
8
- summary = summary[:100] + ("..." if len(summary) > 100 else "")
9
- return f"Summary: {summary}"
10
-
11
- def tag_content(text: str) -> list:
12
- """
13
- Generate tags for the text. (This is a stub simulating a Mistral 7B call.)
14
- """
15
- # In a real app, you might call a tag-generation model or use embeddings.
16
- # We'll simulate by picking some keywords.
17
- common_words = ["data", "analysis", "python", "research", "AI"]
18
- tags = []
19
- lower = text.lower()
20
- for word in common_words:
21
- if word in lower:
22
- tags.append(word)
23
- if not tags:
24
- tags = ["general"]
25
- return tags
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
core/utils.py DELETED
@@ -1,23 +0,0 @@
1
- # core/utils.py
2
-
3
- import re
4
- from datetime import datetime
5
- import hashlib
6
-
7
- def clean_text(text: str) -> str:
8
- """
9
- Clean and normalize text by removing extra whitespace.
10
- """
11
- if not text:
12
- return ""
13
- # Collapse multiple whitespace into single spaces and strip ends
14
- cleaned = re.sub(r'\s+', ' ', text)
15
- return cleaned.strip()
16
-
17
- def generate_doc_id(source: str) -> str:
18
- """
19
- Generate a unique document ID based on source identifier and timestamp.
20
- """
21
- timestamp = datetime.now().isoformat()
22
- raw_id = f"{source}-{timestamp}"
23
- return hashlib.md5(raw_id.encode()).hexdigest()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/article_url.txt DELETED
File without changes
data/document1.pdf DELETED
File without changes
data/sample_note.txt DELETED
File without changes
mcp_server.py ADDED
@@ -0,0 +1,219 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # from mcp.server.fastmcp import FastMCP
2
+ # import json
3
+ # from typing import Dict, List, Any
4
+ # import logging
5
+
6
+ # # Set up logging
7
+ # logging.basicConfig(level=logging.INFO)
8
+ # logger = logging.getLogger(__name__)
9
+
10
+ # # Initialize MCP server
11
+ # mcp = FastMCP("intelligent-content-organizer")
12
+
13
+ # @mcp.tool()
14
+ # async def process_file(file_path: str) -> Dict[str, Any]:
15
+ # """
16
+ # Process a local file and extract content, generate tags, and create embeddings
17
+
18
+ # Args:
19
+ # file_path: Path to the file to process
20
+
21
+ # Returns:
22
+ # Dictionary containing processed content, tags, and metadata
23
+ # """
24
+ # try:
25
+ # from mcp_tools import process_local_file
26
+ # result = await process_local_file(file_path)
27
+ # return result
28
+ # except Exception as e:
29
+ # logger.error(f"Error processing file: {str(e)}")
30
+ # return {"error": str(e)}
31
+
32
+ # @mcp.tool()
33
+ # async def process_url(url: str) -> Dict[str, Any]:
34
+ # """
35
+ # Fetch and process content from a URL
36
+
37
+ # Args:
38
+ # url: URL to fetch and process
39
+
40
+ # Returns:
41
+ # Dictionary containing processed content, tags, and metadata
42
+ # """
43
+ # try:
44
+ # from mcp_tools import process_web_content
45
+ # result = await process_web_content(url)
46
+ # return result
47
+ # except Exception as e:
48
+ # logger.error(f"Error processing URL: {str(e)}")
49
+ # return {"error": str(e)}
50
+
51
+ # @mcp.tool()
52
+ # async def semantic_search(query: str, limit: int = 5) -> List[Dict[str, Any]]:
53
+ # """
54
+ # Perform semantic search across stored documents
55
+
56
+ # Args:
57
+ # query: Search query
58
+ # limit: Maximum number of results to return
59
+
60
+ # Returns:
61
+ # List of relevant documents with metadata
62
+ # """
63
+ # try:
64
+ # from mcp_tools import search_knowledge_base
65
+ # results = await search_knowledge_base(query, limit)
66
+ # return results
67
+ # except Exception as e:
68
+ # logger.error(f"Error performing search: {str(e)}")
69
+ # return [{"error": str(e)}]
70
+
71
+ # @mcp.tool()
72
+ # async def get_document_summary(doc_id: str) -> Dict[str, Any]:
73
+ # """
74
+ # Get summary and metadata for a specific document
75
+
76
+ # Args:
77
+ # doc_id: Document ID in the knowledge base
78
+
79
+ # Returns:
80
+ # Document summary and metadata
81
+ # """
82
+ # try:
83
+ # from mcp_tools import get_document_details
84
+ # result = await get_document_details(doc_id)
85
+ # return result
86
+ # except Exception as e:
87
+ # logger.error(f"Error getting document summary: {str(e)}")
88
+ # return {"error": str(e)}
89
+
90
+ # # Server metadata
91
+ # @mcp.resource("server_info")
92
+ # async def get_server_info() -> Dict[str, Any]:
93
+ # """Get information about this MCP server"""
94
+ # return {
95
+ # "name": "Intelligent Content Organizer",
96
+ # "version": "1.0.0",
97
+ # "description": "AI-powered knowledge management system with automatic tagging and semantic search",
98
+ # "capabilities": [
99
+ # "File processing (20+ formats)",
100
+ # "Web content extraction",
101
+ # "Automatic tagging",
102
+ # "Semantic search",
103
+ # "Document summarization"
104
+ # ]
105
+ # }
106
+
107
+ # if __name__ == "__main__":
108
+ # # Run the MCP server
109
+ # import asyncio
110
+ # asyncio.run(mcp.run())
111
+
112
+ from mcp.server.fastmcp import FastMCP
113
+ import json
114
+ from typing import Dict, List, Any
115
+ import logging
116
+
117
+ # Set up logging
118
+ logging.basicConfig(level=logging.INFO)
119
+ logger = logging.getLogger(__name__)
120
+
121
+ # Initialize MCP server
122
+ mcp = FastMCP("intelligent-content-organizer")
123
+
124
@mcp.tool()
async def process_file(file_path: str) -> Dict[str, Any]:
    """
    Process a local file and extract content, generate tags, and create embeddings

    Args:
        file_path: Path to the file to process

    Returns:
        The dictionary produced by ``mcp_tools.process_local_file``, or
        ``{"error": <message>}`` when anything raises.
    """
    try:
        # Imported inside the try block so that a failure to import
        # mcp_tools is reported as a tool error like any other failure.
        from mcp_tools import process_local_file
        return await process_local_file(file_path)
    except Exception as exc:
        detail = str(exc)
        logger.error(f"Error processing file: {detail}")
        return {"error": detail}
136
+
137
@mcp.tool()
async def process_url(url: str) -> Dict[str, Any]:
    """
    Fetch and process content from a URL

    Args:
        url: URL to fetch and process

    Returns:
        The dictionary produced by ``mcp_tools.process_web_content``, or
        ``{"error": <message>}`` when anything raises.
    """
    try:
        # Imported inside the try block so that a failure to import
        # mcp_tools is reported as a tool error like any other failure.
        from mcp_tools import process_web_content
        return await process_web_content(url)
    except Exception as exc:
        detail = str(exc)
        logger.error(f"Error processing URL: {detail}")
        return {"error": detail}
149
+
150
@mcp.tool()
async def semantic_search(query: str, limit: int = 5) -> List[Dict[str, Any]]:
    """
    Perform semantic search across stored documents

    Args:
        query: Search query
        limit: Maximum number of results to return

    Returns:
        The list produced by ``mcp_tools.search_knowledge_base``, or a
        one-element list ``[{"error": <message>}]`` when anything raises.
    """
    try:
        # Imported inside the try block so that a failure to import
        # mcp_tools is reported as a tool error like any other failure.
        from mcp_tools import search_knowledge_base
        return await search_knowledge_base(query, limit)
    except Exception as exc:
        detail = str(exc)
        logger.error(f"Error performing search: {detail}")
        return [{"error": detail}]
162
+
163
@mcp.tool()
async def get_document_summary(doc_id: str) -> Dict[str, Any]:
    """
    Get summary and metadata for a specific document

    Args:
        doc_id: Document ID in the knowledge base

    Returns:
        The dictionary produced by ``mcp_tools.get_document_details``, or
        ``{"error": <message>}`` when anything raises.
    """
    try:
        # Imported inside the try block so that a failure to import
        # mcp_tools is reported as a tool error like any other failure.
        from mcp_tools import get_document_details
        return await get_document_details(doc_id)
    except Exception as exc:
        detail = str(exc)
        logger.error(f"Error getting document summary: {detail}")
        return {"error": detail}
175
+
176
@mcp.tool()
async def get_server_info() -> Dict[str, Any]:
    """
    Get information about this MCP server

    Returns:
        A static dictionary with the server name, version, description, a
        list of capability labels, and one name/description entry per tool.
    """
    capabilities = [
        "File processing (20+ formats)",
        "Web content extraction",
        "Automatic tagging",
        "Semantic search",
        "Document summarization",
    ]
    # (tool name, description) pairs, expanded into dictionaries below.
    tool_catalog = (
        ("process_file", "Process local files and extract content"),
        ("process_url", "Fetch and process web content"),
        ("semantic_search", "Search across stored documents"),
        ("get_document_summary", "Get document details"),
        ("get_server_info", "Get server information"),
    )
    return {
        "name": "Intelligent Content Organizer",
        "version": "1.0.0",
        "description": "AI-powered knowledge management system with automatic tagging and semantic search",
        "capabilities": capabilities,
        "tools": [
            {"name": tool_name, "description": tool_description}
            for tool_name, tool_description in tool_catalog
        ],
    }
215
+
216
if __name__ == "__main__":
    # Run the MCP server.
    # FastMCP.run() is a blocking, synchronous entry point that manages its
    # own event loop and returns None, so it must not be wrapped in
    # asyncio.run(), which expects a coroutine object.
    mcp.run()
mcp_tools.py CHANGED
@@ -1,122 +1,592 @@
1
- # # mcp_tools.py
2
-
3
- # from fastmcp import FastMCP
4
- # import core.processing as processing
5
- # import core.ai_enrichment as ai_enrichment
6
- # import core.database as db
7
- # import core.utils as utils
8
-
9
- # # Initialize the FastMCP server instance
10
- # mcp = FastMCP(name="IntelligentContentOrganizer")
11
-
12
- # # Initialize the ChromaDB collection (shared for all tools)
13
- # collection = db.init_chroma()
14
-
15
- # @mcp.tool()
16
- # def process_content(url: str) -> dict:
17
- # """
18
- # Process content from a web URL: fetch, enrich, and store.
19
- # Returns document ID, tags, summary, and source.
20
- # """
21
- # content = processing.fetch_web_content(url)
22
- # text = utils.clean_text(content)
23
- # tags = ai_enrichment.generate_tags(text) if text else []
24
- # summary = ai_enrichment.summarize_text(text) if text else ""
25
- # doc_id = utils.generate_doc_id(url)
26
- # # Add the document to the database collection
27
- # db.add_document(collection, doc_id, text, tags, summary, source=url)
28
- # return {"id": doc_id, "tags": tags, "summary": summary, "source": url}
29
-
30
- # @mcp.tool()
31
- # def upload_local_file(file_path: str) -> dict:
32
- # """
33
- # Process a local file: parse, enrich, and store.
34
- # Returns document ID, tags, summary, and source.
35
- # """
36
- # content = processing.parse_local_file(file_path)
37
- # text = utils.clean_text(content)
38
- # tags = ai_enrichment.generate_tags(text) if text else []
39
- # summary = ai_enrichment.summarize_text(text) if text else ""
40
- # doc_id = utils.generate_doc_id(file_path)
41
- # db.add_document(collection, doc_id, text, tags, summary, source=file_path)
42
- # return {"id": doc_id, "tags": tags, "summary": summary, "source": file_path}
43
-
44
- # @mcp.tool()
45
- # def semantic_search(query: str, top_n: int = 5) -> list:
46
- # """
47
- # Search for documents semantically similar to the query.
48
- # Returns top N results as a list of dictionaries.
49
- # """
50
- # results = db.search_documents(collection, query, top_n)
51
- # return results
52
-
53
-
54
- from fastmcp import FastMCP
55
- from core.parser import parse_document, parse_url
56
- from core.summarizer import summarize_content, tag_content
57
- from core.storage import add_document, search_documents
58
- from core.agent import answer_question
59
  import json
 
 
 
 
 
 
60
 
61
- mcp = FastMCP("IntelligentContentOrganizer_MCP")
62
-
63
@mcp.tool(name="parse_document")
def mcp_parse_document(file_path: str) -> str:
    """
    MCP tool: Parse a document file and return extracted text.

    Thin wrapper that forwards ``file_path`` to ``parse_document``.
    """
    return parse_document(file_path)
70
-
71
@mcp.tool(name="parse_url")
def mcp_parse_url(url: str) -> str:
    """
    MCP tool: Fetch and parse webpage content from a URL.

    Thin wrapper that forwards ``url`` to ``parse_url``.
    """
    return parse_url(url)
78
-
79
@mcp.tool(name="summarize")
def mcp_summarize(text: str) -> str:
    """
    MCP tool: Generate a summary of the provided text.

    Thin wrapper that forwards ``text`` to ``summarize_content``.
    """
    summary = summarize_content(text)
    return summary
85
-
86
@mcp.tool(name="tag")
def mcp_tag(text: str) -> str:
    """
    MCP tool: Generate tags for the provided text (JSON list).

    The tags returned by ``tag_content`` are serialized with ``json.dumps``.
    """
    return json.dumps(tag_content(text))
93
-
94
@mcp.tool(name="add_to_db")
def mcp_add_to_db(doc_id: str, text: str, metadata_json: str) -> str:
    """
    MCP tool: Add a document to ChromaDB with given ID and metadata (JSON).

    ``metadata_json`` is decoded with ``json.loads`` before being handed to
    ``add_document``; the return value is a confirmation message containing
    the document ID.
    """
    add_document(doc_id, text, json.loads(metadata_json))
    confirmation = "Document added with ID: " + doc_id
    return confirmation
102
-
103
@mcp.tool(name="search_db")
def mcp_search_db(query: str, top_k: int = 5) -> str:
    """
    MCP tool: Search documents using a query (semantic search). Returns JSON results.

    The results of ``search_documents`` are serialized with ``json.dumps``.
    """
    return json.dumps(search_documents(query, top_k=top_k))
110
-
111
@mcp.tool(name="answer_question")
def mcp_answer_question(question: str) -> str:
    """
    MCP tool: Answer a question using the agentic workflow.

    Thin wrapper that forwards ``question`` to ``answer_question``.
    """
    return answer_question(question)
118
-
119
- if __name__ == "__main__":
120
- # Run the MCP server (streamable HTTP transport, for web integration)
121
- mcp.run(transport="streamable-http", host="0.0.0.0", port=7861, path="/mcp")
122
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import aiohttp
3
+ import chromadb
4
+ from chromadb.utils import embedding_functions
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  import json
6
+ import logging
7
+ from typing import Dict, List, Any, Optional
8
+ from datetime import datetime
9
+ import hashlib
10
+ from pathlib import Path
11
+ import requests
12
 
13
+ # Document processing libraries (all free)
14
+ import PyPDF2
15
+ import docx
16
+ from bs4 import BeautifulSoup
17
+ import pandas as pd
18
+ import markdown
19
+ import xml.etree.ElementTree as ET
20
+ from newspaper import Article
21
+ import trafilatura
22
+ from duckduckgo_search import DDGS
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
 
24
+ # AI libraries
25
+ from config import Config
26
+ from mistralai.client import MistralClient
27
+ import anthropic
28
+
29
+ # Set up logging
30
+ logger = logging.getLogger(__name__)
31
+
32
# Initialize AI clients. Each client is left as None when its API key is not
# configured.
mistral_client = MistralClient(api_key=Config.MISTRAL_API_KEY) if Config.MISTRAL_API_KEY else None
anthropic_client = anthropic.Anthropic(api_key=Config.ANTHROPIC_API_KEY) if Config.ANTHROPIC_API_KEY else None

# Initialize ChromaDB with on-disk persistence and a sentence-transformer
# embedding function.
chroma_client = chromadb.PersistentClient(path=Config.CHROMA_DB_PATH)
embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name=Config.EMBEDDING_MODEL
)

# Get or create the collection in one call. This replaces a bare ``except:``
# around get_collection(), which treated every failure (not only "collection
# does not exist") as a signal to create a new collection.
collection = chroma_client.get_or_create_collection(
    name=Config.CHROMA_COLLECTION_NAME,
    embedding_function=embedding_function,
)
53
+
54
class DocumentProcessor:
    """Free document processing without Unstructured API.

    Each ``extract_text_from_*`` helper is best-effort: a failure is logged
    and reported as an empty string instead of being raised (the PDF helper
    returns whatever text was read before the failure).
    """

    @staticmethod
    def extract_text_from_pdf(file_path: str) -> str:
        """Extract text from PDF files, one newline-terminated block per page."""
        pages = []
        try:
            with open(file_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                for page in pdf_reader.pages:
                    # Guard against a page yielding no text object at all.
                    pages.append((page.extract_text() or "") + "\n")
        except Exception as e:
            logger.error(f"Error reading PDF: {e}")
        # Joined once at the end instead of repeated string concatenation.
        return "".join(pages)

    @staticmethod
    def extract_text_from_docx(file_path: str) -> str:
        """Extract text from DOCX files (paragraph texts joined by newlines)."""
        try:
            doc = docx.Document(file_path)
            return "\n".join(paragraph.text for paragraph in doc.paragraphs)
        except Exception as e:
            logger.error(f"Error reading DOCX: {e}")
            return ""

    @staticmethod
    def extract_text_from_html(file_path: str) -> str:
        """Extract visible text from HTML files, one phrase per line."""
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                soup = BeautifulSoup(file.read(), 'html.parser')
            # Remove script and style elements
            for tag in soup(["script", "style"]):
                tag.extract()
            text = soup.get_text()
            lines = (line.strip() for line in text.splitlines())
            # Break lines on runs of two spaces so that phrases (not single
            # words) end up on their own line; blank pieces are dropped.
            chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
            return '\n'.join(chunk for chunk in chunks if chunk)
        except Exception as e:
            logger.error(f"Error reading HTML: {e}")
            return ""

    @staticmethod
    def extract_text_from_txt(file_path: str) -> str:
        """Extract text from TXT files (read as UTF-8)."""
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                return file.read()
        except Exception as e:
            logger.error(f"Error reading TXT: {e}")
            return ""

    @staticmethod
    def extract_text_from_csv(file_path: str) -> str:
        """Extract text from CSV files as the DataFrame's string rendering."""
        try:
            df = pd.read_csv(file_path)
            return df.to_string()
        except Exception as e:
            logger.error(f"Error reading CSV: {e}")
            return ""

    @staticmethod
    def extract_text_from_json(file_path: str) -> str:
        """Extract text from JSON files, re-serialized with 2-space indentation."""
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                data = json.load(file)
            return json.dumps(data, indent=2)
        except Exception as e:
            logger.error(f"Error reading JSON: {e}")
            return ""

    @staticmethod
    def extract_text_from_markdown(file_path: str) -> str:
        """Extract text from Markdown files by rendering to HTML and stripping tags."""
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                md_text = file.read()
            html = markdown.markdown(md_text)
            soup = BeautifulSoup(html, 'html.parser')
            return soup.get_text()
        except Exception as e:
            logger.error(f"Error reading Markdown: {e}")
            return ""

    @staticmethod
    def extract_text_from_xml(file_path: str) -> str:
        """Extract all text content from XML files as space-separated pieces."""
        try:
            tree = ET.parse(file_path)
            root = tree.getroot()
            # itertext() yields both element text and tail text in document
            # order; reading only element.text would lose anything written
            # after a child element (e.g. " world" in "<a>hi <b>x</b> world</a>").
            pieces = (piece.strip() for piece in root.itertext())
            return " ".join(piece for piece in pieces if piece)
        except Exception as e:
            logger.error(f"Error reading XML: {e}")
            return ""

    @classmethod
    def extract_text(cls, file_path: str) -> str:
        """Extract text from any supported file type.

        The extractor is chosen by the lower-cased file extension; unknown
        extensions are read as plain text.
        """
        path = Path(file_path)
        extension = path.suffix.lower()

        extractors = {
            '.pdf': cls.extract_text_from_pdf,
            '.docx': cls.extract_text_from_docx,
            # NOTE(review): legacy .doc files are routed to the DOCX reader;
            # if that reader cannot parse them the result is "" (logged).
            '.doc': cls.extract_text_from_docx,
            '.html': cls.extract_text_from_html,
            '.htm': cls.extract_text_from_html,
            '.txt': cls.extract_text_from_txt,
            '.csv': cls.extract_text_from_csv,
            '.json': cls.extract_text_from_json,
            '.md': cls.extract_text_from_markdown,
            '.xml': cls.extract_text_from_xml,
        }

        extractor = extractors.get(extension, cls.extract_text_from_txt)
        return extractor(file_path)
183
+
184
def chunk_text(text: str, chunk_size: int = 1000, overlap: int = 100) -> List[str]:
    """Split text into chunks with overlap.

    Each chunk spans at most ``chunk_size`` characters. When a chunk is not
    the last one and contains a '.' or newline past its midpoint, it is cut
    just after the last such character so chunks tend to end on a sentence
    or line boundary. Consecutive chunks share ``overlap`` characters.

    Args:
        text: The text to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters repeated at the start of the next chunk.

    Returns:
        The stripped, non-empty chunks in order (empty list for empty text).

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    chunks = []
    start = 0
    text_length = len(text)

    while start < text_length:
        end = start + chunk_size
        chunk = text[start:end]

        # Try to find a sentence boundary
        if end < text_length:
            boundary = max(chunk.rfind('.'), chunk.rfind('\n'))
            if boundary > chunk_size // 2:
                end = start + boundary + 1
                chunk = text[start:end]

        chunk = chunk.strip()
        if chunk:  # whitespace-only windows carry no content
            chunks.append(chunk)

        # Stop once the end of the text has been emitted; stepping back by
        # ``overlap`` here would only produce a chunk that is already fully
        # contained in the previous one.
        if end >= text_length:
            break

        # Step back by the overlap, but always move forward: an overlap that
        # is too large for this chunk would otherwise loop forever.
        next_start = end - overlap
        start = next_start if next_start > start else end

    return chunks
208
+
209
def _fetch_with_newspaper(url: str) -> Optional[str]:
    """Return "<title>\\n\\n<text>" extracted by newspaper, or None if too short or failed."""
    try:
        article = Article(url)
        article.download()
        article.parse()

        content = f"{article.title}\n\n{article.text}"
        if len(content) > 100:  # Valid content
            return content
    except Exception as e:
        logger.debug(f"Newspaper failed: {e}")
    return None


def _fetch_with_trafilatura(url: str) -> Optional[str]:
    """Return the main text extracted by trafilatura, or None if too short or failed."""
    try:
        downloaded = trafilatura.fetch_url(url)
        if not downloaded:
            # Nothing was downloaded, so there is nothing to extract from.
            return None
        content = trafilatura.extract(downloaded)
        if content and len(content) > 100:
            return content
    except Exception as e:
        logger.debug(f"Trafilatura failed: {e}")
    return None


def _fetch_with_beautifulsoup(url: str) -> Optional[str]:
    """Return "<title>\\n\\n<text>" scraped with requests + BeautifulSoup, or None."""
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code != 200:
            return None

        soup = BeautifulSoup(response.text, 'html.parser')

        # Remove unwanted elements
        for element in soup(['script', 'style', 'nav', 'footer', 'header']):
            element.decompose()

        # Common content selectors, tried in order; the first hit wins.
        content_selectors = [
            'main', 'article', '[role="main"]',
            '.content', '#content', '.post', '.entry-content',
            '.article-body', '.story-body'
        ]
        main_content = None
        for selector in content_selectors:
            main_content = soup.select_one(selector)
            if main_content:
                break

        # Fall back to the whole body when no content container is found.
        if not main_content:
            main_content = soup.find('body')
        if not main_content:
            return None

        text = main_content.get_text(separator='\n', strip=True)
        title = soup.find('title')
        title_text = title.get_text() if title else "No title"
        return f"{title_text}\n\n{text}"

    except Exception as e:
        logger.error(f"BeautifulSoup failed: {e}")
    return None


async def fetch_web_content_free(url: str) -> Optional[str]:
    """Fetch content from URL using multiple free methods.

    The strategies are tried in order (newspaper, trafilatura, then a basic
    requests + BeautifulSoup scrape); the first one that yields content wins.

    Args:
        url: Address of the page to fetch.

    Returns:
        The extracted text, or None when every strategy fails.
    """
    # NOTE: the strategies perform blocking network calls, so this coroutine
    # does not yield to the event loop while a download is in progress.
    strategies = (
        _fetch_with_newspaper,
        _fetch_with_trafilatura,
        _fetch_with_beautifulsoup,
    )
    for strategy in strategies:
        content = strategy(url)
        if content:
            return content
    return None
278
+
279
async def search_web_free(query: str, num_results: int = 5) -> List[Dict[str, str]]:
    """Search the web using free methods (DuckDuckGo).

    Args:
        query: Search terms.
        num_results: Maximum number of results to request.

    Returns:
        A list of ``{'title', 'url', 'snippet'}`` dictionaries, or an empty
        list when the search fails (the failure is logged).
    """
    try:
        with DDGS() as ddgs:
            return [
                {
                    'title': r.get('title', ''),
                    # DDGS text results carry the address under 'href';
                    # reading only 'link' left every URL empty. 'link' is
                    # kept as a fallback for other result shapes.
                    'url': r.get('href') or r.get('link', ''),
                    'snippet': r.get('body', ''),
                }
                for r in ddgs.text(query, max_results=num_results)
            ]

    except Exception as e:
        logger.error(f"Search failed: {e}")
        return []
296
+
297
+ # In mcp_tools.py
298
+
299
async def generate_tags(content: str) -> List[str]:
    """Generate tags using Mistral AI or fallback to free method.

    Args:
        content: Text to tag; only the first 2000 characters are sent to the
            model.

    Returns:
        Up to 7 non-empty tags. The keyword-based ``generate_tags_free`` is
        used when no Mistral client is configured, when the model call
        fails, or when the model reply contains no usable tag.
    """
    try:
        if mistral_client:
            prompt = (
                "Analyze this content and generate 5-7 relevant tags.\n"
                "Return only the tags as a comma-separated list.\n\n"
                f"Content: {content[:2000]}...\n\n"
                "Tags:"
            )

            # For mistralai==0.4.2, pass messages as a list of dicts
            response = mistral_client.chat(
                model=Config.MISTRAL_MODEL,
                messages=[{"role": "user", "content": prompt}],
            )

            tags_text = response.choices[0].message.content.strip()
            # Drop empty entries produced by a blank reply or stray commas,
            # which would otherwise be stored as "" tags.
            tags = [tag.strip() for tag in tags_text.split(",") if tag.strip()]
            if tags:
                return tags[:7]

        # Free fallback: Extract keywords using frequency analysis
        return generate_tags_free(content)

    except Exception as e:
        logger.error(f"Error generating tags: {str(e)}")
        return generate_tags_free(content)
326
+
327
def generate_tags_free(content: str) -> List[str]:
    """Free tag generation using keyword extraction.

    Lower-cases the content, keeps purely alphabetic (a-z) words longer than
    four letters that are not stop words, and returns the seven most
    frequent ones. Returns ``["untagged"]`` when no keyword qualifies.
    """
    from collections import Counter
    import re

    # Common stop words
    stop_words = frozenset(
        "this that these those what which when where who whom whose why how "
        "with about against between into through during before after above "
        "below from down out off over under again further then once here "
        "there all both each few more most other some such only same than "
        "have has had been being does doing will would could should".split()
    )

    # Candidate words: runs of at least four ASCII letters in the lowered text.
    candidates = re.findall(r'\b[a-z]{4,}\b', content.lower())

    # Count the candidates that survive the stop-word and length filters.
    frequencies = Counter(
        word for word in candidates
        if len(word) > 4 and word not in stop_words
    )

    keywords = [word for word, _count in frequencies.most_common(7)]
    if not keywords:
        return ["untagged"]
    return keywords
355
+
356
async def generate_summary(content: str) -> str:
    """Generate summary using Claude or fallback to free method.

    Args:
        content: Text to summarize; only the first 4000 characters are sent
            to the model.

    Returns:
        The model's reply with surrounding whitespace removed, or the result
        of ``generate_summary_free`` when no Anthropic client is configured
        or the model call fails.
    """
    try:
        if not anthropic_client:
            # Free fallback
            return generate_summary_free(content)

        request = {
            "role": "user",
            "content": f"Summarize this content in 2-3 sentences:\n\n{content[:4000]}...",
        }
        message = anthropic_client.messages.create(
            model=Config.CLAUDE_MODEL,
            max_tokens=300,
            messages=[request],
        )
        return message.content[0].text.strip()

    except Exception as exc:
        logger.error(f"Error generating summary: {str(exc)}")
        return generate_summary_free(content)
377
+
378
def generate_summary_free(content: str) -> str:
    """Free summary generation using simple extraction.

    Joins the non-empty pieces among the first three '.'-separated segments
    with ". " and caps the result at 300 characters (297 plus an ellipsis).
    When that yields nothing, a "Content preview: " of the first 200
    characters is returned instead.
    """
    leading = (segment.strip() for segment in content.split('.')[:3])
    summary = '. '.join(segment for segment in leading if segment)

    if len(summary) > 300:
        summary = summary[:297] + "..."

    if summary:
        return summary
    return "Content preview: " + content[:200] + "..."
389
+
390
async def process_local_file(file_path: str) -> Dict[str, Any]:
    """Process a local file and store it in the knowledge base.

    Validates the path and extension, extracts the text, generates tags and
    a summary, splits the text into at most 10 chunks and adds them to the
    ChromaDB collection with shared metadata.

    Args:
        file_path: Path of the file to ingest.

    Returns:
        On success, a dictionary with ``success=True``, the new ``doc_id``,
        file name, tags, summary, number of chunks and the stored metadata.
        On any failure, ``{"success": False, "error": <message>}``.
    """
    try:
        # Validate file
        path = Path(file_path)
        if not path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")
        if path.suffix.lower() not in Config.SUPPORTED_FILE_TYPES:
            raise ValueError(f"Unsupported file type: {path.suffix}")

        # Extract text using free methods
        full_text = DocumentProcessor.extract_text(file_path)
        if not full_text:
            raise ValueError("No text could be extracted from the file")

        # The document ID hashes the file name together with the current
        # time, so processing the same file again creates a new document.
        id_seed = f"{path.name}_{datetime.now().isoformat()}"
        doc_id = hashlib.md5(id_seed.encode()).hexdigest()

        # Only a prefix of the text is used for tagging and summarizing.
        tags = await generate_tags(full_text[:3000])
        summary = await generate_summary(full_text[:5000])

        # Chunk the text; limit chunks for demo
        chunks = chunk_text(full_text, chunk_size=1000, overlap=100)[:10]
        chunk_ids = [f"{doc_id}_{index}" for index, _ in enumerate(chunks)]

        metadata = {
            "source": str(path),
            "file_name": path.name,
            "file_type": path.suffix,
            "processed_at": datetime.now().isoformat(),
            "tags": ", ".join(tags),
            "summary": summary,
            "doc_id": doc_id
        }

        # Store in ChromaDB: every chunk carries the same document metadata.
        collection.add(
            documents=chunks,
            ids=chunk_ids,
            metadatas=[metadata] * len(chunks)
        )

        return {
            "success": True,
            "doc_id": doc_id,
            "file_name": path.name,
            "tags": tags,
            "summary": summary,
            "chunks_processed": len(chunks),
            "metadata": metadata
        }

    except Exception as exc:
        logger.error(f"Error processing file: {str(exc)}")
        return {
            "success": False,
            "error": str(exc)
        }
455
+
456
async def process_web_content(url_or_query: str) -> Dict[str, Any]:
    """Process web content from URL or search query.

    Input starting with ``http://`` or ``https://`` is fetched directly.
    Anything else is treated as a search query: the search results are tried
    in order and the first one whose page can be fetched is ingested, with
    the query and result title prepended to the content.

    Args:
        url_or_query: A URL to fetch, or free-text search terms.

    Returns:
        On success, a dictionary with ``success=True``, the new ``doc_id``,
        source URL, tags, summary, number of chunks, the stored metadata and
        the search query (None for direct URLs).
        On any failure, ``{"success": False, "error": <message>}``.
    """
    try:
        # Check if it's a URL or search query
        is_url = url_or_query.startswith(('http://', 'https://'))

        if is_url:
            content = await fetch_web_content_free(url_or_query)
            source = url_or_query
        else:
            # It's a search query
            search_results = await search_web_free(url_or_query, num_results=3)
            if not search_results:
                raise ValueError("No search results found")

            content = None
            source = search_results[0]['url']
            for result in search_results:
                if not result['url']:
                    continue
                fetched = await fetch_web_content_free(result['url'])
                if fetched:
                    source = result['url']
                    # Add search context only to content that was actually
                    # fetched; formatting a failed fetch (None) into the
                    # string would hide the failure and store "None".
                    content = f"Search Query: {url_or_query}\n\n{result['title']}\n\n{fetched}"
                    break

        if not content:
            raise ValueError("Failed to fetch content")

        # The document ID hashes the source together with the current time.
        doc_id = hashlib.md5(f"{source}_{datetime.now().isoformat()}".encode()).hexdigest()

        # Only a prefix of the content is used for tagging and summarizing.
        tags = await generate_tags(content[:3000])
        summary = await generate_summary(content[:5000])

        # Chunk the content
        chunks = chunk_text(content, chunk_size=1000, overlap=100)
        chunks = chunks[:10]  # Limit for demo

        # Store in ChromaDB
        chunk_ids = [f"{doc_id}_{i}" for i in range(len(chunks))]

        metadata = {
            "source": source,
            "url": source if is_url else f"Search: {url_or_query}",
            "content_type": "web",
            "processed_at": datetime.now().isoformat(),
            "tags": ", ".join(tags),
            "summary": summary,
            "doc_id": doc_id
        }

        collection.add(
            documents=chunks,
            ids=chunk_ids,
            metadatas=[metadata for _ in chunks]
        )

        return {
            "success": True,
            "doc_id": doc_id,
            "url": source,
            "tags": tags,
            "summary": summary,
            "chunks_processed": len(chunks),
            "metadata": metadata,
            "search_query": url_or_query if not is_url else None
        }

    except Exception as e:
        logger.error(f"Error processing web content: {str(e)}")
        return {
            "success": False,
            "error": str(e)
        }
531
+
532
async def search_knowledge_base(query: str, limit: int = 5) -> List[Dict[str, Any]]:
    """Perform semantic search in the knowledge base.

    Queries the ChromaDB collection for the ``limit`` closest chunks and
    reports each document once, keeping its best-ranked chunk. Because
    several chunks may belong to one document, fewer than ``limit``
    documents can be returned.

    Args:
        query: Free-text search query.
        limit: Maximum number of chunks to request from the collection.

    Returns:
        A list of dictionaries with ``doc_id``, ``source``, ``tags``,
        ``summary``, ``relevance_score`` (1 minus the reported distance) and
        ``processed_at``. An empty list is returned when nothing matches or
        the query fails (the failure is logged).
    """
    try:
        results = collection.query(
            query_texts=[query],
            n_results=limit
        )

        if not results["ids"][0]:
            return []

        # Format results
        formatted_results = []
        seen_docs = set()

        hits = zip(results["ids"][0], results["metadatas"][0], results["distances"][0])
        for chunk_id, metadata, distance in hits:
            metadata = metadata or {}
            # Fall back to the chunk ID so one entry stored without a
            # doc_id does not abort the whole search with a KeyError.
            doc_id = metadata.get("doc_id", chunk_id)

            # Deduplicate by document
            if doc_id in seen_docs:
                continue
            seen_docs.add(doc_id)

            # Tags are stored as one comma-separated string; skip blanks so
            # an empty string yields [] rather than [""].
            tags = [tag.strip() for tag in metadata.get("tags", "").split(",") if tag.strip()]

            formatted_results.append({
                "doc_id": doc_id,
                "source": metadata.get("source", "Unknown"),
                "tags": tags,
                "summary": metadata.get("summary", ""),
                "relevance_score": 1 - distance,
                "processed_at": metadata.get("processed_at", "")
            })

        return formatted_results

    except Exception as e:
        logger.error(f"Error searching knowledge base: {str(e)}")
        return []
567
+
568
async def get_document_details(doc_id: str) -> Dict[str, Any]:
    """Get detailed information about a document.

    Looks up one stored chunk whose metadata ``doc_id`` matches and reports
    the document-level metadata kept on that chunk.

    Args:
        doc_id: Document ID as returned when the document was processed.

    Returns:
        A dictionary with ``doc_id``, ``source``, ``tags``, ``summary``,
        ``processed_at``, ``file_type`` and a ``content_preview`` of up to
        500 characters, or ``{"error": <message>}`` when the document is
        missing or the lookup fails.
    """
    try:
        results = collection.get(
            where={"doc_id": doc_id},
            limit=1
        )

        if not results["ids"]:
            return {"error": "Document not found"}

        metadata = results["metadatas"][0] or {}
        document = results["documents"][0] or ""

        # Tags are stored as one comma-separated string; skip blanks so an
        # empty string yields [] rather than [""].
        tags = [tag.strip() for tag in metadata.get("tags", "").split(",") if tag.strip()]

        # Mark the preview with an ellipsis only when text was cut off.
        preview = document[:500] + ("..." if len(document) > 500 else "")

        return {
            "doc_id": doc_id,
            "source": metadata.get("source", "Unknown"),
            "tags": tags,
            "summary": metadata.get("summary", ""),
            "processed_at": metadata.get("processed_at", ""),
            "file_type": metadata.get("file_type", ""),
            "content_preview": preview
        }

    except Exception as e:
        logger.error(f"Error getting document details: {str(e)}")
        return {"error": str(e)}
requirements.txt CHANGED
@@ -1,12 +1,23 @@
 
1
 
2
- mistralai
 
 
 
 
 
 
3
  python-dotenv
4
- gradio>=4.0
5
- fastmcp>=2.0
6
- chromadb
7
- sentence-transformers
8
- unstructured
9
- requests
10
  beautifulsoup4
11
- plotly
12
- networkx
 
 
 
 
 
1
+ # Requirements for the project
2
 
3
+ gradio==4.44.1
4
+ mcp==1.0.0
5
+ fastmcp==0.1.0
6
+ chromadb==0.4.24
7
+ mistralai==0.4.2
8
+ anthropic
9
+ aiohttp
10
  python-dotenv
11
+ sentence-transformers==2.7.0
12
+ plotly==5.22.0
13
+ pandas==2.2.2
14
+ numpy==1.26.4
15
+ PyPDF2
16
+ python-docx
17
  beautifulsoup4
18
+ markdown
19
+ ebooklib
20
+ newspaper3k
21
+ trafilatura
22
+ duckduckgo-search
23
+ requests