Tonic committed
Commit 4da89a8 · 1 Parent(s): 907eee3

adds chonkie demo

Files changed (3)
  1. __pycache__/analytics.cpython-313.pyc +0 -0
  2. app.py +360 -25
  3. requirements.txt +4 -2
__pycache__/analytics.cpython-313.pyc ADDED
Binary file (5.97 kB).
 
app.py CHANGED
@@ -1,7 +1,8 @@
import os
import asyncio
import time
- from typing import Optional
from datetime import datetime
import httpx
import trafilatura
@@ -13,10 +14,20 @@ from limits.aio.strategies import MovingWindowRateLimiter
from analytics import record_request, last_n_days_df, last_n_days_avg_time_df

# Configuration
- SERPER_API_KEY = os.getenv("SERPER_API_KEY")
SERPER_SEARCH_ENDPOINT = "https://google.serper.dev/search"
SERPER_NEWS_ENDPOINT = "https://google.serper.dev/news"
- HEADERS = {"X-API-KEY": SERPER_API_KEY, "Content-Type": "application/json"}

# Rate limiting
storage = MemoryStorage()
@@ -68,7 +79,7 @@ async def search_web(
    """
    start_time = time.time()

-     if not SERPER_API_KEY:
        await record_request(None, num_results)  # Record even failed requests
        return "Error: SERPER_API_KEY environment variable is not set. Please set it to use this tool."

@@ -87,7 +98,7 @@ async def search_web(
            print(f"[{datetime.now().isoformat()}] Rate limit exceeded")
            duration = time.time() - start_time
            await record_request(duration, num_results)
-             return "Error: Rate limit exceeded. Please try again later (limit: 500 requests per hour)."

        # Select endpoint based on search type
        endpoint = (
@@ -101,7 +112,7 @@ async def search_web(
            payload["page"] = 1

        async with httpx.AsyncClient(timeout=15) as client:
-             resp = await client.post(endpoint, headers=HEADERS, json=payload)

            if resp.status_code != 200:
                duration = time.time() - start_time
@@ -204,6 +215,144 @@ async def search_web(
        return f"Error occurred while searching: {str(e)}. Please try again or check your query."


# Create Gradio interface
with gr.Blocks(title="Web Search MCP Server") as demo:
    gr.HTML(
@@ -267,22 +416,42 @@ with gr.Blocks(title="Web Search MCP Server") as demo:
        )

        with gr.Row():
-             num_results_input = gr.Slider(
-                 minimum=1,
-                 maximum=20,
-                 value=4,
-                 step=1,
-                 label="Number of Results",
-                 info="Optional: How many results to fetch (default: 4)",
-             )
-
-         search_button = gr.Button("Search", variant="primary")

        output = gr.Textbox(
-             label="Extracted Content",
            lines=25,
            max_lines=50,
-             info="The extracted article content will appear here",
        )

        # Add examples
@@ -294,12 +463,33 @@ with gr.Blocks(title="Web Search MCP Server") as demo:
                ["Apple Vision Pro reviews", "search", 4],
                ["best Italian restaurants NYC", "search", 4],
            ],
-             inputs=[query_input, search_type_input, num_results_input],
            outputs=output,
-             fn=search_web,
            cache_examples=False,
        )

    with gr.Tab("Analytics"):
        gr.Markdown("## Community Usage Analytics")
        gr.Markdown(
@@ -334,10 +524,21 @@ with gr.Blocks(title="Web Search MCP Server") as demo:
        )

    search_button.click(
-         fn=search_web,  # Use search_web directly instead of search_and_log
-         inputs=[query_input, search_type_input, num_results_input],
        outputs=output,
-         api_name=False,  # Hide this endpoint from API & MCP
    )

    # Load fresh analytics data when the page loads or Analytics tab is clicked
@@ -347,8 +548,142 @@ with gr.Blocks(title="Web Search MCP Server") as demo:
        api_name=False,
    )

-     # Expose search_web as the only MCP tool
-     gr.api(search_web, api_name="search_web")


if __name__ == "__main__":
import os
import asyncio
import time
+ import json
+ from typing import Optional, List, Dict, Any
from datetime import datetime
import httpx
import trafilatura
 
from analytics import record_request, last_n_days_df, last_n_days_avg_time_df

# Configuration
+ SERPER_API_KEY_ENV = os.getenv("SERPER_API_KEY")
+ SERPER_API_KEY_OVERRIDE: Optional[str] = None
SERPER_SEARCH_ENDPOINT = "https://google.serper.dev/search"
SERPER_NEWS_ENDPOINT = "https://google.serper.dev/news"
+
+
+ def _get_serper_api_key() -> Optional[str]:
+     """Return the currently active Serper API key (override wins, else env)."""
+     return (SERPER_API_KEY_OVERRIDE or SERPER_API_KEY_ENV or None)
+
+
+ def _get_headers() -> Dict[str, str]:
+     api_key = _get_serper_api_key()
+     return {"X-API-KEY": api_key or "", "Content-Type": "application/json"}

# Rate limiting
storage = MemoryStorage()
 
    """
    start_time = time.time()

+     if not _get_serper_api_key():
        await record_request(None, num_results)  # Record even failed requests
        return "Error: SERPER_API_KEY environment variable is not set. Please set it to use this tool."
 
 
            print(f"[{datetime.now().isoformat()}] Rate limit exceeded")
            duration = time.time() - start_time
            await record_request(duration, num_results)
+             return "Error: Rate limit exceeded. Please try again later (limit: 360 requests per hour)."

        # Select endpoint based on search type
        endpoint = (
 
            payload["page"] = 1

        async with httpx.AsyncClient(timeout=15) as client:
+             resp = await client.post(endpoint, headers=_get_headers(), json=payload)

            if resp.status_code != 200:
                duration = time.time() - start_time
 
        return f"Error occurred while searching: {str(e)}. Please try again or check your query."


+ async def search_and_chunk(
+     query: str,
+     search_type: str,
+     num_results: Optional[int],
+     tokenizer_or_token_counter: str,
+     chunk_size: int,
+     chunk_overlap: int,
+     heading_level: int,
+     min_characters_per_chunk: int,
+     max_characters_per_section: int,
+     clean_text: bool,
+ ) -> str:
+     """
+     Complete flow: search -> fetch -> extract with trafilatura -> chunk with MarkdownChunker/Parser.
+     Returns a JSON string of a list[dict] where each dict is a chunk enriched with source metadata.
+     """
+     start_time = time.time()
+
+     if not _get_serper_api_key():
+         await record_request(None, num_results)
+         return json.dumps([
+             {"error": "SERPER_API_KEY not set", "hint": "Set env or paste in the UI"}
+         ])
+
+     # Normalize inputs
+     if num_results is None:
+         num_results = 4
+     num_results = max(1, min(20, int(num_results)))
+     if search_type not in ["search", "news"]:
+         search_type = "search"
+
+     try:
+         # Rate limit
+         if not await limiter.hit(rate_limit, "global"):
+             duration = time.time() - start_time
+             await record_request(duration, num_results)
+             return json.dumps([
+                 {"error": "rate_limited", "limit": "360/hour"}
+             ])
+
+         endpoint = (
+             SERPER_NEWS_ENDPOINT if search_type == "news" else SERPER_SEARCH_ENDPOINT
+         )
+         payload = {"q": query, "num": num_results}
+         if search_type == "news":
+             payload["type"] = "news"
+             payload["page"] = 1
+
+         async with httpx.AsyncClient(timeout=15) as client:
+             resp = await client.post(endpoint, headers=_get_headers(), json=payload)
+
+             if resp.status_code != 200:
+                 duration = time.time() - start_time
+                 await record_request(duration, num_results)
+                 return json.dumps([
+                     {"error": "bad_status", "status": resp.status_code}
+                 ])
+
+             results = resp.json().get("news" if search_type == "news" else "organic", [])
+             if not results:
+                 duration = time.time() - start_time
+                 await record_request(duration, num_results)
+                 return json.dumps([])
+
+         # Fetch pages concurrently
+         urls = [r.get("link") for r in results]
+         async with httpx.AsyncClient(timeout=20, follow_redirects=True) as client:
+             responses = await asyncio.gather(*[client.get(u) for u in urls], return_exceptions=True)
+
+         all_chunks: List[Dict[str, Any]] = []
+
+         for meta, response in zip(results, responses):
+             if isinstance(response, Exception):
+                 continue
+
+             extracted = trafilatura.extract(
+                 response.text, include_formatting=True, include_comments=False
+             )
+             if not extracted:
+                 continue
+
+             # Build a markdown doc with metadata header to help heading-aware chunking
+             if search_type == "news":
+                 # Parse date if present
+                 try:
+                     date_str = meta.get("date", "")
+                     date_iso = (
+                         dateparser.parse(date_str, fuzzy=True).strftime("%Y-%m-%d") if date_str else "Unknown"
+                     )
+                 except Exception:
+                     date_iso = "Unknown"
+                 markdown_doc = (
+                     f"# {meta.get('title', 'Untitled')}\n\n"
+                     f"**Source:** {meta.get('source', 'Unknown')} **Date:** {date_iso}\n\n"
+                     f"**URL:** {meta.get('link', '')}\n\n"
+                     f"{extracted.strip()}\n"
+                 )
+             else:
+                 domain = (meta.get("link", "").split("/")[2].replace("www.", "") if meta.get("link") else "")
+                 markdown_doc = (
+                     f"# {meta.get('title', 'Untitled')}\n\n"
+                     f"**Domain:** {domain}\n\n"
+                     f"**URL:** {meta.get('link', '')}\n\n"
+                     f"{extracted.strip()}\n"
+                 )
+
+             # Run markdown chunker
+             chunks = _run_markdown_chunker(
+                 markdown_doc,
+                 tokenizer_or_token_counter=tokenizer_or_token_counter,
+                 chunk_size=chunk_size,
+                 chunk_overlap=chunk_overlap,
+                 heading_level=heading_level,
+                 min_characters_per_chunk=min_characters_per_chunk,
+                 max_characters_per_section=max_characters_per_section,
+                 clean_text=clean_text,
+             )
+
+             # Enrich with metadata for traceability
+             for c in chunks:
+                 c.setdefault("source_title", meta.get("title"))
+                 c.setdefault("url", meta.get("link"))
+                 if search_type == "news":
+                     c.setdefault("source", meta.get("source"))
+                     c.setdefault("date", meta.get("date"))
+                 else:
+                     c.setdefault("domain", domain)
+                 all_chunks.append(c)
+
+         duration = time.time() - start_time
+         await record_request(duration, num_results)
+         return json.dumps(all_chunks, ensure_ascii=False)
+
+     except Exception as e:
+         duration = time.time() - start_time
+         await record_request(duration, num_results)
+         return json.dumps([{"error": str(e)}])
+
# Create Gradio interface
with gr.Blocks(title="Web Search MCP Server") as demo:
    gr.HTML(
 
        )

        with gr.Row():
+             with gr.Column(scale=3):
+                 serper_key_input = gr.Textbox(
+                     label="Serper API Key",
+                     placeholder="Enter your Serper API key or set SERPER_API_KEY env var",
+                     type="password",
+                 )
+             with gr.Column(scale=1):
+                 set_key_btn = gr.Button("Save API Key")
+
+         with gr.Accordion("Chunking Parameters", open=False):
+             with gr.Row():
+                 num_results_input = gr.Slider(
+                     minimum=1,
+                     maximum=20,
+                     value=4,
+                     step=1,
+                     label="Number of Results",
+                     info="Results to fetch (1-20)",
+                 )
+                 chunk_size_input = gr.Slider(100, 4000, value=1000, step=50, label="Chunk Size (characters)")
+                 heading_level_input = gr.Slider(1, 6, value=3, step=1, label="Max Heading Level")
+             with gr.Row():
+                 min_chars_input = gr.Slider(0, 1000, value=50, step=10, label="Min characters per chunk")
+                 max_chars_input = gr.Slider(500, 10000, value=4000, step=100, label="Max characters per section")
+             with gr.Row():
+                 tokenizer_input = gr.Dropdown(choices=["character"], value="character", label="Tokenizer")
+                 overlap_input = gr.Slider(0, 400, value=0, step=10, label="Chunk overlap (reserved)")
+                 clean_text_input = gr.Checkbox(value=True, label="Clean text (strip inline markdown/URLs)")
+
+         search_button = gr.Button("Search + Chunk", variant="primary")

        output = gr.Textbox(
+             label="Chunks (JSON List[Dict])",
            lines=25,
            max_lines=50,
+             info="Output is a JSON string list of chunk dicts",
        )

        # Add examples
 
                ["Apple Vision Pro reviews", "search", 4],
                ["best Italian restaurants NYC", "search", 4],
            ],
+             inputs=[
+                 query_input,
+                 search_type_input,
+                 num_results_input,
+                 tokenizer_input,
+                 chunk_size_input,
+                 overlap_input,
+                 heading_level_input,
+                 min_chars_input,
+                 max_chars_input,
+                 clean_text_input,
+             ],
            outputs=output,
+             fn=search_and_chunk,
            cache_examples=False,
        )

+         def _set_serper_key(key: str) -> str:
+             global SERPER_API_KEY_OVERRIDE
+             SERPER_API_KEY_OVERRIDE = (key or "").strip() or None
+             # Minimal validation/echo without exposing the full key
+             if SERPER_API_KEY_OVERRIDE:
+                 return "Serper API key saved in-session."
+             return "Cleared in-session API key. Using environment if set."
+
+         set_key_btn.click(fn=_set_serper_key, inputs=serper_key_input, outputs=output)
+
    with gr.Tab("Analytics"):
        gr.Markdown("## Community Usage Analytics")
        gr.Markdown(
524
  )
525
 
526
  search_button.click(
527
+ fn=search_and_chunk,
528
+ inputs=[
529
+ query_input,
530
+ search_type_input,
531
+ num_results_input,
532
+ tokenizer_input,
533
+ chunk_size_input,
534
+ overlap_input,
535
+ heading_level_input,
536
+ min_chars_input,
537
+ max_chars_input,
538
+ clean_text_input,
539
+ ],
540
  outputs=output,
541
+ api_name=False,
542
  )
543
 
544
  # Load fresh analytics data when the page loads or Analytics tab is clicked
 
548
  api_name=False,
549
  )
550
 
551
+ # Expose search_and_chunk as the MCP tool
552
+ gr.api(search_and_chunk, api_name="search_and_chunk")
553
+
554
+
555
+
556
+
557
+
558
+
559
+
560
+
561
+
562
+
563
+
564
+
565
+
566
+
567
+
568
+
569
+
570
+
571
+
572
+
573
+
574
+
575
+
576
+
577
+
578
+ # -------- Markdown chunk helper (from chonkie) --------
579
+
580
+ def _run_markdown_chunker(
581
+ markdown_text: str,
582
+ tokenizer_or_token_counter: str = "character",
583
+ chunk_size: int = 1000,
584
+ chunk_overlap: int = 0,
585
+ heading_level: int = 3,
586
+ min_characters_per_chunk: int = 50,
587
+ max_characters_per_section: int = 4000,
588
+ clean_text: bool = True,
589
+ ) -> List[Dict[str, Any]]:
590
+ """
591
+ Use chonkie's MarkdownChunker or MarkdownParser to chunk markdown text and
592
+ return a List[Dict] with useful fields.
593
+
594
+ This follows the documentation in the chonkie commit introducing MarkdownChunker
595
+ and its parameters.
596
+ """
597
+ markdown_text = markdown_text or ""
598
+ if not markdown_text.strip():
599
+ return []
600
+
601
+ # Lazy import so the app can still run without the dependency until this is used
602
+ try:
603
+ try:
604
+ from chonkie import MarkdownParser # type: ignore
605
+ except Exception:
606
+ try:
607
+ from chonkie.chunker.markdown import MarkdownParser # type: ignore
608
+ except Exception:
609
+ MarkdownParser = None # type: ignore
610
+ try:
611
+ from chonkie import MarkdownChunker # type: ignore
612
+ except Exception:
613
+ from chonkie.chunker.markdown import MarkdownChunker # type: ignore
614
+ except Exception as exc:
615
+ return [{
616
+ "error": "chonkie not installed",
617
+ "detail": "Install chonkie from the feat/markdown-chunker branch",
618
+ "exception": str(exc),
619
+ }]
620
+
621
+ # Prefer MarkdownParser if available and it yields dicts
622
+ if 'MarkdownParser' in globals() and MarkdownParser is not None:
623
+ try:
624
+ parser = MarkdownParser(
625
+ tokenizer_or_token_counter=tokenizer_or_token_counter,
626
+ chunk_size=int(chunk_size),
627
+ chunk_overlap=int(chunk_overlap),
628
+ heading_level=int(heading_level),
629
+ min_characters_per_chunk=int(min_characters_per_chunk),
630
+ max_characters_per_section=int(max_characters_per_section),
631
+ clean_text=bool(clean_text),
632
+ )
633
+ result = parser.parse(markdown_text) if hasattr(parser, 'parse') else parser(markdown_text) # type: ignore
634
+ # If the parser returns list of dicts already, pass-through
635
+ if isinstance(result, list) and (not result or isinstance(result[0], dict)):
636
+ return result # type: ignore
637
+ # Else, normalize below
638
+ chunks = result
639
+ except Exception:
640
+ # Fall back to chunker if parser invocation fails
641
+ chunks = None
642
+ else:
643
+ chunks = None
644
+
645
+ # Fallback to MarkdownChunker if needed or normalization for non-dicts
646
+ if chunks is None:
647
+ chunker = MarkdownChunker(
648
+ tokenizer_or_token_counter=tokenizer_or_token_counter,
649
+ chunk_size=int(chunk_size),
650
+ chunk_overlap=int(chunk_overlap),
651
+ heading_level=int(heading_level),
652
+ min_characters_per_chunk=int(min_characters_per_chunk),
653
+ max_characters_per_section=int(max_characters_per_section),
654
+ clean_text=bool(clean_text),
655
+ )
656
+ if hasattr(chunker, 'chunk'):
657
+ chunks = chunker.chunk(markdown_text) # type: ignore
658
+ elif hasattr(chunker, 'split_text'):
659
+ chunks = chunker.split_text(markdown_text) # type: ignore
660
+ elif callable(chunker):
661
+ chunks = chunker(markdown_text) # type: ignore
662
+ else:
663
+ return [{"error": "Unknown MarkdownChunker interface"}]
664
+
665
+ # Normalize chunks to list of dicts
666
+ normalized: List[Dict[str, Any]] = []
667
+ for c in (chunks or []):
668
+ if isinstance(c, dict):
669
+ normalized.append(c)
670
+ continue
671
+ item: Dict[str, Any] = {}
672
+ for field in ("text", "start_index", "end_index", "token_count", "heading", "metadata"):
673
+ if hasattr(c, field):
674
+ try:
675
+ item[field] = getattr(c, field)
676
+ except Exception:
677
+ pass
678
+ if not item:
679
+ # Last resort: string representation
680
+ item = {"text": str(c)}
681
+ normalized.append(item)
682
+ return normalized
683
+
684
+
685
+ with demo:
686
+ pass
687
 
688
 
689
  if __name__ == "__main__":
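The new `gr.api(search_and_chunk, api_name="search_and_chunk")` call exposes the full search -> fetch -> extract -> chunk flow as the app's API/MCP tool. Below is a minimal sketch of calling it with `gradio_client`, assuming the Space is running locally on Gradio's default port; the URL is illustrative and the endpoint name simply mirrors the `api_name` above.

```python
import json

from gradio_client import Client

# Hypothetical local URL of the running Space; replace with the actual host.
client = Client("http://127.0.0.1:7860")

result = client.predict(
    "chunking strategies for RAG",  # query
    "search",                       # search_type: "search" or "news"
    4,                              # num_results (clamped to 1-20)
    "character",                    # tokenizer_or_token_counter
    1000,                           # chunk_size
    0,                              # chunk_overlap (reserved)
    3,                              # heading_level
    50,                             # min_characters_per_chunk
    4000,                           # max_characters_per_section
    True,                           # clean_text
    api_name="/search_and_chunk",
)

# search_and_chunk returns a JSON string: a list of chunk dicts with "text"
# plus source metadata such as "url" and "source_title".
chunks = json.loads(result)
print(len(chunks), chunks[0].get("url") if chunks else None)
```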
requirements.txt CHANGED
@@ -1,6 +1,8 @@
- gradio
+ gradio[mcp]
httpx
trafilatura
python-dateutil
limits
- filelock
+ filelock
+ pandas
+ git+https://github.com/Josephrp/chonkie@feat/markdown-chunker
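requirements.txt now pins chonkie to the feat/markdown-chunker branch, which provides the MarkdownChunker/MarkdownParser used by `_run_markdown_chunker` above. A minimal sketch of exercising that helper directly, assuming the branch installs cleanly and that importing `app` (which builds the Gradio Blocks as a side effect, without launching it) is acceptable; the sample markdown text is illustrative.

```python
import json

# Importing app constructs the Gradio interface but does not launch the server.
from app import _run_markdown_chunker

sample_md = (
    "# Example article\n\n"
    "**URL:** https://example.com/post\n\n"
    "Intro paragraph.\n\n"
    "## Details\n\n"
    "A longer body section that the chunker can split on headings.\n"
)

chunks = _run_markdown_chunker(
    sample_md,
    tokenizer_or_token_counter="character",
    chunk_size=1000,
    chunk_overlap=0,
    heading_level=3,
    min_characters_per_chunk=50,
    max_characters_per_section=4000,
    clean_text=True,
)

# If chonkie is missing, the helper returns a single {"error": ...} dict instead of chunks.
print(json.dumps(chunks, indent=2)[:500])
```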