Spaces:

nbroad
/

hf-discussion-search

Running

App Files Files Community

nbroad committed on Oct 4, 2024

Commit

1606cd0

verified ·

1 Parent(s): 5b96dd0

big update

Browse files

Files changed (4) hide show

app.py +35 -32
constants.py +15 -0
requirements.txt +5 -2
update.py +198 -0

app.py CHANGED Viewed

@@ -1,18 +1,21 @@
-from fasthtml.common import *
-from datetime import datetime, timedelta
-import requests
-from datetime import datetime
 import json
-from markdown import markdown
 from dotenv import load_dotenv
 loaded = load_dotenv("./.env", override=True)
 print("Loaded .env file:", loaded)
-API_URL = os.getenv("API_URL")
-API_KEY = os.getenv("MS_SEARCH_KEY")
 css_content = open("styles.css").read()
@@ -82,35 +85,30 @@ def iso_to_unix_timestamp(iso_string):
 def unix_timestamp_to_nice_format(timestamp):
-    dt = datetime.fromtimestamp(timestamp)
-    return dt.strftime("%b %d, %Y")
 def make_query(query, start_date, end_date, page=1, limit=10):
-    url = f"{API_URL}/indexes/comments/search"
-    headers = {
-        "Content-Type": "application/json",
-        "Authorization": f"Bearer {API_KEY}",
-    }
     after_timestamp = iso_to_unix_timestamp(start_date)
-    before_timestamp = iso_to_unix_timestamp(end_date)
-    query = {
-        "q": query,
         "limit": limit,
         "offset": (page - 1) * limit,
-        "filter": f"comment_updatedAt_timestamp >= {after_timestamp} AND comment_updatedAt_timestamp < {before_timestamp}",
-        "attributesToCrop": ["comment_text"],
         "cropLength": 30,
-        "attributesToHighlight": ["comment_text", "discussion_title"],
         "highlightPreTag": '<span class="highlight">',
         "highlightPostTag": "</span>",
     }
-    response = requests.post(url, headers=headers, json=query)
-    return response.json()
 def search_results(query, start_date, end_date, page=1):
@@ -119,9 +117,7 @@ def search_results(query, start_date, end_date, page=1):
     return Div(
         make_results_bar(raw_results),
         Div(*[make_card(r) for r in raw_results["hits"]]),
-        make_pagination(
-            query, start_date, end_date, page, raw_results["estimatedTotalHits"]
-        ),
         id="search-results",
     )
@@ -138,13 +134,14 @@ def make_results_bar(results):
 def make_card(result):
     result = result["_formatted"]
-    url = f"https://hf.co/{result['repo_id']}/discussions/{result['discussion_num']}"
-    date = unix_timestamp_to_nice_format(int(result["comment_updatedAt_timestamp"]))
     return Div(
         Div(
-            Strong(NotStr(result["discussion_title"])),
-            P(NotStr(result["comment_text"]), cls="comment-text"),
             Div(Span(date)),
             A(url, href=url, target="_blank"),
         ),
@@ -152,7 +149,7 @@ def make_card(result):
     )
-def make_pagination(query, start_date, end_date, current_page, total_hits, limit=10):
     total_pages = -(-total_hits // limit)  # Ceiling division
     children = []
@@ -218,4 +215,10 @@ def post(query: str, start_date: str, end_date: str, page: int = 1):
     return search_results(query, start_date, end_date, page)
-serve()

 import json
+import os
+from datetime import datetime, timezone, timedelta
+import meilisearch
+from fasthtml.common import *
+from markdown import markdown
 from dotenv import load_dotenv
+from constants import MeilisearchIndexFields
+from update import process_webhook
 loaded = load_dotenv("./.env", override=True)
 print("Loaded .env file:", loaded)
+MS_URL = os.getenv("MS_URL")
+MS_SEARCH_KEY = os.getenv("MS_SEARCH_KEY")
+ms_client = meilisearch.Client(MS_URL, MS_SEARCH_KEY)
 css_content = open("styles.css").read()
 def unix_timestamp_to_nice_format(timestamp):
+    dt = datetime.fromtimestamp(timestamp, tz=timezone.utc)
+    return dt.strftime("%b %d, %Y at %H:%M UTC")
 def make_query(query, start_date, end_date, page=1, limit=10):
+    twenty_three_hours_59_minutes_59_seconds_in_seconds = (23 * 60 + 59) * 60 + 59
     after_timestamp = iso_to_unix_timestamp(start_date)
+    before_timestamp = iso_to_unix_timestamp(end_date) + twenty_three_hours_59_minutes_59_seconds_in_seconds
+    options = {
         "limit": limit,
         "offset": (page - 1) * limit,
+        "filter": f"{MeilisearchIndexFields.UPDATED_AT.value} >= {after_timestamp} AND {MeilisearchIndexFields.UPDATED_AT.value} < {before_timestamp}",
+        "attributesToCrop": [MeilisearchIndexFields.CONTENT.value],
         "cropLength": 30,
+        "attributesToHighlight": [MeilisearchIndexFields.CONTENT.value, MeilisearchIndexFields.TITLE.value],
         "highlightPreTag": '<span class="highlight">',
         "highlightPostTag": "</span>",
     }
+    return ms_client.index(MeilisearchIndexFields.INDEX_NAME.value).search(query=query, opt_params=options)
 def search_results(query, start_date, end_date, page=1):
     return Div(
         make_results_bar(raw_results),
         Div(*[make_card(r) for r in raw_results["hits"]]),
+        make_pagination(page, raw_results["estimatedTotalHits"]),
         id="search-results",
     )
 def make_card(result):
     result = result["_formatted"]
+    url = result[MeilisearchIndexFields.URL.value]
+    date = unix_timestamp_to_nice_format(int(result[MeilisearchIndexFields.UPDATED_AT.value]))
     return Div(
         Div(
+            Strong(NotStr(result[MeilisearchIndexFields.TITLE.value])),
+            P(NotStr(result[MeilisearchIndexFields.CONTENT.value]), cls="comment-text"),
             Div(Span(date)),
             A(url, href=url, target="_blank"),
         ),
     )
+def make_pagination(current_page, total_hits, limit=10):
     total_pages = -(-total_hits // limit)  # Ceiling division
     children = []
     return search_results(query, start_date, end_date, page)
+@app.post("/webhook")
+async def hf_webhook(request):
+    return await process_webhook(request)
+serve()

constants.py ADDED Viewed

	@@ -0,0 +1,15 @@

+from enum import Enum
+class MeilisearchIndexFields(Enum):
+    INDEX_NAME = "comments"
+    ID = "comment_id"
+    CONTENT = "content"
+    TITLE = "title"
+    STATUS = "status"
+    AUTHOR = "author"
+    URL = "url"
+    REPO_ID = "repo_id"
+    UPDATED_AT = "updatedAt"

requirements.txt CHANGED Viewed

@@ -1,5 +1,8 @@
 uvicorn
 python-fasthtml
 python-dotenv
-fasthtml-hf==0.1.4
-markdown

 uvicorn
 python-fasthtml
 python-dotenv
+fasthtml-hf
+markdown
+meilisearch
+huggingface_hub
+requests

update.py ADDED Viewed

	@@ -0,0 +1,198 @@

+"""
+This file has functions to update the meilisearch index with new comments.
+Payload from HF webhook looks like this:
+    {
+  "event": {
+    "action": "update",
+    "scope": "discussion.comment"
+  },
+  "repo": {
+    "type": "dataset",
+    "name": "allenai/objaverse",
+    "id": "63977bb96bdef8095268ded0",
+    "private": false,
+    "url": {
+      "web": "https://huggingface.co/datasets/allenai/objaverse",
+      "api": "https://huggingface.co/api/datasets/allenai/objaverse"
+    },
+    "owner": {
+      "id": "5e70f3648ce3c604d78fe132"
+    }
+  },
+  "discussion": {
+    "id": "66f1a1092eb1ea2422555d24",
+    "title": "PullRequest",
+    "url": {
+      "web": "https://huggingface.co/datasets/allenai/objaverse/discussions/63",
+      "api": "https://huggingface.co/api/datasets/allenai/objaverse/discussions/63"
+    },
+    "status": "draft",
+    "author": {
+      "id": "6673e848436907f83a815ab0"
+    },
+    "num": 63,
+    "isPullRequest": true,
+    "changes": {
+      "base": "refs/heads/main"
+    }
+  },
+  "comment": {
+    "id": "66f1a1092eb1ea2422555d25",
+    "author": {
+      "id": "6673e848436907f83a815ab0"
+    },
+    "hidden": true,
+    "url": {
+      "web": "https://huggingface.co/datasets/allenai/objaverse/discussions/63#66f1a1092eb1ea2422555d25"
+    }
+  },
+  "webhook": {
+    "id": "66d7991f9b7da501cd100d95",
+    "version": 3
+  }
+}
+"""
+import time
+import json
+import os
+from datetime import datetime, timezone
+import requests
+from dotenv import load_dotenv
+from huggingface_hub import HfApi
+from meilisearch import Client
+from huggingface_hub import HfApi
+from constants import MeilisearchIndexFields
+load_dotenv(".env", override=True)
+WEBHOOK_SECRET = os.getenv("WEBHOOK_SECRET")
+MEILISEARCH_URL = os.getenv("MS_URL")
+MEILISEARCH_KEY = os.getenv("MS_ADMIN_KEY")
+ms_client = Client(MEILISEARCH_URL, MEILISEARCH_KEY)
+api = HfApi(token=os.environ["HF_WEBHOOK_TOKEN"])
+async def process_webhook(request):
+    payload = await request.body()
+    payload = payload.decode("utf-8")
+    print(payload)
+    payload = json.loads(payload)
+    secret = request.headers.get("X-Webhook-Secret")
+    if secret != WEBHOOK_SECRET:
+        print("Invalid secret")
+        return {"error": "Invalid secret"}, 400
+    if payload["repo"]["type"] == "model":
+        if "discussion" not in payload or payload["discussion"]["isPullRequest"]:
+            return {"status": "skipped"}, 200
+        changing_status = "comment" not in payload and payload["event"]["action"] == "update"
+        if changing_status:
+            update_discussion_status(payload)
+        else:
+            add_new_comment(payload)
+    return {"status": "success"}, 200
+def user_id_to_username(user_id):
+    api_url = f"https://huggingface.co/api/users/{user_id}/overview"
+    try:
+        response = requests.get(api_url)
+        return response.json()["user"]
+    except Exception as e:
+        print(f"Couldn't get username for id {user_id}: {e}")
+        return user_id
+def add_new_comment(payload):
+    comment = payload["comment"].get("content", "")
+    comment_id = payload["comment"]["id"]
+    repo_id = payload["repo"]["name"]
+    title = payload["discussion"]["title"]
+    author_id = payload["comment"]["author"]["id"]
+    author = user_id_to_username(author_id)
+    url = payload["discussion"]["url"]["web"]
+    updatedAt = int(datetime.now(timezone.utc).timestamp())
+    status = payload["discussion"]["status"]
+    melisearch_payload = {
+        MeilisearchIndexFields.ID.value: comment_id,
+        MeilisearchIndexFields.TITLE.value: title,
+        MeilisearchIndexFields.STATUS.value: status,
+        MeilisearchIndexFields.AUTHOR.value: author,
+        MeilisearchIndexFields.URL.value: url,
+        MeilisearchIndexFields.REPO_ID.value: repo_id,
+        MeilisearchIndexFields.CONTENT.value: comment,
+        MeilisearchIndexFields.UPDATED_AT.value: updatedAt,
+    }
+    ms_client.index(MeilisearchIndexFields.INDEX_NAME.value).add_documents([melisearch_payload])
+def update_discussion_status(payload):
+    # If closing and commenting at the same time,
+    # the comment comes with status = open after the webhook that says the discussion is closed.
+    # Adding the sleep ensures the update comes afterwards
+    time.sleep(1)
+    url = payload["discussion"]["url"]["web"]
+    status = payload["discussion"]["status"]
+    existing_results = ms_client.index(MeilisearchIndexFields.INDEX_NAME.value).search(
+        query="",
+        opt_params={"filter": f"url = '{url}'"}
+    )
+    if len(existing_results["hits"]) > 0:
+        docs2update = [
+            {MeilisearchIndexFields.ID.value: d[MeilisearchIndexFields.ID.value], MeilisearchIndexFields.STATUS.value: status}
+            for d in existing_results["hits"]
+        ]
+        update_request = ms_client.index(MeilisearchIndexFields.INDEX_NAME.value).update_documents(docs2update)
+        print("Update request:", update_request)
+def update_webhooks():
+    """
+    Delete the old
+    """
+    existing_webhooks = api.list_webhooks()
+    webhook_url = os.environ["HF_WEBHOOK_URL"]
+    id2update = [x for x in existing_webhooks if x.url == webhook_url]
+    if len(id2update) > 1:
+        print("More than one webhook found")
+        print(id2update)
+        print("updating the first one")
+    id2update = id2update[0]
+    # get trending models
+    trending_models = api.list_models(sort="likes7d", direction=-1, limit=100)
+    to_add = []