Ezi Ozoani committed on
Commit
4c5342d
·
1 Parent(s): bb7b5d0
Files changed (3) hide show
  1. Dockerfile +27 -0
  2. main.py +168 -0
  3. requirements.txt +4 -0
Dockerfile ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Use the official Python 3.11 slim (Debian bullseye) image
2
+ FROM python:3.11-slim-bullseye
3
+
4
+ # Set the working directory to /code
5
+ WORKDIR /code
6
+
7
+ # Copy requirements.txt into the container at /code
8
+ COPY ./requirements.txt /code/requirements.txt
9
+
10
+ # Install requirements.txt
11
+ RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
12
+
13
+ # Set up a new user named "user" with user ID 1000
14
+ RUN useradd -m -u 1000 user
15
+ # Switch to the "user" user
16
+ USER user
17
+ # Set HOME to the user's home directory and add the user's local bin directory to PATH
18
+ ENV HOME=/home/user \
19
+ PATH=/home/user/.local/bin:$PATH
20
+
21
+ # Set the working directory to the app directory inside the user's home directory
22
+ WORKDIR $HOME/app
23
+
24
+ # Copy the current directory contents into the container at $HOME/app setting the owner to the user
25
+ COPY --chown=user . $HOME/app
26
+
27
+ CMD ["uvicorn", "main:app","--proxy-headers", "--host", "0.0.0.0", "--port", "7860"]
main.py ADDED
@@ -0,0 +1,168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ from difflib import SequenceMatcher
4
+ from typing import Any, Dict, Optional, Tuple
5
+
6
+ from fastapi import FastAPI, Request, Response
7
+ from huggingface_hub import (DatasetCard, HfApi, ModelCard, comment_discussion,
8
+ create_discussion, get_discussion_details,
9
+ get_repo_discussions, login)
10
+ from huggingface_hub.utils import EntryNotFoundError
11
+ from tabulate import tabulate
12
+
13
# Shared secret that incoming webhook requests are checked against.
KEY = os.getenv("WEBHOOK_SECRET")
# Token handed to the Hugging Face Hub client and to `login`.
HF_TOKEN = os.getenv("HF_TOKEN")

api = HfApi(token=HF_TOKEN)
login(token=HF_TOKEN)

app = FastAPI()
20
+
21
+
22
@app.get("/")
def read_root():
    """Serve the static HTML landing page that describes this demo app."""
    landing_page = """
<h2 style="text-align:center">Metadata Review Bot</h2>
<p style="text-align:center">This is a demo app showing how to use webhooks to automate metadata review for models and datasets shared on the Hugging Face Hub.</p>
"""
    return Response(media_type="text/html", content=landing_page)
29
+
30
+
31
def similar(a, b):
    """Return the ``SequenceMatcher`` similarity ratio between ``a`` and ``b``.

    The result is a float between 0 and 1, where 1.0 means the two
    sequences are identical.
    """
    matcher = SequenceMatcher(isjunk=None, a=a, b=b)
    return matcher.ratio()
34
+
35
+
36
def create_metadata_key_dict(card_data, repo_type: str):
    """Pick the metadata fields the report cares about out of ``card_data``.

    ``tags`` and ``license`` are always included; the remaining fields
    depend on whether the repository is a model or a dataset. Fields absent
    from ``card_data`` map to ``None``.

    Args:
        card_data: Mapping of repo-card metadata (anything with ``.get``).
        repo_type: ``"model"`` or ``"dataset"``.

    Returns:
        A dict of field name to value, or ``None`` for any other ``repo_type``.
    """
    if repo_type == "model":
        type_specific = (
            "library_name",
            "datasets",
            "metrics",
            "co2",
            "pipeline_tag",
        )
    elif repo_type == "dataset":
        type_specific = (
            "pretty_name",
            "size_categories",
            "task_categories",
            "task_ids",
            "source_datasets",
        )
    else:
        return None
    wanted_fields = ("tags", "license", *type_specific)
    return {field: card_data.get(field) for field in wanted_fields}
54
+
55
+
56
def create_metadata_breakdown_table(desired_metadata_dictionary):
    """Render the metadata fields as a GitHub-flavoured Markdown table.

    Each row pairs a field name with its value; falsy values are shown as
    the placeholder text ``"Field Missing"``.
    """
    rows = [
        (field, value or "Field Missing")
        for field, value in desired_metadata_dictionary.items()
    ]
    column_titles = ("Metadata Field", "Provided Value")
    return tabulate(rows, tablefmt="github", headers=column_titles)
64
+
65
+
66
def calculate_grade(desired_metadata_dictionary):
    """Return the fraction of metadata fields that have a truthy value.

    Args:
        desired_metadata_dictionary: Mapping of field name to its value.

    Returns:
        A float between 0 and 1, rounded to two decimals. An empty mapping
        scores ``0.0`` instead of raising ``ZeroDivisionError``.
    """
    metadata_values = list(desired_metadata_dictionary.values())
    if not metadata_values:
        # Nothing to grade: avoid dividing by zero.
        return 0.0
    filled = sum(1 for field in metadata_values if field)
    return round(filled / len(metadata_values), 2)
70
+
71
+
72
def create_markdown_report(
    desired_metadata_dictionary, repo_name, repo_type, score, update: bool = False
):
    """Build the Markdown report card posted to a repository discussion.

    Args:
        desired_metadata_dictionary: Mapping of metadata field name to the
            value found for it; rendered as the breakdown table.
        repo_name: Repository identifier mentioned in the report text.
        repo_type: Repository kind (e.g. ``"model"`` or ``"dataset"``), used
            in the title, the section heading and the closing remark.
        score: Metadata coverage as a fraction between 0 and 1.
        update: When true, the title is suffixed with ``(updated)``.

    Returns:
        The report as a Markdown string.
    """
    title_suffix = "(updated)" if update else ""
    # The threshold is expressed on the same 0-1 scale as ``score``.
    if score <= 0.5:
        verdict = (
            f"We're not angry we're just disappointed! {repo_type.title()} "
            "metadata is super important. Please try harder..."
        )
    else:
        verdict = f"Not too shabby! Make sure you also fill in a {repo_type} card too!"
    table = create_metadata_breakdown_table(desired_metadata_dictionary)
    # ``score`` is a fraction, so format it as a real percentage; printing the
    # raw value followed by "%" showed e.g. "0.5%" for half coverage.
    grade = f"{score:.0%}"
    report = f"""# {repo_type.title()} metadata report card {title_suffix}
\n
This is an automatically produced metadata quality report card for {repo_name}. This report is meant as a POC!
\n
## Breakdown of metadata fields for your {repo_type}
\n
{table}
\n
You scored a metadata coverage grade of: **{grade}** \n {verdict}
"""
    return report
87
+
88
+
89
def parse_webhook_post(data: Dict[str, Any]) -> Optional[Tuple[str, str]]:
    """Extract ``(repo_type, repo_name)`` from a webhook payload.

    Args:
        data: Decoded webhook body; must contain ``event.scope`` and, for
            repo-scoped events, ``repo.name`` and ``repo.type``.

    Returns:
        ``(repo_type, repo_name)`` for repo-scoped events, ``None`` for any
        other event scope.

    Raises:
        ValueError: If the repository type is neither model nor dataset.
        KeyError: If an expected key is missing from the payload.
    """
    if data["event"]["scope"] != "repo":
        return None
    repo_info = data["repo"]
    name, kind = repo_info["name"], repo_info["type"]
    if kind in {"model", "dataset"}:
        return kind, name
    raise ValueError("Unknown hub type")
99
+
100
+
101
def load_repo_card_metadata(repo_type, repo_name):
    """Load a repository card's metadata as a plain dict.

    Uses ``DatasetCard`` for datasets and ``ModelCard`` for models. Returns
    an empty dict when loading raises ``EntryNotFoundError``, and ``None``
    for any other ``repo_type``.
    """
    if repo_type == "dataset":
        card_cls = DatasetCard
    elif repo_type == "model":
        card_cls = ModelCard
    else:
        return None
    try:
        card = card_cls.load(repo_name)
        return card.data.to_dict()
    except EntryNotFoundError:
        return {}
112
+
113
+
114
def _latest_comment_text(events):
    """Return the text of the most recent event that carries string content.

    Discussion events are not all comments (status or title changes have no
    ``content``), so walk backwards until one with text is found. Returns an
    empty string when there is none.
    """
    for event in reversed(list(events)):
        content = getattr(event, "content", None)
        if isinstance(content, str):
            return content
    return ""


def create_or_update_report(data):
    """Post a metadata report card for the repository named in a webhook payload.

    The report is added as a comment to an existing open discussion titled
    "Metadata Report Card" when one exists and its latest comment differs
    from the freshly generated report; otherwise a new discussion is opened.

    Args:
        data: Decoded webhook payload, passed to ``parse_webhook_post``.

    Returns:
        ``True`` once the report has been handled, or a 400 ``Response`` when
        the payload is not a repo-scoped event.
    """
    parsed_post = parse_webhook_post(data)
    if not parsed_post:
        return Response("Unable to parse webhook data", status_code=400)
    repo_type, repo_name = parsed_post

    card_data = load_repo_card_metadata(repo_type, repo_name)
    desired_metadata_dictionary = create_metadata_key_dict(card_data, repo_type)
    score = calculate_grade(desired_metadata_dictionary)
    report = create_markdown_report(
        desired_metadata_dictionary, repo_name, repo_type, score, update=False
    )
    repo_discussions = get_repo_discussions(
        repo_name,
        repo_type=repo_type,
    )
    for discussion in repo_discussions:
        is_open_report_thread = (
            discussion.title == "Metadata Report Card" and discussion.status == "open"
        )
        if not is_open_report_thread:
            continue
        discussion_details = get_discussion_details(
            repo_name, discussion.num, repo_type=repo_type
        )
        last_comment = _latest_comment_text(discussion_details.events)
        updated_report = create_markdown_report(
            desired_metadata_dictionary,
            repo_name,
            repo_type,
            score,
            update=True,
        )
        # The last comment may be either the initial report or an earlier
        # "(updated)" one; compare against both renderings so an unchanged
        # report is not re-posted on every webhook.
        closest_match = max(
            similar(report, last_comment), similar(updated_report, last_comment)
        )
        if closest_match <= 0.999:
            comment_discussion(
                repo_name,
                discussion.num,
                comment=updated_report,
                repo_type=repo_type,
            )
        # An open report thread exists: never open a second one.
        return True
    create_discussion(
        repo_name,
        "Metadata Report Card",
        description=report,
        repo_type=repo_type,
    )
    return True
159
+
160
+
161
@app.post("/webhook")
async def webhook(request: Request):
    """Handle a Hub webhook call: authenticate it, then create or update the report.

    The request must carry the shared secret in the ``X-Webhook-Secret``
    header. Requests are rejected with 401 when the header is missing or
    wrong, and also when no secret is configured on the server (otherwise a
    missing header would compare equal to the unset secret and be accepted).

    Returns:
        ``"Webhook received!"`` on success, or the error ``Response`` produced
        while authenticating or processing the payload.
    """
    import hmac  # local import; used only for the constant-time comparison

    provided_secret = request.headers.get("X-Webhook-Secret")
    authorized = (
        bool(KEY)
        and provided_secret is not None
        # Compare as bytes in constant time to avoid leaking the secret
        # through timing differences (and to tolerate non-ASCII header values).
        and hmac.compare_digest(provided_secret.encode("utf-8"), KEY.encode("utf-8"))
    )
    if not authorized:
        return Response("Invalid secret", status_code=401)
    data = await request.json()
    result = create_or_update_report(data)
    # A Response object is truthy, so it must be checked by type; otherwise
    # the 400 error would be masked by the success message.
    if isinstance(result, Response):
        return result
    return "Webhook received!"
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ huggingface_hub==0.12.0
2
+ tabulate==0.9.0
3
+ fastapi==0.89.1
4
+ uvicorn==0.20.0