Apply Ruff
yourbench_space/__init__.py
CHANGED
@@ -1,4 +1,5 @@
-from pathlib import Path
 import os
+from pathlib import Path
+
 
 PATH = Path("/home/user/app") if os.environ.get("SYSTEM") == "spaces" else Path("app")
yourbench_space/app.py
CHANGED
@@ -9,17 +9,16 @@ from loguru import logger
 
 import gradio as gr
 from datasets import load_dataset
-from huggingface_hub import
+from huggingface_hub import HfApi, whoami
 from yourbench_space import PATH
 from yourbench_space.utils import (
-    STAGE_DISPLAY_MAP,
     STAGES,
     SubprocessManagerGroup,
     save_files,
-    on_generation_succsess,
     update_dataset,
     map_stage_names,
     is_running_locally,
+    on_generation_succsess,
 )
 from yourbench_space.config import generate_and_save_config
 from yourbench_space.evaluation import run_evaluations, create_eval_file
@@ -235,11 +234,12 @@ def init_session(profile: gr.OAuthProfile | None):
     logger.info(f"Started session for {local_uuid}")
     return gr.State(local_uuid, delete_callback=lambda uid: MANAGERS.remove(uid))
 
+
 btn_launch_evals = gr.Button(
-    "🚀 Launch Evaluation",
+    "🚀 Launch Evaluation",
     visible=True,
-    interactive=
-    variant="primary"
+    interactive=True, # Start non-interactive
+    variant="primary",
 )
 
 with gr.Blocks(theme=gr.themes.Default()) as app:
@@ -251,8 +251,12 @@ with gr.Blocks(theme=gr.themes.Default()) as app:
         with gr.Tab("Choose Documents & Settings", id=0):
             with gr.Column():
                 gr.Markdown("### 📄 Choose your documents and settings")
-                gr.Markdown(
-
+                gr.Markdown(
+                    "Upload your source documents that will form the knowledge base for your benchmark. Set a Hugging Face organization and dataset name."
+                )
+                gr.Markdown(
+                    "This step also generates a config file for running the benchmark pipeline. You can download it to run YourBench locally."
+                )
 
                 with gr.Row():
                     with gr.Accordion("Hugging Face Settings"):
@@ -320,7 +324,9 @@ with gr.Blocks(theme=gr.themes.Default()) as app:
         with gr.Tab("Run Benchmark Pipeline", id=1):
             with gr.Column():
                 gr.Markdown("### ⚙️ Run the benchmark generation pipeline")
-                gr.Markdown(
+                gr.Markdown(
+                    "Start the pipeline to process documents, generate questions, and build the private evaluation dataset. Watch logs, track progress, and preview the results."
+                )
 
                 with gr.Row():
                     start_button = gr.Button("Start Task")
@@ -374,9 +380,9 @@ with gr.Blocks(theme=gr.themes.Default()) as app:
             stages_table.change(
                 on_generation_succsess,
                 inputs=stages_table,
-                outputs=[tabs,btn_launch_evals],
+                outputs=[tabs, btn_launch_evals],
             )
-
+
             # TODO: this timer should only be active when the second tab is passed to active for the first time
             log_timer = gr.Timer(1.0, active=True)
             log_timer.tick(
@@ -388,7 +394,9 @@ with gr.Blocks(theme=gr.themes.Default()) as app:
         with gr.Tab("Evaluate Models on Benchmark", id=2):
            with gr.Column():
                 gr.Markdown("### 🧪 Evaluate models on your benchmark")
-                gr.Markdown(
+                gr.Markdown(
+                    "Runs the evaluation with [Lighteval](https://github.com/huggingface/lighteval) on the resulted dataset using 5+ open models, then deploys a leaderboard as a Hugging Face Space under your org."
+                )
 
                 with gr.Row():
                     with gr.Column():
@@ -406,7 +414,6 @@ with gr.Blocks(theme=gr.themes.Default()) as app:
             )
             clear_status_btn.click(lambda: "", outputs=eval_status)
 
-
 app.load(init_session, outputs=session_state)
 
 app.launch(allowed_paths=[PATH])
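The hunk around line 380 wires a one-second gr.Timer to a tick handler that refreshes the pipeline logs. A minimal, self-contained sketch of that polling pattern is below; the read_logs helper, the component names, and the fake log source are illustrative stand-ins rather than the app's actual code, and gr.Timer assumes a reasonably recent Gradio release.

import gradio as gr

LOG_LINES = []

def read_logs():
    # Stand-in for reading the subprocess log buffer; returns the latest lines as text.
    LOG_LINES.append(f"log entry {len(LOG_LINES)}")
    return "\n".join(LOG_LINES[-20:])

with gr.Blocks() as demo:
    log_output = gr.Textbox(label="Logs", lines=10)
    log_timer = gr.Timer(1.0, active=True)         # fires every second while active
    log_timer.tick(read_logs, outputs=log_output)  # refresh the textbox on each tick

if __name__ == "__main__":
    demo.launch()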
yourbench_space/evaluation.py
CHANGED
@@ -1,13 +1,15 @@
 import os
-import subprocess
 import asyncio
+import subprocess
 from pathlib import Path
 
-from yourbench_space.leaderboard_space.env import INIT_MODELS
 from loguru import logger
 
+from yourbench_space.leaderboard_space.env import INIT_MODELS
+
+
 ON_SPACES = os.environ.get("system") == "spaces"
-OUTPUT_DIR = "/data" if ON_SPACES else "."
+OUTPUT_DIR = "/data" if ON_SPACES else "."  # TODO: fix the space folder
 
 
 def create_eval_file(eval_ds_name: str):
@@ -15,6 +17,7 @@ def create_eval_file(eval_ds_name: str):
     template_path = Path("/home/user/app/yourbench_space/lighteval_task/yourbench_task.py")
     subprocess.run(["lighteval", "tasks", "create", str(template_path), task_name, eval_ds_name])
 
+
 async def run_process(args: list) -> dict:
     process = await asyncio.create_subprocess_exec(
         *args, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
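The second hunk above ends inside run_process, so the rest of the coroutine is not visible. For context, a minimal sketch of the asyncio subprocess pattern it is built on follows; the lines after create_subprocess_exec and the keys of the returned dict are assumptions, not the file's actual code.

import asyncio

async def run_process(args: list) -> dict:
    # Spawn the command without blocking the event loop and capture its output.
    process = await asyncio.create_subprocess_exec(
        *args, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
    )
    stdout, stderr = await process.communicate()  # wait for the process to exit
    return {
        "returncode": process.returncode,
        "stdout": stdout.decode(),
        "stderr": stderr.decode(),
    }

if __name__ == "__main__":
    print(asyncio.run(run_process(["echo", "hello"])))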
yourbench_space/leaderboard_space/app.py
CHANGED
@@ -7,7 +7,7 @@ import gradio as gr
 with gr.Blocks(
     title="YourBench Leaderboard",
     css="button { margin: 0 10px; padding: 5px 15px; }",
-) as
+) as app:
     # DISPLAY TABLE AND ANALYSIS
     title = gr.Markdown(f"YourBench auto-Leaderboard for {TASK}")
     leaderboard = gr.DataFrame(label="Results", interactive=False)
@@ -21,6 +21,6 @@ with gr.Blocks(
 
     samples_ix.change(update_examples, samples_ix, [easy_samples, hard_samples, all_samples])
 
-
+    app.load(run_pipeline, [samples_ix], [leaderboard, easy_samples, hard_samples, all_samples])
 
-
+app.launch()
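The added app.load(...) and app.launch() lines use Gradio's standard page-load hook: the callback runs once per page visit and populates the leaderboard components. A minimal sketch of that pattern is below; run_pipeline and its placeholder data are hypothetical, not the space's real implementation.

import gradio as gr
import pandas as pd

def run_pipeline():
    # Placeholder results; the real space builds this from evaluation outputs.
    return pd.DataFrame({"model": ["model-a", "model-b"], "accuracy": [0.71, 0.64]})

with gr.Blocks(title="YourBench Leaderboard") as app:
    leaderboard = gr.DataFrame(label="Results", interactive=False)
    app.load(run_pipeline, inputs=None, outputs=leaderboard)  # fill the table on page load

app.launch()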
yourbench_space/lighteval_task/yourbench_task.py
CHANGED
@@ -21,21 +21,20 @@
 # SOFTWARE.
 
 
-import logging
 import re
+import logging
 
 import numpy as np
 from aenum import extend_enum
-
+from lighteval.tasks.requests import Doc
 from lighteval.metrics.metrics import Metrics
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
 from lighteval.metrics.metrics_sample import JudgeLLM
 from lighteval.metrics.utils.metric_utils import (
-    CorpusLevelMetricGrouping,
-    MetricCategory,
     MetricUseCase,
+    MetricCategory,
+    CorpusLevelMetricGrouping,
 )
-from lighteval.tasks.lighteval_task import LightevalTaskConfig
-from lighteval.tasks.requests import Doc
 
 
 logger = logging.getLogger(__name__)
@@ -186,7 +185,6 @@ class JudgeLLMYourBench(JudgeLLM):
             max_tokens=2048,
         )
 
-
    def compute(self, sample_ids: list[str], responses: list, formatted_docs: list[Doc]) -> list[dict[str, float]]:
        # If we are evaluating a multiturn task, we need to have specific field in the formatted doc
        questions = [formatted_doc.specific["question"] for formatted_doc in formatted_docs]
@@ -202,11 +200,9 @@ class JudgeLLMYourBench(JudgeLLM):
 
        metrics = []
        for i in range(len(sample_ids)):
-            metrics.append(
-
-
-                }
-            )
+            metrics.append({
+                "accuracy": score[i],
+            })
 
        return metrics
 
yourbench_space/utils.py
CHANGED
@@ -11,7 +11,6 @@ from loguru import logger
 
 import gradio as gr
 from datasets import load_dataset
-
 from yourbench_space import PATH
 
 
@@ -129,11 +128,13 @@ def update_dataset(stages: list, hf_org: str, hf_prefix: str, oauth_token: gr.OA
 
     return (ingestion_df, summarization_df, single_shot_df, multi_hop_df, lighteval_df)
 
+
 def should_enable_eval_tab(stages):
     logger.info(f"Stages received: {stages}")
     logger.info(f"Lighteval stage name: {STAGE_DISPLAY_MAP['lighteval']}")
     return STAGE_DISPLAY_MAP["lighteval"] in stages
 
+
 def on_generation_succsess(stages):
     stages = stages or []
     if STAGE_DISPLAY_MAP["lighteval"] in stages:
@@ -141,6 +142,7 @@ def on_generation_succsess(stages):
         return gr.update(selected=2), gr.update(interactive=True, visible=True)
     return gr.update(), gr.update(interactive=False, visible=True)
 
+
 class SubprocessManagerGroup:
     """Instanciates one manager per user (should be used as a singleton class)"""
 
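on_generation_succsess returns two gr.update(...) values that app.py maps onto [tabs, btn_launch_evals]: the first switches the selected tab, the second toggles the launch button. A minimal, self-contained sketch of that pattern follows; the component names and the plain "lighteval" string are illustrative stand-ins for the real components and the STAGE_DISPLAY_MAP lookup.

import gradio as gr

def on_generation_success(stages):
    # Mirror of the toggle logic: jump to the evaluation tab and enable the button
    # only once the lighteval stage is reported as finished.
    if "lighteval" in (stages or []):
        return gr.update(selected=2), gr.update(interactive=True, visible=True)
    return gr.update(), gr.update(interactive=False, visible=True)

with gr.Blocks() as demo:
    with gr.Tabs() as tabs:
        with gr.Tab("Run Pipeline", id=1):
            stages_box = gr.CheckboxGroup(["ingestion", "lighteval"], label="Finished stages")
        with gr.Tab("Evaluate", id=2):
            launch_btn = gr.Button("Launch Evaluation", interactive=False)
    stages_box.change(on_generation_success, inputs=stages_box, outputs=[tabs, launch_btn])

if __name__ == "__main__":
    demo.launch()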