alozowski (HF Staff) committed
Commit fdfafe5 · Parent: 0203fca

Apply Ruff

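Most of this diff is mechanical: Ruff drops unused imports, re-sorts the remaining ones, and normalizes blank lines and trailing commas. A minimal sketch of how such a pass is typically reproduced locally, assuming Ruff is installed in the environment (the repository's actual Ruff configuration is not shown in this commit):

import subprocess

# Re-apply autofixable lint rules (unused-import removal, isort-style import
# ordering) and then the formatter over the package directory.
subprocess.run(["ruff", "check", "--fix", "yourbench_space"])
subprocess.run(["ruff", "format", "yourbench_space"])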
yourbench_space/__init__.py CHANGED
@@ -1,4 +1,5 @@
-from pathlib import Path
 import os
+from pathlib import Path
+
 
 PATH = Path("/home/user/app") if os.environ.get("SYSTEM") == "spaces" else Path("app")
yourbench_space/app.py CHANGED
@@ -9,17 +9,16 @@ from loguru import logger
 
 import gradio as gr
 from datasets import load_dataset
-from huggingface_hub import whoami, HfApi
+from huggingface_hub import HfApi, whoami
 from yourbench_space import PATH
 from yourbench_space.utils import (
-    STAGE_DISPLAY_MAP,
     STAGES,
     SubprocessManagerGroup,
     save_files,
-    on_generation_succsess,
     update_dataset,
     map_stage_names,
     is_running_locally,
+    on_generation_succsess,
 )
 from yourbench_space.config import generate_and_save_config
 from yourbench_space.evaluation import run_evaluations, create_eval_file
@@ -235,11 +234,12 @@ def init_session(profile: gr.OAuthProfile | None):
     logger.info(f"Started session for {local_uuid}")
     return gr.State(local_uuid, delete_callback=lambda uid: MANAGERS.remove(uid))
 
+
 btn_launch_evals = gr.Button(
-    "🚀 Launch Evaluation",
+    "🚀 Launch Evaluation",
     visible=True,
-    interactive=False, # Start non-interactive
-    variant="primary"
+    interactive=True,  # Start non-interactive
+    variant="primary",
 )
 
 with gr.Blocks(theme=gr.themes.Default()) as app:
@@ -251,8 +251,12 @@ with gr.Blocks(theme=gr.themes.Default()) as app:
         with gr.Tab("Choose Documents & Settings", id=0):
             with gr.Column():
                 gr.Markdown("### 📄 Choose your documents and settings")
-                gr.Markdown("Upload your source documents that will form the knowledge base for your benchmark. Set a Hugging Face organization and dataset name.")
-                gr.Markdown("This step also generates a config file for running the benchmark pipeline. You can download it to run YourBench locally.")
+                gr.Markdown(
+                    "Upload your source documents that will form the knowledge base for your benchmark. Set a Hugging Face organization and dataset name."
+                )
+                gr.Markdown(
+                    "This step also generates a config file for running the benchmark pipeline. You can download it to run YourBench locally."
+                )
 
                 with gr.Row():
                     with gr.Accordion("Hugging Face Settings"):
@@ -320,7 +324,9 @@ with gr.Blocks(theme=gr.themes.Default()) as app:
         with gr.Tab("Run Benchmark Pipeline", id=1):
             with gr.Column():
                 gr.Markdown("### ⚙️ Run the benchmark generation pipeline")
-                gr.Markdown("Start the pipeline to process documents, generate questions, and build the private evaluation dataset. Watch logs, track progress, and preview the results.")
+                gr.Markdown(
+                    "Start the pipeline to process documents, generate questions, and build the private evaluation dataset. Watch logs, track progress, and preview the results."
+                )
 
                 with gr.Row():
                     start_button = gr.Button("Start Task")
@@ -374,9 +380,9 @@ with gr.Blocks(theme=gr.themes.Default()) as app:
     stages_table.change(
        on_generation_succsess,
        inputs=stages_table,
-        outputs=[tabs,btn_launch_evals],
+        outputs=[tabs, btn_launch_evals],
    )
-
+
    # TODO: this timer should only be active when the second tab is passed to active for the first time
    log_timer = gr.Timer(1.0, active=True)
    log_timer.tick(
@@ -388,7 +394,9 @@ with gr.Blocks(theme=gr.themes.Default()) as app:
        with gr.Tab("Evaluate Models on Benchmark", id=2):
            with gr.Column():
                gr.Markdown("### 🧪 Evaluate models on your benchmark")
-                gr.Markdown("Runs the evaluation with [Lighteval](https://github.com/huggingface/lighteval) on the resulted dataset using 5+ open models, then deploys a leaderboard as a Hugging Face Space under your org.")
+                gr.Markdown(
+                    "Runs the evaluation with [Lighteval](https://github.com/huggingface/lighteval) on the resulted dataset using 5+ open models, then deploys a leaderboard as a Hugging Face Space under your org."
+                )
 
                with gr.Row():
                    with gr.Column():
@@ -406,7 +414,6 @@ with gr.Blocks(theme=gr.themes.Default()) as app:
                    )
                    clear_status_btn.click(lambda: "", outputs=eval_status)
 
-
    app.load(init_session, outputs=session_state)
 
app.launch(allowed_paths=[PATH])
yourbench_space/evaluation.py CHANGED
@@ -1,13 +1,15 @@
 import os
-import subprocess
 import asyncio
+import subprocess
 from pathlib import Path
 
-from yourbench_space.leaderboard_space.env import INIT_MODELS
 from loguru import logger
 
+from yourbench_space.leaderboard_space.env import INIT_MODELS
+
+
 ON_SPACES = os.environ.get("system") == "spaces"
-OUTPUT_DIR = "/data" if ON_SPACES else "." # TODO: fix the space folder
+OUTPUT_DIR = "/data" if ON_SPACES else "."  # TODO: fix the space folder
 
 
 def create_eval_file(eval_ds_name: str):
@@ -15,6 +17,7 @@ def create_eval_file(eval_ds_name: str):
     template_path = Path("/home/user/app/yourbench_space/lighteval_task/yourbench_task.py")
     subprocess.run(["lighteval", "tasks", "create", str(template_path), task_name, eval_ds_name])
 
+
 async def run_process(args: list) -> dict:
     process = await asyncio.create_subprocess_exec(
         *args, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
yourbench_space/leaderboard_space/app.py CHANGED
@@ -7,7 +7,7 @@ import gradio as gr
 with gr.Blocks(
     title="YourBench Leaderboard",
     css="button { margin: 0 10px; padding: 5px 15px; }",
-) as demo:
+) as app:
     # DISPLAY TABLE AND ANALYSIS
     title = gr.Markdown(f"YourBench auto-Leaderboard for {TASK}")
     leaderboard = gr.DataFrame(label="Results", interactive=False)
@@ -21,6 +21,6 @@ with gr.Blocks(
 
     samples_ix.change(update_examples, samples_ix, [easy_samples, hard_samples, all_samples])
 
-    demo.load(run_pipeline, [samples_ix], [leaderboard, easy_samples, hard_samples, all_samples])
+    app.load(run_pipeline, [samples_ix], [leaderboard, easy_samples, hard_samples, all_samples])
 
-demo.launch()
+app.launch()
yourbench_space/lighteval_task/yourbench_task.py CHANGED
@@ -21,21 +21,20 @@
 # SOFTWARE.
 
 
-import logging
 import re
+import logging
 
 import numpy as np
 from aenum import extend_enum
-
+from lighteval.tasks.requests import Doc
 from lighteval.metrics.metrics import Metrics
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
 from lighteval.metrics.metrics_sample import JudgeLLM
 from lighteval.metrics.utils.metric_utils import (
-    CorpusLevelMetricGrouping,
-    MetricCategory,
     MetricUseCase,
+    MetricCategory,
+    CorpusLevelMetricGrouping,
 )
-from lighteval.tasks.lighteval_task import LightevalTaskConfig
-from lighteval.tasks.requests import Doc
 
 
 logger = logging.getLogger(__name__)
@@ -186,7 +185,6 @@ class JudgeLLMYourBench(JudgeLLM):
             max_tokens=2048,
         )
 
-
     def compute(self, sample_ids: list[str], responses: list, formatted_docs: list[Doc]) -> list[dict[str, float]]:
         # If we are evaluating a multiturn task, we need to have specific field in the formatted doc
         questions = [formatted_doc.specific["question"] for formatted_doc in formatted_docs]
@@ -202,11 +200,9 @@ class JudgeLLMYourBench(JudgeLLM):
 
         metrics = []
         for i in range(len(sample_ids)):
-            metrics.append(
-                {
-                    "accuracy": score[i],
-                }
-            )
+            metrics.append({
+                "accuracy": score[i],
+            })
 
         return metrics
 
yourbench_space/utils.py CHANGED
@@ -11,7 +11,6 @@ from loguru import logger
 
 import gradio as gr
 from datasets import load_dataset
-
 from yourbench_space import PATH
 
 
@@ -129,11 +128,13 @@ def update_dataset(stages: list, hf_org: str, hf_prefix: str, oauth_token: gr.OA
 
     return (ingestion_df, summarization_df, single_shot_df, multi_hop_df, lighteval_df)
 
+
 def should_enable_eval_tab(stages):
     logger.info(f"Stages received: {stages}")
     logger.info(f"Lighteval stage name: {STAGE_DISPLAY_MAP['lighteval']}")
     return STAGE_DISPLAY_MAP["lighteval"] in stages
 
+
 def on_generation_succsess(stages):
     stages = stages or []
     if STAGE_DISPLAY_MAP["lighteval"] in stages:
@@ -141,6 +142,7 @@ def on_generation_succsess(stages):
         return gr.update(selected=2), gr.update(interactive=True, visible=True)
     return gr.update(), gr.update(interactive=False, visible=True)
 
+
 class SubprocessManagerGroup:
     """Instanciates one manager per user (should be used as a singleton class)"""
 