Clémentine committed
Commit 0c0a603
1 Parent(s): 412f8e5

change to lighteval's latest

README.md CHANGED
@@ -4,7 +4,7 @@ emoji: 🥇
  colorFrom: green
  colorTo: indigo
  sdk: gradio
- sdk_version: 4.26.0
+ sdk_version: 5.1.0
  app_file: app.py
  pinned: true
  license: apache-2.0
requirements.txt CHANGED
@@ -9,7 +9,7 @@ accelerate>=0.26.0
  sentencepiece

  # Evaluation suites
- lighteval
+ lighteval>=0.5.0
  lm_eval==0.4.3

  # Log Visualizer
src/backend/run_eval_suite_lighteval.py CHANGED
@@ -3,7 +3,12 @@ import argparse
  import logging
  from datetime import datetime

- from lighteval.main_accelerate import main, EnvConfig, create_model_config, load_model
+ import lighteval
+ from lighteval.logging.evaluation_tracker import EvaluationTracker
+ from lighteval.models.model_config import InferenceEndpointModelConfig
+ from lighteval.pipeline import ParallelismManager, Pipeline, PipelineParameters
+
+ from lighteval.main_accelerate import main, EnvConfig, create_model_config

  from src.envs import RESULTS_REPO, CACHE_PATH, TOKEN
  from src.backend.manage_requests import EvalRequest
@@ -32,57 +37,55 @@ def run_evaluation(eval_request: EvalRequest, task_names: str, batch_size: int,
      if limit:
          logger.info("WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.")

-     args_dict = {
-         # Endpoint parameters
-         "endpoint_model_name": eval_request.model,
-         "accelerator": accelerator,
-         "vendor": vendor,
-         "region": region,
-         "instance_size": instance_size,
-         "instance_type": instance_type,
-         "reuse_existing": False,
-         "model_dtype": eval_request.precision,
-         "revision": eval_request.revision,
-         # Save parameters
-         "push_results_to_hub": True,
-         "save_details": True,
-         "push_details_to_hub": True,
-         "public_run": False,
-         "cache_dir": CACHE_PATH,
-         "results_org": RESULTS_REPO,
-         "output_dir": local_dir,
-         "job_id": str(datetime.now()),
-         # Experiment parameters
-         "override_batch_size": batch_size,
-         "custom_tasks": "custom_tasks.py",
-         "tasks": task_names,
-         "max_samples": limit,
-         "use_chat_template": False,
-         "system_prompt": None,
-         # Parameters which would be set to things by the kwargs if actually using argparse
-         "inference_server_address": None,
-         "model_args": None,
-         "num_fewshot_seeds": None,
-         "delta_weights": False,
-         "adapter_weights": False
-     }
-     args = argparse.Namespace(**args_dict)
+     evaluation_tracker = EvaluationTracker(
+         output_dir="./results",
+         save_details=True,
+         push_to_hub=True,
+         push_to_tensorboard=False,
+         hub_results_org=RESULTS_REPO,
+         public=False,
+     )

-     try:
-         results = main(args)
+     pipeline_params = PipelineParameters(
+         launcher_type=ParallelismManager.ACCELERATE,
+         override_batch_size=batch_size,
+         max_samples=limit,
+         use_chat_template=False,
+         system_prompt=None,
+         custom_tasks_directory="custom_tasks.py",  # if using a custom task
+     )
+
+     model_config = InferenceEndpointModelConfig(
+         # Endpoint parameters
+         name=eval_request.model.replace(".", "-").lower(),
+         repository=eval_request.model,
+         accelerator=accelerator,
+         vendor=vendor,
+         region=region,
+         instance_size=instance_size,
+         instance_type=instance_type,
+         should_reuse_existing=False,
+         model_dtype=eval_request.precision,
+         revision=eval_request.revision,
+     )
+
+     pipeline = Pipeline(
+         tasks=task_names,
+         pipeline_parameters=pipeline_params,
+         evaluation_tracker=evaluation_tracker,
+         model_config=model_config,
+     )

-         results["config"]["model_dtype"] = eval_request.precision
-         results["config"]["model_name"] = eval_request.model
-         results["config"]["model_sha"] = eval_request.revision
+     try:
+         pipeline.evaluate()
+         pipeline.show_results()
+         pipeline.save_and_push_results()
+         results = pipeline.get_results()

          dumped = json.dumps(results, indent=2)
          logger.info(dumped)
-     except Exception as e:  # if eval failed, we force a cleanup
-         env_config = EnvConfig(token=TOKEN, cache_dir=args.cache_dir)
-
-         model_config = create_model_config(args=args, accelerator=accelerator)
-         model, _ = load_model(config=model_config, env_config=env_config)
-         model.cleanup()

+     except Exception as e:  # if eval failed, we force a cleanup
+         pipeline.model.cleanup()

      return results
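
For readers migrating their own backends, the sketch below strings the same lighteval >= 0.5.0 calls from this diff into a standalone script. It uses only the classes, arguments, and methods that appear above; the model name, task string, endpoint sizing, and results organisation are illustrative placeholders, not this Space's real configuration.

# Minimal standalone sketch of the Pipeline-based flow introduced above.
# All concrete values below (model, task, endpoint sizing, org) are placeholders.
import json

from lighteval.logging.evaluation_tracker import EvaluationTracker
from lighteval.models.model_config import InferenceEndpointModelConfig
from lighteval.pipeline import ParallelismManager, Pipeline, PipelineParameters

evaluation_tracker = EvaluationTracker(
    output_dir="./results",            # local directory for run artifacts
    save_details=True,                 # keep per-sample details
    push_to_hub=True,                  # upload results to the Hub
    push_to_tensorboard=False,
    hub_results_org="my-results-org",  # placeholder results organisation
    public=False,
)

pipeline_params = PipelineParameters(
    launcher_type=ParallelismManager.ACCELERATE,
    override_batch_size=1,
    max_samples=10,                    # small cap, for testing only
    use_chat_template=False,
    system_prompt=None,
    custom_tasks_directory="custom_tasks.py",  # only if custom tasks are defined
)

model_config = InferenceEndpointModelConfig(
    name="my-model",                   # endpoint name: lowercase, no dots
    repository="my-org/my-model",      # placeholder Hub repository
    accelerator="gpu",
    vendor="aws",
    region="us-east-1",
    instance_size="x1",
    instance_type="nvidia-a10g",
    should_reuse_existing=False,
    model_dtype="float16",
    revision="main",
)

pipeline = Pipeline(
    tasks="leaderboard|arc:challenge|25|0",  # placeholder task string
    pipeline_parameters=pipeline_params,
    evaluation_tracker=evaluation_tracker,
    model_config=model_config,
)

try:
    pipeline.evaluate()
    pipeline.show_results()
    pipeline.save_and_push_results()
    results = pipeline.get_results()
    print(json.dumps(results, indent=2))
except Exception:
    # On failure, tear down the inference endpoint, as the backend above does.
    pipeline.model.cleanup()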