Minseok Bae committed
Commit f0b90cf
1 Parent(s): c3e9147

Edited README and added reproducibility functionality in main_backend.py

Files changed (2)
  1. main_backend.py +81 -49
  2. src/display/about.py +55 -4
main_backend.py CHANGED
@@ -1,3 +1,4 @@
+ import argparse
import logging
import pprint

@@ -22,55 +23,86 @@ snapshot_download(repo_id=envs.QUEUE_REPO, revision="main",
                  local_dir=envs.EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60)


- def run_auto_eval():
-     current_pending_status = [PENDING_STATUS]
-
-     manage_requests.check_completed_evals(
-         api=envs.API,
-         checked_status=RUNNING_STATUS,
-         completed_status=FINISHED_STATUS,
-         failed_status=FAILED_STATUS,
-         hf_repo=envs.QUEUE_REPO,
-         local_dir=envs.EVAL_REQUESTS_PATH_BACKEND,
-         hf_repo_results=envs.RESULTS_REPO,
-         local_dir_results=envs.EVAL_RESULTS_PATH_BACKEND
-     )
-     logging.info("Checked completed evals")
-     eval_requests = manage_requests.get_eval_requests(job_status=current_pending_status,
-                                                       hf_repo=envs.QUEUE_REPO,
-                                                       local_dir=envs.EVAL_REQUESTS_PATH_BACKEND)
-     logging.info("Got eval requests")
-     eval_requests = sort_queue.sort_models_by_priority(api=envs.API, models=eval_requests)
-     logging.info("Sorted eval requests")
-
-     print(f"Found {len(eval_requests)} {','.join(current_pending_status)} eval requests")
-
-     if len(eval_requests) == 0:
-         print("No eval requests found. Exiting.")
-         return
-
-     eval_request = eval_requests[0]
-     pp.pprint(eval_request)
-
-     manage_requests.set_eval_request(
-         api=envs.API,
-         eval_request=eval_request,
-         new_status=RUNNING_STATUS,
-         hf_repo=envs.QUEUE_REPO,
-         local_dir=envs.EVAL_REQUESTS_PATH_BACKEND
-     )
-     logging.info("Set eval request to running, now running eval")
-
-     run_eval_suite.run_evaluation(
-         eval_request=eval_request,
-         local_dir=envs.EVAL_RESULTS_PATH_BACKEND,
-         results_repo=envs.RESULTS_REPO,
-         batch_size=1,
-         device=envs.DEVICE,
-         no_cache=True,
-     )
-     logging.info("Eval finished, now setting status to finished")
+ def run_auto_eval(args):
+     if not args.reproduce:
+         current_pending_status = [PENDING_STATUS]
+
+         manage_requests.check_completed_evals(
+             api=envs.API,
+             checked_status=RUNNING_STATUS,
+             completed_status=FINISHED_STATUS,
+             failed_status=FAILED_STATUS,
+             hf_repo=envs.QUEUE_REPO,
+             local_dir=envs.EVAL_REQUESTS_PATH_BACKEND,
+             hf_repo_results=envs.RESULTS_REPO,
+             local_dir_results=envs.EVAL_RESULTS_PATH_BACKEND
+         )
+         logging.info("Checked completed evals")
+         eval_requests = manage_requests.get_eval_requests(job_status=current_pending_status,
+                                                           hf_repo=envs.QUEUE_REPO,
+                                                           local_dir=envs.EVAL_REQUESTS_PATH_BACKEND)
+         logging.info("Got eval requests")
+         eval_requests = sort_queue.sort_models_by_priority(api=envs.API, models=eval_requests)
+         logging.info("Sorted eval requests")
+
+         print(f"Found {len(eval_requests)} {','.join(current_pending_status)} eval requests")
+
+         if len(eval_requests) == 0:
+             print("No eval requests found. Exiting.")
+             return
+
+         eval_request = eval_requests[0]
+         pp.pprint(eval_request)
+
+         manage_requests.set_eval_request(
+             api=envs.API,
+             eval_request=eval_request,
+             new_status=RUNNING_STATUS,
+             hf_repo=envs.QUEUE_REPO,
+             local_dir=envs.EVAL_REQUESTS_PATH_BACKEND
+         )
+         logging.info("Set eval request to running, now running eval")
+
+         run_eval_suite.run_evaluation(
+             eval_request=eval_request,
+             local_dir=envs.EVAL_RESULTS_PATH_BACKEND,
+             results_repo=envs.RESULTS_REPO,
+             batch_size=1,
+             device=envs.DEVICE,
+             no_cache=True,
+         )
+         logging.info("Eval finished, now setting status to finished")
+     else:
+         eval_request = manage_requests.EvalRequest(
+             model=args.model,
+             status=PENDING_STATUS,
+             precision=args.precision
+         )
+         pp.pprint(eval_request)
+         logging.info("Running reproducibility eval")
+
+         run_eval_suite.run_evaluation(
+             eval_request=eval_request,
+             local_dir=envs.EVAL_RESULTS_PATH_BACKEND,
+             results_repo=envs.RESULTS_REPO,
+             batch_size=1,
+             device=envs.DEVICE,
+         )
+         logging.info("Reproducibility eval finished")
+
+
+ def main():
+     parser = argparse.ArgumentParser(description="Run auto evaluation with optional reproducibility feature")
+
+     # Optional arguments
+     parser.add_argument("--reproduce", type=bool, default=False, help="Reproduce the evaluation results")
+     parser.add_argument("--model", type=str, default=None, help="Your Model ID")
+     parser.add_argument("--precision", type=str, default="float16", help="Precision of your model")
+
+     args = parser.parse_args()
+
+     run_auto_eval(args)


if __name__ == "__main__":
-     run_auto_eval()
+     main()
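The new `--reproduce` option is declared with `type=bool`, and argparse applies `bool()` to the raw command-line string, so any non-empty value (including the string "False") parses as True; only omitting the flag keeps the default of False. A minimal sketch of that behavior, with a value-less `action="store_true"` flag shown only as a common alternative (it is not what the code above uses):

```python
# Minimal sketch: argparse's type=bool applies bool() to the raw string,
# so any non-empty value (including "False") is treated as True.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--reproduce", type=bool, default=False)

print(parser.parse_args(["--reproduce", "False"]).reproduce)  # True  (bool("False") is truthy)
print(parser.parse_args([]).reproduce)                        # False (default is kept)

# Common alternative: a flag that takes no value (not what the code above uses).
alt = argparse.ArgumentParser()
alt.add_argument("--reproduce", action="store_true")
print(alt.parse_args(["--reproduce"]).reproduce)              # True
```

In practice this means passing `--reproduce True` (or any other non-empty value) selects the reproducibility branch of `run_auto_eval`, while leaving the flag off runs the normal queue flow.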
src/display/about.py CHANGED
@@ -16,7 +16,6 @@ class Tasks(Enum):
    answer_rate = Task("answer_rate", "answer_rate", "Answer Rate")
    average_summary_length = Task("average_summary_length",
                                  "average_summary_length", "Average Summary Length")
-     # error_rate = Task("error_rate", "error_rate", "Error Rate")


# Your leaderboard name
@@ -44,12 +43,65 @@ We generate summaries for each of these documents using submitted LLMs and compu
## Understand each metric
- Hallucination Rate: The percentage of summaries that have a hallucination score below 0.5
- Factual Consistency Rate: (1 - Hallucination Rate) * 100 (%)
- - Answer Rate: The percentage of summaries that are non-empty. (This is a proxy for whether the model generates a summary at all)
+ - Answer Rate: The percentage of summaries that are non-empty. A summary is empty when the model either refuses to generate a response or throws an error (e.g. because it judges the document to contain inappropriate content).
- Average Summary Length: The average number of words in the generated summaries

+ ## Note on non-Hugging Face models
+ The H2EM leaderboard currently includes models, such as GPT variants, that are not available on the Hugging Face model hub. We ran the evaluations for these models ourselves and uploaded the results to the leaderboard.
+ If you would like to submit a model that is not available on the Hugging Face model hub, please contact us at [email protected].
+
## Reproducibility
- To reproduce our results, here is the commands you can run:
+ ### For models not available on the Hugging Face model hub:
+ The generated summaries we used to evaluate models are available [here](https://github.com/vectara/hallucination-leaderboard) in a file named "leaderboard_summaries.csv".
+ In the same GitHub repository, you can also find the prompt we used to generate the summaries under the "Prompt Used" section of the README file.
+ For models that are not available on the Hugging Face model hub, we also describe how we ran the evaluations under the "API Integration Details" section.
+
+ ### For models available on the Hugging Face model hub:
+ To reproduce the results for your model, follow the steps below:
+ - 1) Clone the repository
+ ```bash
+ git lfs install
+ git clone https://huggingface.co/spaces/vectara/leaderboard
+ ```
+ - 2) Install the requirements
+ ```bash
+ pip install -r requirements.txt
+ ```
+ - 3) Set up your Hugging Face token
+ ```bash
+ export HF_TOKEN=your_token
+ ```
+ - 4) Run the evaluation script
+ ```bash
+ python main_backend.py --reproduce True --model your_model_id --precision float16
+ ```
+ - 5) Wait for the evaluation to finish (it may take a while)

+ Once the evaluation is finished, a file named "results.json" will be generated in the folder "eval-results-bk/your_model_id".
+ The result has the following JSON format:
+ ```json
+ {
+     "config": {
+         "model_dtype": "<precision>",
+         "model_name": "<your_model_id>",
+         "model_sha": "main"
+     },
+     "results": {
+         "hallucination_rate": {
+             "hallucination_rate": <hallucination_rate>
+         },
+         "factual_consistency_rate": {
+             "factual_consistency_rate": <factual_consistency_rate>
+         },
+         "answer_rate": {
+             "answer_rate": <answer_rate>
+         },
+         "average_summary_length": {
+             "average_summary_length": <average_summary_length>
+         }
+     }
+ }
+ ```
"""

EVALUATION_QUEUE_TEXT = """
@@ -79,7 +131,6 @@ When we add extra information about models to the leaderboard, it will be automa
## In case of model failure
If your model is displayed in the `FAILED` category, its execution stopped.
Make sure you have followed the above steps first.
- If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
"""

CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
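The reproducibility section above documents the layout of the generated `results.json`. A minimal sketch for loading that file and printing each metric, assuming the path and the nesting documented above (the model ID in the path is a placeholder):

```python
# Minimal sketch, assuming the results.json layout documented above.
import json
from pathlib import Path

# Placeholder path; substitute your own model ID.
results_path = Path("eval-results-bk") / "your_model_id" / "results.json"

with results_path.open() as f:
    data = json.load(f)

# Each metric is nested under a key of the same name,
# e.g. data["results"]["hallucination_rate"]["hallucination_rate"].
for metric, values in data["results"].items():
    print(f"{metric}: {values[metric]}")
```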