alozowski (HF Staff) committed
Commit baf4cf2 · 1 Parent(s): 18a3d4c

Fix enumeration in yourbench_task.py

yourbench_space/lighteval_task/yourbench_task.py CHANGED
@@ -56,10 +56,10 @@ JUDGE_ANSWER_SYSTEM_PROMPT = """You will be provided with the summary of a docum
 4. **Ground Truth Answer Understanding**:
    - Understand the provided ground truth answer, identifying its key points.
 
-6. **Answer Understanding**:
+5. **Answer Understanding**:
    - Examine the Model Answer, identifying key points and assessing accuracy and factuality.
 
-7. **Final Answer**:
+6. **Final Answer**:
    - 0 or 1 (0 if the model answer is incorrect, 1 if it is correct).
 
 # Output Format
@@ -151,7 +151,7 @@ def get_judge_prompt(question: str, answer: str, gold: str, **kwargs):
     chunk = kwargs.get("chunks", "")
     summary = kwargs.get("documents", "")
 
-    return [
+    prompt = [
         {"role": "system", "content": JUDGE_ANSWER_SYSTEM_PROMPT},
         {
             "role": "user",
@@ -161,6 +161,8 @@ def get_judge_prompt(question: str, answer: str, gold: str, **kwargs):
         },
     ]
 
+    return prompt
+
 
 def process_judge_response_yourbench(response):
     # extract the final answer using regex from the response xml
@@ -175,13 +177,16 @@ def process_judge_response_yourbench(response):
 class JudgeLLMYourBench(JudgeLLM):
     def __init__(self):
         super().__init__(
-            judge_model_name="gpt-4o-2024-08-06",
+            judge_model_name="Qwen/QwQ-32B",
             template=get_judge_prompt,
             process_judge_response=process_judge_response_yourbench,
-            judge_backend="openai",
+            judge_backend="hf-inference",
             short_judge_name="yourbench_judge",
+            hf_provider="novita",
+            max_tokens=2048,
         )
 
+
     def compute(self, sample_ids: list[str], responses: list, formatted_docs: list[Doc]) -> list[dict[str, float]]:
         # If we are evaluating a multiturn task, we need to have specific field in the formatted doc
         questions = [formatted_doc.specific["question"] for formatted_doc in formatted_docs]
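
For readability, the net effect of the get_judge_prompt change is sketched below, assembled from the two hunks above. The body of the user message falls between those hunks (old lines 158–160) and is not shown in the diff, so a hypothetical user_content placeholder stands in for whatever the file actually builds from question, answer, gold, chunk, and summary.

def get_judge_prompt(question: str, answer: str, gold: str, **kwargs):
    chunk = kwargs.get("chunks", "")
    summary = kwargs.get("documents", "")

    # Hypothetical stand-in: the real user-message construction (and its key
    # names) is elided from the hunks shown in this commit.
    user_content = ...

    # The chat-format list is now bound to a local name and returned
    # explicitly, instead of being returned directly as a literal.
    prompt = [
        {"role": "system", "content": JUDGE_ANSWER_SYSTEM_PROMPT},
        {
            "role": "user",
            "content": user_content,
        },
    ]

    return prompt

Functionally this is equivalent to the previous return [...]; the commit only introduces the prompt variable alongside the enumeration fix in the system prompt.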
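
The other substantive change is the judge configuration. The sketch below shows the updated constructor as it reads after this commit; only the changed call is reproduced, and JudgeLLM, get_judge_prompt, and process_judge_response_yourbench are already defined or imported elsewhere in yourbench_task.py.

class JudgeLLMYourBench(JudgeLLM):
    def __init__(self):
        # The judge moves from OpenAI's gpt-4o-2024-08-06 (judge_backend="openai")
        # to Qwen/QwQ-32B served through the Hugging Face inference backend,
        # routed via the "novita" provider, with generation capped at 2048 tokens.
        super().__init__(
            judge_model_name="Qwen/QwQ-32B",
            template=get_judge_prompt,
            process_judge_response=process_judge_response_yourbench,
            judge_backend="hf-inference",
            short_judge_name="yourbench_judge",
            hf_provider="novita",
            max_tokens=2048,
        )

Since QwQ-32B tends to emit long reasoning traces, the explicit max_tokens bound presumably keeps judge generations manageable; process_judge_response_yourbench still extracts the final 0/1 verdict from the judge's response afterwards.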