Fix enumeration in yourbench_task.py
yourbench_space/lighteval_task/yourbench_task.py
CHANGED
@@ -56,10 +56,10 @@ JUDGE_ANSWER_SYSTEM_PROMPT = """You will be provided with the summary of a docum
 4. **Ground Truth Answer Understanding**:
    - Understand the provided ground truth answer, identifying its key points.

-
+5. **Answer Understanding**:
    - Examine the Model Answer, identifying key points and assessing accuracy and factuality.

-
+6. **Final Answer**:
    - 0 or 1 (0 if the model answer is incorrect, 1 if it is correct).

 # Output Format
@@ -151,7 +151,7 @@ def get_judge_prompt(question: str, answer: str, gold: str, **kwargs):
     chunk = kwargs.get("chunks", "")
     summary = kwargs.get("documents", "")

-
+    prompt = [
         {"role": "system", "content": JUDGE_ANSWER_SYSTEM_PROMPT},
         {
             "role": "user",
@@ -161,6 +161,8 @@ def get_judge_prompt(question: str, answer: str, gold: str, **kwargs):
         },
     ]

+    return prompt
+

 def process_judge_response_yourbench(response):
     # extract the final answer using regex from the response xml
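Taken together, the first three hunks renumber the later steps of the judge system prompt so they continue 4, 5, 6, and make `get_judge_prompt` bind the message list to a `prompt` variable and return it explicitly. The text of the removed lines is cut off in the rendered diff, and the body of the user message (lines 158-160) falls outside the hunks, so the sketch below fills those parts with clearly marked placeholders; it is a reconstruction for orientation, not the file's exact code.

```python
# Sketch only: reconstructed from the hunks above, not copied from the file.
JUDGE_ANSWER_SYSTEM_PROMPT = "..."  # stub; the real system prompt is the long template defined earlier in the file


def get_judge_prompt(question: str, answer: str, gold: str, **kwargs):
    chunk = kwargs.get("chunks", "")
    summary = kwargs.get("documents", "")

    prompt = [
        {"role": "system", "content": JUDGE_ANSWER_SYSTEM_PROMPT},
        {
            "role": "user",
            # Placeholder: the real user template (not shown in the diff) interpolates
            # the document summary, chunk, question, gold answer and model answer.
            "content": f"Summary: {summary}\nChunk: {chunk}\nQuestion: {question}\n"
            f"Gold answer: {gold}\nModel answer: {answer}",
        },
    ]

    return prompt
```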
@@ -175,13 +177,16 @@ def process_judge_response_yourbench(response):
 class JudgeLLMYourBench(JudgeLLM):
     def __init__(self):
         super().__init__(
-            judge_model_name="
+            judge_model_name="Qwen/QwQ-32B",
             template=get_judge_prompt,
             process_judge_response=process_judge_response_yourbench,
-            judge_backend="
+            judge_backend="hf-inference",
             short_judge_name="yourbench_judge",
+            hf_provider="novita",
+            max_tokens=2048,
         )

+
     def compute(self, sample_ids: list[str], responses: list, formatted_docs: list[Doc]) -> list[dict[str, float]]:
         # If we are evaluating a multiturn task, we need to have specific field in the formatted doc
         questions = [formatted_doc.specific["question"] for formatted_doc in formatted_docs]