File size: 2,502 Bytes
034ac91
 
 
3955552
034ac91
 
 
3955552
 
 
 
 
9368d51
034ac91
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e275da7
034ac91
 
 
7272259
 
034ac91
 
7272259
 
034ac91
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
# Page heading rendered at the top of the leaderboard UI (markdown).
TITLE = """# 🏅 DABStep Leaderboard"""

# Markdown blurb shown under the title: what the benchmark measures, links to
# the dataset, a baseline Colab notebook, the technical reports, and Discord.
# NOTE(review): the Discord invite and Colab URLs may expire — verify they
# still resolve before release.
INTRODUCTION_TEXT = """
The [Data Agent Benchmark for Multi-step Reasoning (DABStep)](https://huggingface.co/blog/dabstep) is looking to measure and push the state-of-the-art in Data Analysis by LLMs.
The benchmark is composed of ~450 data analysis questions ([Dataset Link](https://huggingface.co/datasets/adyen/data-agents-benchmark)) centered around 1 or more documents that agents will have to understand and cross reference in order to answer correctly.

We have set up a notebook to quickly get an agent baseline using the free Huggingface Inference API: [Colab Notebook](https://colab.research.google.com/drive/1pXi5ffBFNJQ5nn1111SnIfjfKCOlunxu)

Check out the official technical reports here:
[Adyen Report](https://www.adyen.com/knowledge-hub/data-agent-benchmark-for-multi-step-reasoning-dabstep)
[Hugging Face Report](https://huggingface.co/blog/dabstep)

Join the discussion on the [discord server!](https://discord.gg/D4wAymBD)
"""

# Markdown section explaining the submission format (JSON Lines with
# `task_id`, `agent_answer`, optional `reasoning_trace`) and how answers are
# scored (quasi exact match after type-dependent normalization).
SUBMISSION_TEXT = """
## Submissions
Scores are expressed as the percentage of correct answers. 

Each question calls for an answer that is either a string (one or a few words), a number, or a comma separated list of strings or floats, unless specified otherwise. There is only one correct answer. 
Hence, evaluation is done via quasi exact match between a model’s answer and the ground truth (up to some normalization that is tied to the “type” of the ground truth).


We expect submissions to be json-line files with the following format. 
Mandatory fields are: `task_id` and `agent_answer`. However, `reasoning_trace` is optional:
```
{"task_id": "task_id_1", "agent_answer": "Answer 1 from your agent", "reasoning_trace": "The different steps by which your model reached answer 1"}
{"task_id": "task_id_2", "agent_answer": "Answer 2 from your agent", "reasoning_trace": "The different steps by which your model reached answer 2"}
```

Our scoring function can be found [here](https://huggingface.co/spaces/adyen/data-agents-benchmark/blob/main/dabstep_benchmark/evaluation/scorer.py).
"""

# Label for the UI button that copies the BibTeX snippet below.
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
# BibTeX snippet users copy to cite the benchmark. Raw string so BibTeX
# braces/backslashes pass through untouched.
# Fix: BibTeX requires multiple authors to be separated by " and ", not
# commas — the previous comma-separated form was parsed as a single
# "Last, First" name list and rendered the author list incorrectly.
CITATION_BUTTON_TEXT = r"""@misc{DABstep_benchmark_2025,
      title={Data Agent Benchmark for Multi-step Reasoning (🕺DABstep)}, 
      author={Martin Iglesias and Alex Egg and Friso Kingma},
      year={2025},
      month={February},
      url={https://www.adyen.com/knowledge-hub/data-agent-benchmark-for-multi-step-reasoning-dabstep}
}"""