n0w0f commited on
Commit
2dc7f0b
·
1 Parent(s): 158203a

feat: update about section

Browse files
.gitignore CHANGED
@@ -11,3 +11,5 @@ eval-results/
11
  eval-queue-bk/
12
  eval-results-bk/
13
  logs/
 
 
 
11
  eval-queue-bk/
12
  eval-results-bk/
13
  logs/
14
+
15
+
app.py CHANGED
@@ -1,4 +1,5 @@
1
  import json
 
2
  import time
3
  from pathlib import Path
4
 
@@ -23,6 +24,8 @@ from src.submission.check_validity import (
23
  )
24
  from src.submission.submit import update_dataset_with_scores
25
 
 
 
26
  # Global state for leaderboard data
27
  current_leaderboard_df = None
28
 
@@ -102,6 +105,7 @@ def process_submission(
102
 
103
  # Create the Gradio interface
104
  demo = gr.Blocks().queue()
 
105
 
106
  with demo:
107
  gr.HTML(TITLE)
 
1
  import json
2
+ import os
3
  import time
4
  from pathlib import Path
5
 
 
24
  )
25
  from src.submission.submit import update_dataset_with_scores
26
 
27
+ STATIC_DIR = str(Path(__file__).parent / "src" / "static")
28
+
29
  # Global state for leaderboard data
30
  current_leaderboard_df = None
31
 
 
105
 
106
  # Create the Gradio interface
107
  demo = gr.Blocks().queue()
108
+ demo.static_dir = STATIC_DIR
109
 
110
  with demo:
111
  gr.HTML(TITLE)
src/about.py CHANGED
@@ -14,51 +14,71 @@ NUM_FEWSHOT = 0 # Change with your few shot
14
 
15
 
16
  # Your leaderboard name
17
- TITLE = """<h1 align="center">🧪 ChemBench Leaderboard</h1>"""
18
 
19
- TITLE_MARKDOWN_DESCRIPTION = """Welcome to ChemBench, a comprehensive benchmark for evaluating language models on Chemical Reasoning, Knowledge and Intuition."""
20
 
21
  ABOUT_TEXT = """
22
- # Welcome to *ChemBench* 🧪
23
- *ChemBench* is a comprehensive toolkit for benchmarking language and multimodal models in chemistry tasks. *ChemBench* is designed to be modular and extensible, allowing users to easily add new datasets, models, and evaluation metrics.
24
- ## Key Features 🔑
25
- - **Standardized Benchmarking**: Built on the proven Big Bench format
26
- - **Universal Model Support**: Seamless integration with LiteLLM backend
27
- - **Scientifically sound parsing and evaluation**: *ChemBench* goes beyond simple MCQ evaluation, supports floating point answers (also in scientific notation). We also support different extraction techniques, including LLM-based extraction.
28
- - **Extensible**: *ChemBench* is designed to be easily extensible. Prompts, models, extraction, and evaluation methods can be easily added and customized via a pipeline system.
29
- - **Easy to use**: *ChemBench* is designed to be easy to use. We provide a simple interface you can use to get started testing your models with just a few lines of code.
30
- - **Caching and parallelization**: *ChemBench* can cache results and can parallelize inference to speed up benchmarking.
31
- - **Refusal detection**: *ChemBench* can detect when a model refuses to answer a question, handle it, and report it in the final results.
32
- ## Getting Started 🚀
33
- Start benchmarking your models with just a few lines of code. Here's how you can get started:
 
 
34
  ```python
35
- from chembench.evaluate import ChemBenchmark
36
- from chembench.prompter import PrompterBuilder
37
- from chembench.utilis import enable_logging
 
 
 
38
  from dotenv import load_dotenv
39
- load_dotenv(".env")
 
 
 
 
40
  enable_logging()
41
- benchmark = ChemBenchmark.from_huggingface()
42
- model = "openai/gpt-4"
 
43
  prompter = PrompterBuilder.from_model_object(
44
- model=model,
45
  )
46
- results = benchmark.bench(
47
- prompter,
48
- )
49
- benchmark.submit(results)
 
 
 
 
50
  ```
51
- For more details, check out our [documentation](https://lamalab-org.github.io/chembench/getting_started/)
 
 
 
52
  ## Scientific Foundation 📚
53
- *ChemBench* has been validated through peer review and extensive testing. For detailed methodology, check our papers:
 
54
  - [*ChemBench* Paper](https://arxiv.org/abs/2404.01475)
55
- - [Multimodal Paper](https://arxiv.org/pdf/2411.16955)
 
56
  Ready to elevate your chemistry AI research? 🧬
57
- *ChemBench* is open-source software licensed under the MIT license.
 
58
  Its development is led by the [Lab for AI for Materials (LamaLab)](https://jablonkagroup.uni-jena.de/) at the [University of Jena](https://www.uni-jena.de/) and the [Helmholtz Institute for Polymers in Energy Applications Jena](https://www.hipole-jena.de/).
59
  ## Connect With Us 🔗
60
  <div class="social-links">
61
- <a href="https://github.com/lamalab-org/chembench.git" target="_blank">
62
  <img src="https://raw.githubusercontent.com/FortAwesome/Font-Awesome/master/svgs/brands/github.svg" width="20" height="20" alt="GitHub"/> GitHub
63
  </a>
64
  <a href="https://twitter.com/jablonkagroup" target="_blank">
@@ -93,10 +113,13 @@ Its development is led by the [Lab for AI for Materials (LamaLab)](https://jablo
93
 
94
 
95
  CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
96
- CITATION_BUTTON_TEXT = r"""@article{mirza2024large,
97
- title = {Are large language models superhuman chemists?},
98
- author = {Adrian Mirza and Nawaf Alampara and Sreekanth Kunchapu and Benedict Emoekabu and Aswanth Krishnan and Mara Wilhelmi and Macjonathan Okereke and Juliane Eberhardt and Amir Mohammad Elahi and Maximilian Greiner and Caroline T. Holick and Tanya Gupta and Mehrdad Asgari and Christina Glaubitz and Lea C. Klepsch and Yannik Köster and Jakob Meyer and Santiago Miret and Tim Hoffmann and Fabian Alexander Kreth and Michael Ringleb and Nicole Roesner and Ulrich S. Schubert and Leanne M. Stafast and Dinga Wonanke and Michael Pieler and Philippe Schwaller and Kevin Maik Jablonka},
99
- year = {2024},
100
- journal = {arXiv preprint arXiv: 2404.01475}
 
 
 
101
  }
102
  """
 
14
 
15
 
16
  # Your leaderboard name
17
+ TITLE = """<h1 align="center">👩‍🔬 MaCBench Leaderboard </h1>"""
18
 
19
+ TITLE_MARKDOWN_DESCRIPTION = """Can VLMs assist in scientific discovery?"""
20
 
21
  ABOUT_TEXT = """
22
+ <h1 align="center">
23
+ Can VLMs assist in scientific discovery?
24
+ </h1>
25
+
26
+ *MaCBench* is a benchmark for multimodal language models in chemistry and materials research. We manually curated a benchmark of tasks across the entire scientific lifecycle from data extraction, experiment execution to data analysis.
27
+
28
+ This repository contains the reports and is used to collect new contributions.
29
+ The benchmark dataset itself is hosted on [HuggingFace](https://huggingface.co/datasets/jablonkagroup/MaCBench) and can be run using the [*ChemBench* benchmark engine](https://github.com/lamalab-org/chembench). You can find more details on how to do so in the [*ChemBench* documentation](https://lamalab-org.github.io/chembench/).
30
+
31
+
32
+ ## Running *MaCBench* Evaluation 🚀
33
+
34
+ *MaCBench* can be run using the [*ChemBench*](https://github.com/lamalab-org/chembench) pipeline. For that, the only thing needed is to create the running script. The following script is an example of how to run the benchmark using the `gpt-4o` model.
35
+
36
  ```python
37
+ from chembench.evaluate import ChemBenchmark, save_topic_reports
38
+ from chembench.prompter import PrompterBuilder, PrompterPipeline
39
+ from chembench.utils import (
40
+ enable_caching,
41
+ enable_logging,
42
+ )
43
  from dotenv import load_dotenv
44
+
45
+ # Load environment variables
46
+ load_dotenv("../../.env", override=True)
47
+
48
+ # Enable logging and caching
49
  enable_logging()
50
+ enable_caching()
51
+
52
+ # Define the prompter object specifying the model and "multimodal_instruction" prompt type
53
  prompter = PrompterBuilder.from_model_object(
54
+ "openai/gpt-4o-2024-08-06", prompt_type="multimodal_instruction"
55
  )
56
+
57
+ # Load benchmark from HuggingFace
58
+ benchmark = ChemBenchmark.from_huggingface("jablonkagroup/MaCBench")
59
+
60
+ # Run the benchmark with batch size equal to 1
61
+ results = benchmark.bench(prompter, batch_size=1, model_kwargs={"temperature": 0})
62
+
63
+ benchmark.submit(results) # submit results to leaderboard
64
  ```
65
+
66
+
67
+ For more details, check out our [documentation](https://lamalab-org.github.io/chembench/getting_started/#how-to-benchmark-on-multi-modal-tasks)
68
+
69
  ## Scientific Foundation 📚
70
+ For detailed methodology, check our papers:
71
+ - [*MaCBench* Paper](https://arxiv.org/pdf/2411.16955)
72
  - [*ChemBench* Paper](https://arxiv.org/abs/2404.01475)
73
+
74
+
75
  Ready to elevate your chemistry AI research? 🧬
76
+
77
+ *MaCBench* is open-source software licensed under the MIT license.
78
  Its development is led by the [Lab for AI for Materials (LamaLab)](https://jablonkagroup.uni-jena.de/) at the [University of Jena](https://www.uni-jena.de/) and the [Helmholtz Institute for Polymers in Energy Applications Jena](https://www.hipole-jena.de/).
79
  ## Connect With Us 🔗
80
  <div class="social-links">
81
+ <a href="https://github.com/lamalab-org/macbench.git" target="_blank">
82
  <img src="https://raw.githubusercontent.com/FortAwesome/Font-Awesome/master/svgs/brands/github.svg" width="20" height="20" alt="GitHub"/> GitHub
83
  </a>
84
  <a href="https://twitter.com/jablonkagroup" target="_blank">
 
113
 
114
 
115
  CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
116
+ CITATION_BUTTON_TEXT = r"""@misc{alampara2024probinglimitationsmultimodallanguage,
117
+ title={Probing the limitations of multimodal language models for chemistry and materials research},
118
+ author={Nawaf Alampara and Mara Schilling-Wilhelmi and Martiño Ríos-García and Indrajeet Mandal and Pranav Khetarpal and Hargun Singh Grover and N. M. Anoop Krishnan and Kevin Maik Jablonka},
119
+ year={2024},
120
+ eprint={2411.16955},
121
+ archivePrefix={arXiv},
122
+ primaryClass={cs.LG},
123
+ url={https://arxiv.org/abs/2411.16955},
124
  }
125
  """
src/static/MacBench_logo.png ADDED
src/static/MacBench_logo_black_wbg.png ADDED
src/static/MacBench_logo_white_wbg.png ADDED
src/static/MacBench_white.svg ADDED