Update app.py
app.py
CHANGED
@@ -11,14 +11,6 @@ data = {
         "LLaVa-v1.6-Mistral (7B)",
         "Gemini 1.5 Flash (MLLM)", "Gemini 2.0 Flash (MLLM)", "Gemini 1.5 Pro (MLLM)", "Gemini 2.0 Pro (MLLM)", "GPT-4o (MLLM)"
     ],
-    "Type": [
-        "LLM", "LLM", "LLM", "LLM", "LLM",
-        "LLM", "LLM", "LLM", "LLM",
-        "LLM", "LLM", "LLM", "LLM",
-        "LLM", "LLM", "LLM",
-        "MLLM",
-        "MLLM", "MLLM", "MLLM", "MLLM", "MLLM"
-    ],
     "T (Full)": [63.89, 64.91, 57.80, 63.37, 62.83, 49.94, 57.60, 56.06, 65.65, 56.94, 65.32, 56.90, 58.36, 62.78, 60.75, 57.43, None, 64.91, 64.79, 66.22, 66.42, 64.95],
     "TA (Full)": [45.81, 47.10, 40.69, 45.51, 45.82, 34.78, 39.86, 41.01, 47.42, 40.66, 43.38, 38.20, 37.47, 43.25, 44.59, 37.64, None, 45.06, 40.07, 46.90, 46.17, 44.53],
     "TAC (Full)": [21.29, 23.65, 19.08, 19.29, 22.67, 14.13, 16.49, 19.54, 19.65, 19.75, 22.54, 18.97, 19.20, 21.26, 18.63, 15.35, None, 20.66, 19.74, 23.23, 18.25, 19.60],
@@ -27,70 +19,16 @@ data = {
     "TAC (Segmented)": [21.36, 28.17, 21.81, 23.96, 25.38, 19.83, 19.56, 20.76, 22.77, 21.54, 24.60, 21.31, 21.43, 22.87, 24.20, 24.50, 3.30, 23.27, 23.52, 24.97, 25.21, 27.86]
 }
 
+# Create DataFrame
 df = pd.DataFrame(data)
 
-def filter_leaderboard(view, model_type):
-    columns = ["Model", "Type"]
-    if view == "Segmented":
-        columns += ["T (Segmented)", "TA (Segmented)", "TAC (Segmented)"]
-    else:
-        columns += ["T (Full)", "TA (Full)", "TAC (Full)"]
-
-    if model_type == "All":
-        return df[columns].sort_values(by=columns[-1], ascending=False)
-    else:
-        return df[df["Type"] == model_type][columns].sort_values(by=columns[-1], ascending=False)
+def display_leaderboard():
+    return df
 
+# Create a simple Gradio interface
 with gr.Blocks(title="VideoConviction LLM Leaderboard") as demo:
-    gr.Markdown("# VideoConviction Benchmark Leaderboard")
-    gr.Markdown("""
-
-
----
-
-### About the Benchmark
-**VideoConviction** is a multimodal benchmark designed to evaluate **human conviction** and **stock market recommendation quality** from video content. It integrates vision, audio, and text to evaluate reasoning and persuasion detection in financial decision-making.
-
-**Paper**: [VideoConviction: A Multimodal Benchmark for Human Conviction and Stock Market Recommendations](https://doi.org/10.1145/3711896.3737417)
-**Conference**: ACM SIGKDD 2025
-**Authors**: Michael Galarnyk, Veer Kejriwal, Agam Shah, Yash Bhardwaj, Nicholas Watney Meyer, Anand Krishnan, Sudheer Chava
-
----
-
-### Citation
-```bibtex
-@inproceedings{galarnyk2025videoconviction,
-  author = {Michael Galarnyk and Veer Kejriwal and Agam Shah and Yash Bhardwaj and Nicholas Watney Meyer and Anand Krishnan and Sudheer Chava},
-  title = {VideoConviction: A Multimodal Benchmark for Human Conviction and Stock Market Recommendations},
-  booktitle = {Proceedings of the 31st ACM SIGKDD Conference on Knowledge Discovery and Data Mining V.2 (KDD '25)},
-  year = {2025},
-  location = {Toronto, ON, Canada},
-  pages = {12},
-  publisher = {ACM},
-  doi = {10.1145/3711896.3737417}
-}
-```""")
-
-    with gr.Row():
-        with gr.Column():
-            view_dropdown = gr.Dropdown(
-                label="Select View Type", choices=["Segmented", "Full"], value="Segmented"
-            )
-        with gr.Column():
-            model_dropdown = gr.Dropdown(
-                label="Filter by Model Type", choices=["All", "LLM", "MLLM"], value="All"
-            )
-
-    leaderboard = gr.Dataframe(
-        value=filter_leaderboard("Segmented", "All"),
-        label="Leaderboard",
-        interactive=False
-    )
-
-    def update_leaderboard(view, model_type):
-        return filter_leaderboard(view, model_type)
-
-    view_dropdown.change(fn=update_leaderboard, inputs=[view_dropdown, model_dropdown], outputs=leaderboard)
-    model_dropdown.change(fn=update_leaderboard, inputs=[view_dropdown, model_dropdown], outputs=leaderboard)
+    gr.Markdown("# VideoConviction Benchmark Leaderboard")
+    gr.Markdown("This leaderboard shows the F1 scores of various LLMs and MLLMs across the VideoConviction benchmark tasks.")
+    gr.Dataframe(display_leaderboard, interactive=False)
 
 demo.launch()
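
For reference, below is a minimal, self-contained sketch of the simplified app this change leaves behind. The `data` dict is abbreviated to two placeholder rows because the full model list is truncated in the diff (the names and scores shown are illustrative, not leaderboard values); the Gradio wiring mirrors the code above, and it assumes only `gradio` and `pandas` are installed. Passing the `display_leaderboard` callable to `gr.Dataframe` lets Gradio call it when the app loads to populate the table.

```python
import gradio as gr
import pandas as pd

# Placeholder data: two illustrative rows, not actual VideoConviction scores.
data = {
    "Model": ["Example LLM (7B)", "Example MLLM"],
    "T (Full)": [60.0, 65.0],
    "TA (Full)": [42.0, 45.0],
    "TAC (Full)": [20.0, 22.0],
    "T (Segmented)": [61.0, 66.0],
    "TA (Segmented)": [43.0, 46.0],
    "TAC (Segmented)": [23.0, 26.0],
}

# Create DataFrame
df = pd.DataFrame(data)

def display_leaderboard():
    # Returning the DataFrame from a callable lets Gradio re-evaluate it on app load.
    return df

# Create a simple Gradio interface
with gr.Blocks(title="VideoConviction LLM Leaderboard") as demo:
    gr.Markdown("# VideoConviction Benchmark Leaderboard")
    gr.Markdown("This leaderboard shows the F1 scores of various LLMs and MLLMs across the VideoConviction benchmark tasks.")
    gr.Dataframe(display_leaderboard, interactive=False)

demo.launch()
```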