File size: 3,352 Bytes
ec9edcd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84520b3
 
 
 
 
 
 
 
 
 
 
 
 
 
ec9edcd
 
 
 
 
5fa3b50
ec9edcd
84520b3
 
ec9edcd
 
 
 
 
 
 
84520b3
 
ec9edcd
 
 
 
 
 
ceb41f3
 
ec9edcd
 
 
84520b3
 
 
 
 
 
 
 
 
 
ec9edcd
fa7fc18
ec9edcd
5fa3b50
 
 
ec9edcd
 
 
84520b3
 
 
ec9edcd
 
 
 
84520b3
5fa3b50
 
 
 
 
ec9edcd
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
import gradio as gr
import pandas as pd
import json

from evaluate import get_solution_code


# Load the raw evaluation results (one row per model/day run) and the
# reference answers used to score them.
# NOTE(review): no day filtering is applied here; every row in results.csv is
# scored, so the set of evaluated days is whatever the CSV contains.
df = pd.read_csv("results.csv")

# JSON text is UTF-8; pass the encoding explicitly instead of relying on the
# platform's default locale encoding.
with open("solutions.json", encoding="utf-8") as f:
    solutions = json.load(f)

def score_submissions(row):
    """Score one results row against the reference answers for its day.

    The answer pair for ``row["day"]`` is looked up in the module-level
    ``solutions`` mapping (keyed by the day converted to a string). Each part
    counts as solved when its answer occurs as a substring of
    ``row["result"]``.

    Args:
        row: Mapping-like object (for example a DataFrame row) exposing the
            ``"result"`` and ``"day"`` keys.

    Returns:
        A two-element list ``[part_1_solved, part_2_solved]`` of booleans.

    Raises:
        KeyError: If ``solutions`` has no entry for the row's day.
    """
    result = row["result"]

    # The substring test below needs a string. A blank CSV cell is read by
    # pandas as NaN and a purely numeric column as numbers; either would make
    # `in` raise TypeError. Treat a missing output as empty (nothing solved)
    # and stringify anything else.
    if not isinstance(result, str):
        result = "" if pd.isna(result) else str(result)

    solution = solutions[str(row["day"])]

    # Parts are scored independently: part 2 can count even if part 1 failed.
    return [solution[0] in result, solution[1] in result]


# Score every run; each cell of "scores" holds the [part 1, part 2] list
# returned by score_submissions.
df["scores"] = df.apply(score_submissions, axis=1)

# Attach the source code produced by each model for each day, plus a copy
# wrapped in a fenced python block so it can be rendered as markdown.
df["code"] = df.apply(
    lambda row: get_solution_code(day=row["day"], model=row["model"]),
    axis=1,
)
df["code_md"] = df["code"].map(lambda src: "```python\n" + src + "\n```")

# Display runtime: the first six characters of the string form of total_time.
df["Runtime (s)"] = df["total_time"].map(lambda elapsed: str(elapsed)[:6])

# Split the score pair into one column per puzzle part.
df["part_1"] = df["scores"].map(lambda pair: pair[0])
df["part_2"] = df["scores"].map(lambda pair: pair[1])


# Per-model star totals, keyed by model name in order of first appearance.
star_summary = {}
for model_name in df["model"].unique():
    model_rows = df.loc[df["model"] == model_name]

    # Summing the per-part score columns counts the solved parts.
    part_1_stars = model_rows["part_1"].sum()
    part_2_stars = model_rows["part_2"].sum()

    star_summary[model_name] = {
        "Model": model_name,
        "Total Stars ⭐️": part_1_stars + part_2_stars,
        "Part 1 ⭐️": part_1_stars,
        "Part 2 ⭐️": part_2_stars,
    }


def score_to_string(s):
    """Return the display symbol for a score.

    Any truthy value maps to a star; any falsy value maps to a cross.
    """
    if s:
        return "⭐️"
    return "❌"


# Build the leaderboard UI: an intro line followed by four tabs (star totals,
# per-day results, raw program outputs, generated code).
with gr.Blocks() as demo:
    md = gr.Markdown("Performance of LLMs (and my own) on AoC 2024. For more info, read this [blog post](https://www.jerpint.io/blog/advent-of-code-llms/).")

    with gr.Tab("Stars"):
        star_df = pd.DataFrame.from_dict(star_summary, orient="index")

        # Two tables: overall ranking first, then the per-part breakdown.
        # Each component gets its own name so neither handle is overwritten.
        gr_total_stars_df = gr.DataFrame(
            star_df[["Model", "Total Stars ⭐️"]].sort_values(by="Total Stars ⭐️", ascending=False)
        )
        gr_star_df = gr.DataFrame(
            star_df[["Model", "Part 1 ⭐️", "Part 2 ⭐️"]].sort_values(by="Part 1 ⭐️", ascending=False)
        )

    with gr.Tab("Daily"):
        # Parse the info to something more readable.
        # Take an explicit copy: adding columns to a bare column-selection of
        # `df` triggers pandas' SettingWithCopyWarning.
        df_daily = df[["model", "day", "part_1", "part_2", "Runtime (s)"]].copy()
        df_daily["Part 1"] = df_daily["part_1"].apply(score_to_string)
        df_daily["Part 2"] = df_daily["part_2"].apply(score_to_string)
        df_daily = df_daily[["model", "day", "Part 1", "Part 2", "Runtime (s)"]]

        gr_df_daily = gr.DataFrame(df_daily.sort_values(by="day"))

    with gr.Tab("Outputs"):
        outputs_df = df[["model", "day", "Runtime (s)", "result"]].rename(columns={"result": "Output"})
        # One datatype entry per column (the table has four columns).
        gr_outputs_df = gr.DataFrame(outputs_df, datatype=["str", "str", "str", "str"])

    with gr.Tab("Code"):
        code_df = df[["model", "day", "code_md"]].rename(columns={"code_md": "Code"})
        # The last column holds fenced code blocks, so render it as markdown.
        gr_code_df = gr.DataFrame(code_df, datatype=["str", "str", "markdown"])

demo.launch()