Spaces:

jerpint
/

advent24-llm

Running

File size: 3,352 Bytes

ec9edcd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84520b3
 
 
 
 
 
 
 
 
 
 
 
 
 
ec9edcd
 
 
 
 
5fa3b50
ec9edcd
84520b3
 
ec9edcd
 
 
 
 
 
 
84520b3
 
ec9edcd
 
 
 
 
 
ceb41f3
 
ec9edcd
 
 
84520b3
 
 
 
 
 
 
 
 
 
ec9edcd
fa7fc18
ec9edcd
5fa3b50
 
 
ec9edcd
 
 
84520b3
 
 
ec9edcd
 
 
 
84520b3
5fa3b50
 
 
 
 
ec9edcd

import gradio as gr
import pandas as pd
import json

from evaluate import get_solution_code


# Load the recorded runs and the reference answers.
# NOTE: the original intent was to evaluate only the first 9 days, but no day
# filter is applied here -- every row present in results.csv is used.
# Columns read later in this file: model, day, result, total_time.
df = pd.read_csv("results.csv")

# JSON text is UTF-8; be explicit so the read does not depend on the platform's
# default locale encoding. The mapping is indexed below by str(day) and each
# entry by position 0 (part 1) and 1 (part 2).
with open("solutions.json", encoding="utf-8") as f:
    solutions = json.load(f)

def score_submissions(row):
    """Score one results row against the reference answers for its day.

    A part counts as solved when its reference answer occurs as a substring
    of the recorded output text.

    Args:
        row: Mapping-like row with a ``"day"`` entry and a ``"result"`` entry
            holding the captured output.

    Returns:
        A two-item list ``[part_1_solved, part_2_solved]`` of booleans.

    Raises:
        KeyError: If ``solutions`` has no entry for ``str(row["day"])``.
    """
    output = row["result"]
    # An empty cell in the CSV is read back as NaN (a float), and
    # `"answer" in <float>` raises TypeError. Treat missing output as empty
    # text so such a row simply earns no stars; coerce anything else to str.
    output = "" if pd.isna(output) else str(output)

    solution = solutions[str(row["day"])]

    score_1 = solution[0] in output
    score_2 = solution[1] in output
    return [score_1, score_2]


# --- Derived columns -------------------------------------------------------
# Columns are added in a fixed order: scores, code, code_md, Runtime (s),
# part_1, part_2.


def _code_for_row(row):
    """Call get_solution_code with this row's day and model."""
    return get_solution_code(day=row["day"], model=row["model"])


def _fence_as_python(source):
    """Wrap text in a Markdown ```python fence (used by the Code tab)."""
    return "".join(("```python\n", source, "\n```"))


def _first_six_chars(value):
    """Stringify a value and keep at most its first six characters."""
    return str(value)[:6]


# Each entry of "scores" is the [part_1, part_2] list from score_submissions.
df["scores"] = df.apply(score_submissions, axis=1)
df["code"] = df.apply(_code_for_row, axis=1)
df["code_md"] = df["code"].apply(_fence_as_python)

# Runtime is shown as a truncated string rather than a rounded number.
df["Runtime (s)"] = df["total_time"].apply(_first_six_chars)

# Split the score pair into one column per puzzle part.
df["part_1"] = df["scores"].apply(lambda pair: pair[0])
df["part_2"] = df["scores"].apply(lambda pair: pair[1])


# Per-model star tally: the part_1 and part_2 columns are summed separately
# and then added together for the total. Keys are model names, in order of
# first appearance in df.
star_summary = {}
for model_name in df["model"].unique():
    model_rows = df.loc[df["model"] == model_name]
    part_1_stars = model_rows["part_1"].sum()
    part_2_stars = model_rows["part_2"].sum()
    star_summary[model_name] = {
        "Model": model_name,
        "Total Stars ⭐️": part_1_stars + part_2_stars,
        "Part 1 ⭐️": part_1_stars,
        "Part 2 ⭐️": part_2_stars,
    }


def score_to_string(s):
    """Render a score as an emoji: a star for any truthy value, a cross otherwise."""
    if s:
        return "⭐️"
    return "❌"


# Build the Gradio UI: one tab each for the star leaderboards, the per-day
# results, the raw outputs and the solution code, then start the app.
with gr.Blocks() as demo:
    md = gr.Markdown("Performance of LLMs (and my own) on AoC 2024. For more info, read this [blog post](https://www.jerpint.io/blog/advent-of-code-llms/).")
    with gr.Tab("Stars"):
        star_df = pd.DataFrame.from_dict(star_summary, orient="index")
        # Two separate tables are rendered: overall totals first, then the
        # part 1 / part 2 breakdown. Each gets its own name so the first
        # component handle is not silently overwritten by the second.
        gr_total_star_df = gr.DataFrame(
            star_df[["Model", "Total Stars ⭐️"]].sort_values(by="Total Stars ⭐️", ascending=False)
        )
        gr_star_df = gr.DataFrame(
            star_df[["Model", "Part 1 ⭐️", "Part 2 ⭐️"]].sort_values(by="Part 1 ⭐️", ascending=False)
        )
    with gr.Tab("Daily"):
        # Parse the info to something more readable.
        # Take an explicit copy: assigning new columns onto a column slice of
        # df triggers pandas' SettingWithCopyWarning.
        df_daily = df[["model", "day", "part_1", "part_2", "Runtime (s)"]].copy()
        df_daily["Part 1"] = df_daily["part_1"].apply(score_to_string)
        df_daily["Part 2"] = df_daily["part_2"].apply(score_to_string)
        df_daily = df_daily[["model", "day", "Part 1", "Part 2", "Runtime (s)"]]

        gr_df_daily = gr.DataFrame(df_daily.sort_values(by="day"))

    with gr.Tab("Outputs"):
        outputs_df = df[["model", "day", "Runtime (s)", "result"]].rename(columns={"result": "Output"})
        # One datatype per column: the table has four columns, not three.
        gr_outputs_df = gr.DataFrame(outputs_df, datatype=["str", "str", "str", "str"])
    with gr.Tab("Code"):
        code_df = df[["model", "day", "code_md"]].rename(columns={"code_md": "Code"})
        # The Code column holds fenced Markdown, so it is rendered as markdown.
        gr_code_df = gr.DataFrame(code_df, datatype=["str", "str", "markdown"])

demo.launch()