File size: 4,313 Bytes
9990990
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import reflex as rx
import pandas as pd
import plotly.express as px
from reflex_ag_grid import ag_grid
from sklearn.metrics import accuracy_score
from ..sidebar import sidebar
from datasets import load_dataset

# Intro text rendered (as markdown) above the star chart; fixes the
# "happeened" typo and stray trailing whitespace in the original.
chart_md = """
Make sure you explore what happened between:
- Base Model -> Final Answer
- Base Model -> Reasoning (Both models) Final Answer
- Base Model -> Final Answer Reasoning (Both models)
"""

# Pull the labeled results dataset from the Hugging Face Hub and flatten
# its train split straight into a Pandas DataFrame for analysis below.
dataset = load_dataset("derek-thomas/labeled-multiple-choice-explained-mistral-results")
df = dataset["train"].to_pandas()

# Prediction columns to score, one per prompting/fine-tuning strategy.
# All share the "predictions_" prefix, so build them from the suffixes.
_PREDICTION_SUFFIXES = ("base", "FA", "RFA_mistral", "FAR_mistral", "RFA_gpt3_5", "FAR_gpt3_5")
cols_to_analyze = [f"predictions_{suffix}" for suffix in _PREDICTION_SUFFIXES]

# Human-readable display labels for each prediction column.
# Fixes three label defects from the original: a stray closing paren on the
# RFA-Mistral label, a stray space inside "(GPT-3.5 )", and a missing space
# in "Reasoning(GPT-3.5)".
model_names = {
    "predictions_base": "Base Model",
    "predictions_FA": "Final Answer",
    "predictions_RFA_mistral": "Reasoning (Mistral) -> Final Answer",
    "predictions_FAR_mistral": "Final Answer -> Reasoning (Mistral)",
    "predictions_RFA_gpt3_5": "Reasoning (GPT-3.5) -> Final Answer",
    "predictions_FAR_gpt3_5": "Final Answer -> Reasoning (GPT-3.5)",
    }

# Overall accuracy (percent, 2 dp) of every prediction column against the
# gold labels in df["answer_key"], keyed by the model's display label.
metrics_data = [
    {
        "Prediction Type": model_names[col],
        "Accuracy (%)": round(accuracy_score(df["answer_key"], df[col]) * 100, 2),
    }
    for col in cols_to_analyze
]

# Tabular form consumed by the ag-grid metrics table on the page.
metrics_df = pd.DataFrame(metrics_data)

# ag-grid column definitions for the metrics table: field and header share
# the DataFrame column name; only "Prediction Type" pins an explicit width.
metrics_column_defs = [
    ag_grid.column_def(field=name, header_name=name, **extra)
    for name, extra in (
        ("Prediction Type", {"width": 250}),
        ("Accuracy (%)", {}),
    )
]


# Function to generate the topic performance star chart
def topic_star_chart():
    # Calculate per-topic accuracy
    topic_accuracy = []
    for topic in df["topic"].unique():
        topic_data = df[df["topic"] == topic]
        for col in cols_to_analyze:
            accuracy = round((topic_data[col] == topic_data["answer_key"]).mean() * 100, 2)
            topic_accuracy.append({"Topic": topic, "Prediction Type": model_names[col], "Accuracy (%)": accuracy})

    # Create DataFrame for visualization
    topic_df = pd.DataFrame(topic_accuracy)

    # Find the top 10 topics by number of rows
    topic_counts = df["topic"].value_counts().head(10).index
    filtered_topic_df = topic_df[topic_df["Topic"].isin(topic_counts)]

    # Create star chart (radar chart)
    fig = px.line_polar(
            filtered_topic_df,
            r="Accuracy (%)",
            theta="Topic",
            color="Prediction Type",
            title="Top 10 Topics: Per-Topic Performance Star Chart",
            line_close=True,  # Close the lines to form a star shape
            )
    fig.update_layout(width=900, height=900)

    return fig


def page():
    """Results page: sortable accuracy table plus per-topic radar chart."""
    # Build the sidebar first to preserve the original evaluation order.
    nav = sidebar()

    # Sortable table of the overall accuracy metrics.
    metrics_table = ag_grid(
            id="ag_grid_metrics",
            row_data=metrics_df.to_dict("records"),
            column_defs=metrics_column_defs,
            width="60%",
            margin="20px auto",  # Center the table
            size_columns_to_fit=True
            )

    # Explanatory text shown above the radar chart.
    chart_intro = rx.text(
            "The chart below shows how each model performed across the most popular top 10 topics by row count. "
            "Each line represents a model, and the radial axis represents accuracy.",
            font_size="md",
            padding="10px",
            )

    body = rx.vstack(
            rx.heading("Results", size="9", margin="20px 0"),
            rx.markdown("Here we have a sortable table of our experiments and the results"),
            metrics_table,
            rx.markdown("\n---\n"),
            rx.divider(),
            rx.heading("Performance Star Chart", size="8", margin="20px 0"),
            chart_intro,
            rx.markdown(chart_md),
            rx.plotly(data=topic_star_chart()),  # Render the radar chart
            padding="20px",
            )

    return rx.hstack(nav, body)