File size: 4,313 Bytes
9990990
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import reflex as rx
import pandas as pd
import plotly.express as px
from reflex_ag_grid import ag_grid
from sklearn.metrics import accuracy_score
from ..sidebar import sidebar
from datasets import load_dataset

# Intro text rendered (as markdown) above the star chart; fixes the
# "happeened" typo and stray trailing whitespace in the original.
chart_md = """
Make sure you explore what happened between:
- Base Model -> Final Answer
- Base Model -> Reasoning (Both models) Final Answer
- Base Model -> Final Answer Reasoning (Both models)
"""

# Pull the labeled results dataset from the Hugging Face Hub and flatten
# its train split straight into a Pandas DataFrame for analysis below.
dataset = load_dataset("derek-thomas/labeled-multiple-choice-explained-mistral-results")
df = dataset["train"].to_pandas()

# Prediction columns to score, one per prompting/fine-tuning strategy.
# All share the "predictions_" prefix, so build them from the suffixes.
_PREDICTION_SUFFIXES = ("base", "FA", "RFA_mistral", "FAR_mistral", "RFA_gpt3_5", "FAR_gpt3_5")
cols_to_analyze = [f"predictions_{suffix}" for suffix in _PREDICTION_SUFFIXES]

# Human-readable display labels for each prediction column.
# Fixes three label defects from the original: a stray closing paren on the
# RFA-Mistral label, a stray space inside "(GPT-3.5 )", and a missing space
# in "Reasoning(GPT-3.5)".
model_names = {
    "predictions_base": "Base Model",
    "predictions_FA": "Final Answer",
    "predictions_RFA_mistral": "Reasoning (Mistral) -> Final Answer",
    "predictions_FAR_mistral": "Final Answer -> Reasoning (Mistral)",
    "predictions_RFA_gpt3_5": "Reasoning (GPT-3.5) -> Final Answer",
    "predictions_FAR_gpt3_5": "Final Answer -> Reasoning (GPT-3.5)",
    }

# Overall accuracy (percent, 2 dp) of every prediction column against the
# gold labels in df["answer_key"], keyed by the model's display label.
metrics_data = [
    {
        "Prediction Type": model_names[col],
        "Accuracy (%)": round(accuracy_score(df["answer_key"], df[col]) * 100, 2),
    }
    for col in cols_to_analyze
]

# Tabular form consumed by the ag-grid metrics table on the page.
metrics_df = pd.DataFrame(metrics_data)

# ag-grid column definitions for the metrics table: field and header share
# the DataFrame column name; only "Prediction Type" pins an explicit width.
metrics_column_defs = [
    ag_grid.column_def(field=name, header_name=name, **extra)
    for name, extra in (
        ("Prediction Type", {"width": 250}),
        ("Accuracy (%)", {}),
    )
]


# Function to generate the topic performance star chart
def topic_star_chart():
    # Calculate per-topic accuracy
    topic_accuracy = []
    for topic in df["topic"].unique():
        topic_data = df[df["topic"] == topic]
        for col in cols_to_analyze:
            accuracy = round((topic_data[col] == topic_data["answer_key"]).mean() * 100, 2)
            topic_accuracy.append({"Topic": topic, "Prediction Type": model_names[col], "Accuracy (%)": accuracy})

    # Create DataFrame for visualization
    topic_df = pd.DataFrame(topic_accuracy)

    # Find the top 10 topics by number of rows
    topic_counts = df["topic"].value_counts().head(10).index
    filtered_topic_df = topic_df[topic_df["Topic"].isin(topic_counts)]

    # Create star chart (radar chart)
    fig = px.line_polar(
            filtered_topic_df,
            r="Accuracy (%)",
            theta="Topic",
            color="Prediction Type",
            title="Top 10 Topics: Per-Topic Performance Star Chart",
            line_close=True,  # Close the lines to form a star shape
            )
    fig.update_layout(width=900, height=900)

    return fig


def page():
    """Results page: sortable accuracy table plus per-topic radar chart."""
    # Build the sidebar first to preserve the original evaluation order.
    nav = sidebar()

    # Sortable table of the overall accuracy metrics.
    metrics_table = ag_grid(
            id="ag_grid_metrics",
            row_data=metrics_df.to_dict("records"),
            column_defs=metrics_column_defs,
            width="60%",
            margin="20px auto",  # Center the table
            size_columns_to_fit=True
            )

    # Explanatory text shown above the radar chart.
    chart_intro = rx.text(
            "The chart below shows how each model performed across the most popular top 10 topics by row count. "
            "Each line represents a model, and the radial axis represents accuracy.",
            font_size="md",
            padding="10px",
            )

    body = rx.vstack(
            rx.heading("Results", size="9", margin="20px 0"),
            rx.markdown("Here we have a sortable table of our experiments and the results"),
            metrics_table,
            rx.markdown("\n---\n"),
            rx.divider(),
            rx.heading("Performance Star Chart", size="8", margin="20px 0"),
            chart_intro,
            rx.markdown(chart_md),
            rx.plotly(data=topic_star_chart()),  # Render the radar chart
            padding="20px",
            )

    return rx.hstack(nav, body)