Spaces:

atlasia
/

Open-Arabic-Dialect-Identification-Leaderboard

Running

Open-Arabic-Dialect-Identification-Leaderboard

File size: 8,582 Bytes

be25a4c

import os
import pandas as pd
from utils import (
    update_leaderboard_multilingual, 
    handle_evaluation,
    process_results_file,
    create_html_image,
)
from datasets import load_dataset
import gradio as gr

if __name__ == "__main__":
    # Entry point of the leaderboard app: loads the evaluation split, builds
    # the Gradio UI (two leaderboard tabs plus two submission tabs), wires
    # the event handlers imported from `utils`, and launches the server.

    # Evaluation dataset path
    DATA_PATH = "atlasia/No-Arabic-Dialect-Left-Behind-Filtered-Balanced"
    # All metrics that can be ticked in the "Metrics" checkbox group
    metrics = [
        'f1_score',
        'precision',
        'recall',
        'specificity',
        'false_positive_rate',
        'false_negative_rate',
        'negative_predictive_value',
        'n_test_samples',
    ]
    # Metrics ticked by default
    default_metrics = [
        'f1_score',
        'precision',
        'recall',
        'false_positive_rate',
        'false_negative_rate',
    ]
    # Languages ticked by default in the One-vs-All tab
    default_languages = [
        'Morocco',
        'MSA',
        'Egypt',
        'Algeria',
        'Tunisia',
        'Levantine',
    ]

    # Load test dataset
    test_dataset = load_dataset(DATA_PATH, split='test')
    # Supported dialects: every distinct value of the "dialect" column, plus
    # an extra 'All' choice.
    supported_dialects = list(test_dataset.unique("dialect")) + ['All']

    # Resolve the script directory to an absolute path so that both the
    # banner image path and `allowed_paths` stay valid even when `__file__`
    # is relative (in which case a bare dirname() would be '').
    base_path = os.path.dirname(os.path.abspath(__file__))
    local_image_path = os.path.join(base_path, 'open_arabic_lid_arena.png')

    with gr.Blocks() as app:
        gr.HTML(create_html_image(local_image_path))
        gr.Markdown("# 🏅 Open Arabic Dialect Identification Leaderboard")

        # Multilingual model leaderboard
        with gr.Tab("Multilingual model leaderboard"):
            gr.Markdown("""

                        Complete leaderboard across multiple arabic dialects. 

                        Compare the performance of different models across various metrics such as FNR, FPR, and other clasical metrics.

                        """
            )

            with gr.Row():
                with gr.Column(scale=1):
                    gr.Markdown("### Select country to display")
                    country_selector = gr.Dropdown(
                        choices=supported_dialects,
                        value='Morocco',  # Default to Morocco of course
                        label="Country"
                    )

                with gr.Column(scale=2):
                    gr.Markdown("### Select metrics to display")
                    metric_checkboxes = gr.CheckboxGroup(
                        choices=metrics,
                        value=default_metrics,
                        label="Metrics"
                    )

            with gr.Row():
                # Read-only table; filled by the handlers wired further down.
                leaderboard_table = gr.DataFrame(
                    interactive=False
                )

            gr.Markdown("</br>")

            gr.Markdown("## Contribute to the Leaderboard")
            gr.Markdown("""

                        We welcome contributions from the community! 

                        If you have a model that you would like to see on the leaderboard, please use the 'Evaluate a model' or 'Upload your results' tabs to submit your model's performance.

                        Let's work together to improve Arabic dialect identification! 🚀

                        """)

        # Binary model leaderboard
        with gr.Tab("One-vs-All leaderboard"):

            gr.Markdown("""

                        A kind of one-vs-all approach for evaluating LID models across multiple arabic dialects. 

                        Computes the `false_positive_rate` of different models for a given target language.

                        This should help you understand how well a model can identify a specific dialect by

                        showing how often it misclassifies other dialects as the target dialect.

                        """
            )

            with gr.Column(scale=1):
                gr.Markdown("### Select target language")
                target_language_selector = gr.Dropdown(
                    choices=supported_dialects,
                    value='Morocco',  # Default to Morocco of course
                    label="Target Language"
                )

            with gr.Column(scale=2):
                gr.Markdown("### Select Languages to display")
                languages_checkboxes = gr.CheckboxGroup(
                    choices=supported_dialects,
                    value=default_languages,
                    label="Languages"
                )

            with gr.Row():
                # NOTE(review): no handler in this file ever writes to this
                # table (see the event wiring below).
                binary_leaderboard_table = gr.DataFrame(
                    interactive=False
                )

        with gr.Tab("Evaluate a model"):
            gr.Markdown("Suggest a model to evaluate 🤗 (Supports only **Fasttext** models as SfayaLID, GlotLID, OpenLID, etc.)")
            gr.Markdown("For other models, you are welcome to **submit your results** through the upload section.")

            model_path = gr.Textbox(label="Model Path", placeholder='path/to/model')
            model_path_bin = gr.Textbox(label=".bin filename", placeholder='model.bin')
            gr.Markdown("### **⚠️ To ensure correct results, tick this when the model's labels are the iso_codes**")
            use_mapping = gr.Checkbox(label="Does not map to country")
            # The first positional argument of gr.Button already is `value`
            # (the button label); the former extra `value=False` keyword
            # clashed with it and never disabled the button.
            eval_button = gr.Button("Evaluate")

            # The evaluation result is written into the multilingual table.
            eval_button.click(
                handle_evaluation,
                inputs=[model_path, model_path_bin, use_mapping],
                outputs=[leaderboard_table]
            )

        with gr.Tab("Upload your results"):

            # Code block shown to submitters. It must be runnable as written,
            # so the same DataFrame name is used from loading to saving.
            code_snippet = """ 

            ```python

            

            from datasets import load_dataset

            

            # Load your model

            model = ... # Load your model here

            

            # Load evaluation benchmark

            eval_dataset = load_dataset("atlasia/No-Arabic-Dialect-Left-Behind-Filtered-Balanced", split='test').to_pandas() # do not change this line :)

            

            # Predict labels using your model

            eval_dataset['preds'] = eval_dataset['text'].apply(lambda text: predict_label(text, model)) # predict_label is a function that you need to define for your model

            

            # now drop the columns that are not needed, i.e. 'text', 'metadata' and 'dataset_source'

            eval_dataset = eval_dataset.drop(columns=['text', 'metadata', 'dataset_source'])

            eval_dataset.to_csv('your_model_name.csv')

            

            # submit your results: 'your_model_name.csv' to the leaderboard

            

            ```

            """
            gr.Markdown("## Upload your results to the leaderboard 🚀")
            gr.Markdown("### Submission guidelines: Run the test dataset on your model and save the results in a CSV file. Bellow a code snippet to help you with that.")
            gr.Markdown(code_snippet)

            uploaded_model_name = gr.Textbox(label="Model name", placeholder='Your model/team name')
            file = gr.File(label="Upload your results")
            upload_button = gr.Button("Upload")
            # The processed upload is written into the multilingual table.
            upload_button.click(
                process_results_file,
                inputs=[file, uploaded_model_name],
                outputs=[leaderboard_table]
            )

        # Update multilingual table when any of its inputs changes
        for control in (country_selector, metric_checkboxes):
            control.change(
                update_leaderboard_multilingual,
                inputs=[country_selector, metric_checkboxes],
                outputs=leaderboard_table
            )

        # NOTE(review): the One-vs-All controls are wired to the multilingual
        # handler with the multilingual inputs/output, so changing them only
        # re-renders `leaderboard_table` and `binary_leaderboard_table` stays
        # empty. No one-vs-all handler is imported in this file; once one
        # exists it should be registered here with
        # inputs=[target_language_selector, languages_checkboxes] and
        # outputs=binary_leaderboard_table. Behaviour is kept as-is for now.
        for control in (target_language_selector, languages_checkboxes):
            control.change(
                update_leaderboard_multilingual,
                inputs=[country_selector, metric_checkboxes],
                outputs=leaderboard_table
            )

        # Define load event to run at startup so the table is not empty
        app.load(
            update_leaderboard_multilingual,
            inputs=[country_selector, metric_checkboxes],
            outputs=leaderboard_table
        )

    # Allow Gradio to serve files (the banner image) from the app directory.
    app.launch(allowed_paths=[base_path])