Clémentine committed
Commit 314f91a · Parent(s): 1257fc3

fixs
Files changed:
- src/display/about.py              +2 -2
- src/display/utils.py              +1 -0
- src/leaderboard/filter_models.py  +0 -50
- src/populate.py                   +1 -4
- src/submission/submit.py          +2 -2
src/display/about.py    CHANGED

@@ -1,6 +1,5 @@
-from src.display.utils import ModelType
-from enum import Enum
 from dataclasses import dataclass
+from enum import Enum
 
 @dataclass
 class Task:
@@ -8,6 +7,7 @@ class Task:
     metric: str
     col_name: str
 
+
 # Init: to update with your specific keys
 class Tasks(Enum):
     task0 = Task("Key in the harness", "metric in the harness", "Display name 1")
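Note: the removed "from src.display.utils import ModelType" made about.py and utils.py import each other (utils.py imports Tasks from about.py, as the hunk header in the next diff shows), so importing Enum from the standard library instead also breaks that cycle. A minimal sketch of the module after this commit; the leading benchmark field is an assumption, since the hunk only shows metric and col_name:

from dataclasses import dataclass
from enum import Enum


@dataclass
class Task:
    benchmark: str  # assumed field name (not visible in the diff)
    metric: str
    col_name: str


# Init: to update with your specific keys
class Tasks(Enum):
    task0 = Task("Key in the harness", "metric in the harness", "Display name 1")


# Hypothetical downstream use: collect the display columns from the enum.
benchmark_cols = [task.value.col_name for task in Tasks]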
    	
src/display/utils.py    CHANGED

@@ -8,6 +8,7 @@ from src.display.about import Tasks
 def fields(raw_class):
     return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
 
+
 # These classes are for user facing column names,
 # to avoid having to change them all around the code
 # when a modif is needed
    	
src/leaderboard/filter_models.py    DELETED

@@ -1,50 +0,0 @@
-from src.display.formatting import model_hyperlink
-from src.display.utils import AutoEvalColumn
-
-# Models which have been flagged by users as being problematic for a reason or another
-# (Model name to forum discussion link)
-FLAGGED_MODELS = {
-    "Voicelab/trurl-2-13b": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/202",
-    "deepnight-research/llama-2-70B-inst": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/207",
-    "Aspik101/trurl-2-13b-pl-instruct_unload": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/213",
-    "Fredithefish/ReasonixPajama-3B-HF": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/236",
-    "TigerResearch/tigerbot-7b-sft-v1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/237",
-    "gaodrew/gaodrew-gorgonzola-13b": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/215",
-    "AIDC-ai-business/Marcoroni-70B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/287",
-    "AIDC-ai-business/Marcoroni-13B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/287",
-    "AIDC-ai-business/Marcoroni-7B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/287",
-}
-
-# Models which have been requested by orgs to not be submitted on the leaderboard
-DO_NOT_SUBMIT_MODELS = [
-    "Voicelab/trurl-2-13b",  # trained on MMLU
-]
-
-
-def flag_models(leaderboard_data: list[dict]):
-    for model_data in leaderboard_data:
-        if model_data["model_name_for_query"] in FLAGGED_MODELS:
-            issue_num = FLAGGED_MODELS[model_data["model_name_for_query"]].split("/")[-1]
-            issue_link = model_hyperlink(
-                FLAGGED_MODELS[model_data["model_name_for_query"]],
-                f"See discussion #{issue_num}",
-            )
-            model_data[
-                AutoEvalColumn.model.name
-            ] = f"{model_data[AutoEvalColumn.model.name]} has been flagged! {issue_link}"
-
-
-def remove_forbidden_models(leaderboard_data: list[dict]):
-    indices_to_remove = []
-    for ix, model in enumerate(leaderboard_data):
-        if model["model_name_for_query"] in DO_NOT_SUBMIT_MODELS:
-            indices_to_remove.append(ix)
-
-    for ix in reversed(indices_to_remove):
-        leaderboard_data.pop(ix)
-    return leaderboard_data
-
-
-def filter_models(leaderboard_data: list[dict]):
-    leaderboard_data = remove_forbidden_models(leaderboard_data)
-    flag_models(leaderboard_data)
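Note: this removes the flagging/filtering module outright; its only visible call site is dropped from src/populate.py in the next diff. For reference, a toy demonstration of what the deleted flag_models() did to one leaderboard row, with model_hyperlink stubbed out (its real implementation lives in src.display.formatting) and the plain "model" key standing in for AutoEvalColumn.model.name:

# Stub standing in for src.display.formatting.model_hyperlink(link, text).
def model_hyperlink(link: str, text: str) -> str:
    return f'<a target="_blank" href="{link}">{text}</a>'

FLAGGED_MODELS = {
    "Voicelab/trurl-2-13b": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/202",
}

# One leaderboard row, as a dict like those produced by populate.py.
row = {"model_name_for_query": "Voicelab/trurl-2-13b", "model": "trurl-2-13b"}

link = FLAGGED_MODELS[row["model_name_for_query"]]
issue_num = link.split("/")[-1]  # "202"
text = f"See discussion #{issue_num}"
row["model"] = f'{row["model"]} has been flagged! {model_hyperlink(link, text)}'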
    	
src/populate.py    CHANGED

@@ -4,16 +4,13 @@ import os
 import pandas as pd
 
 from src.display.formatting import has_no_nan_values, make_clickable_model
-from src.display.utils import AutoEvalColumn, EvalQueueColumn
-from src.leaderboard.filter_models import filter_models
+from src.display.utils import AutoEvalColumn, EvalQueueColumn
 from src.leaderboard.read_evals import get_raw_eval_results
 
 
 def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
     raw_data = get_raw_eval_results(results_path, requests_path)
     all_data_json = [v.to_dict() for v in raw_data]
-    all_data_json.append(baseline_row)
-    filter_models(all_data_json)
 
     df = pd.DataFrame.from_records(all_data_json)
     df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
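Note: besides dropping the filter_models() call, this removes all_data_json.append(baseline_row); baseline_row is neither defined nor imported in this file, so the old line would have raised a NameError at runtime, which is presumably part of the fix the commit title refers to. A hypothetical invocation of the cleaned-up function; the paths and column lists below are placeholders, not the template's real configuration:

from src.populate import get_leaderboard_df

COLS = ["model", "average", "Display name 1"]  # assumed column names
BENCHMARK_COLS = ["Display name 1"]            # assumed

df = get_leaderboard_df(
    results_path="eval-results",   # assumed local path of the results dataset
    requests_path="eval-queue",    # assumed local path of the requests dataset
    cols=COLS,
    benchmark_cols=BENCHMARK_COLS,
)
print(df.head())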
    	
src/submission/submit.py    CHANGED

@@ -3,7 +3,7 @@ import os
 from datetime import datetime, timezone
 
 from src.display.formatting import styled_error, styled_message, styled_warning
-from src.envs import API, EVAL_REQUESTS_PATH, …
+from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO
 from src.submission.check_validity import (
     already_submitted_models,
     check_model_card,
@@ -45,7 +45,7 @@ def add_new_eval(
 
     # Is the model on the hub?
     if weight_type in ["Delta", "Adapter"]:
-        base_model_on_hub, error, _ = is_model_on_hub(model_name=base_model, revision=revision, token=…
+        base_model_on_hub, error, _ = is_model_on_hub(model_name=base_model, revision=revision, token=TOKEN, test_tokenizer=True)
         if not base_model_on_hub:
             return styled_error(f'Base model "{base_model}" {error}')
 
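Note: the two removed lines are cut off in this page's rendering (marked "…" above), so the exact old identifiers are not recoverable here; what the hunks show is that the import now pulls TOKEN and QUEUE_REPO from src.envs and that the is_model_on_hub() check passes token=TOKEN with test_tokenizer=True. A hypothetical minimal src/envs.py that would satisfy the new import; every value is a placeholder, not the template's actual configuration:

import os

from huggingface_hub import HfApi

TOKEN = os.environ.get("HF_TOKEN")  # assumption: token read from a Space secret
QUEUE_REPO = "my-org/requests"      # assumption: dataset repo receiving eval requests
EVAL_REQUESTS_PATH = "eval-queue"   # assumption: local checkout path for QUEUE_REPO
API = HfApi(token=TOKEN)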
