Update app.py
Browse files
app.py
CHANGED
@@ -552,6 +552,17 @@ ugi_category_columns = [
|
|
552 |
create_numeric_column(col, width=120) for col in UGI_CATEGORY_COLS
|
553 |
]
|
554 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
555 |
political_columns = [
|
556 |
{
|
557 |
"headerName": "Ideology",
|
@@ -810,12 +821,13 @@ app.layout = html.Div([
|
|
810 |
id='additional-columns-filter',
|
811 |
options=[
|
812 |
{'label': 'UGI Categories', 'value': 'ugi_categories'},
|
|
|
813 |
{'label': 'Political Test Axes', 'value': 'political_axes'}
|
814 |
],
|
815 |
value=[],
|
816 |
inline=True,
|
817 |
style={'display': 'inline-block'},
|
818 |
-
labelStyle={'fontWeight': 'normal', 'marginRight': '15px'}
|
819 |
)
|
820 |
], style={'marginBottom': '13px', 'padding': '0 20px', 'overflow': 'hidden'}),
|
821 |
|
@@ -865,15 +877,53 @@ app.layout = html.Div([
|
|
865 |
|
866 |
# Description
|
867 |
html.Div([
|
868 |
-
html.H3("About"),
|
|
|
|
|
|
|
869 |
|
870 |
-
html.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
871 |
|
872 |
-
html.P([html.Strong("W/10:"), " Willingness/10. A more narrow subset of the UGI questions, solely focused on measuring how far a model can be pushed before going against its instructions or refusing to answer."]),
|
873 |
|
874 |
-
html.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
875 |
|
876 |
-
html.P([
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
877 |
|
878 |
html.P([html.Strong("Coding:"), " A simple 50 question quiz measuring how vast a model's programming knowledge is. Each question is worth 2 points."]),
|
879 |
|
@@ -1002,10 +1052,17 @@ def update_columns(additional_columns):
|
|
1002 |
|
1003 |
# Add UGI category columns if selected
|
1004 |
if 'ugi_categories' in additional_columns:
|
1005 |
-
current_columns.extend(ugi_category_columns)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1006 |
|
1007 |
-
# Add remaining base columns (
|
1008 |
-
current_columns.extend(columnDefs[
|
1009 |
|
1010 |
# Add political columns if selected
|
1011 |
if 'political_axes' in additional_columns:
|
|
|
552 |
create_numeric_column(col, width=120) for col in UGI_CATEGORY_COLS
|
553 |
]
|
554 |
|
555 |
+
w10_type_columns = [
|
556 |
+
create_numeric_column("W/10-Direct", width=120, filterParams={
|
557 |
+
"defaultOption": "greaterThanOrEqual",
|
558 |
+
"filterOptions": ['equals', 'notEqual', 'greaterThan', 'greaterThanOrEqual', 'lessThan', 'lessThanOrEqual', 'inRange']
|
559 |
+
}),
|
560 |
+
create_numeric_column("W/10-Adherence", width=120, filterParams={
|
561 |
+
"defaultOption": "greaterThanOrEqual",
|
562 |
+
"filterOptions": ['equals', 'notEqual', 'greaterThan', 'greaterThanOrEqual', 'lessThan', 'lessThanOrEqual', 'inRange']
|
563 |
+
})
|
564 |
+
]
|
565 |
+
|
566 |
political_columns = [
|
567 |
{
|
568 |
"headerName": "Ideology",
|
|
|
821 |
id='additional-columns-filter',
|
822 |
options=[
|
823 |
{'label': 'UGI Categories', 'value': 'ugi_categories'},
|
824 |
+
{'label': 'W/10 Types', 'value': 'w10_types'},
|
825 |
{'label': 'Political Test Axes', 'value': 'political_axes'}
|
826 |
],
|
827 |
value=[],
|
828 |
inline=True,
|
829 |
style={'display': 'inline-block'},
|
830 |
+
labelStyle={'fontWeight': 'normal', 'marginRight': '15px'}
|
831 |
)
|
832 |
], style={'marginBottom': '13px', 'padding': '0 20px', 'overflow': 'hidden'}),
|
833 |
|
|
|
877 |
|
878 |
# Description
|
879 |
html.Div([
|
880 |
+
html.H3("About", style={'fontSize': '22px', 'marginBottom': '0px'}),
|
881 |
+
|
882 |
+
html.P([html.Strong("UGI:"), " Uncensored General Intelligence. A benchmark measuring both willingness to answer and accuracy in fact-based contentious questions. The test set is made of roughly 100 questions/tasks, covering topics that are commonly difficult to get LLMs to answer. The leaderboard's questions are kept private in order to avoid the common problem of not knowing if a model is intelligent or if it was just trained on the test questions."],
|
883 |
+
style={'marginTop': '7px', 'marginBottom': '4px'}),
|
884 |
|
885 |
+
html.Details([
|
886 |
+
html.Summary("Categories",
|
887 |
+
style={
|
888 |
+
'fontWeight': 'normal',
|
889 |
+
'fontSize': '1em',
|
890 |
+
'marginLeft': '20px',
|
891 |
+
'cursor': 'pointer'
|
892 |
+
}),
|
893 |
+
html.Ul([
|
894 |
+
html.Li("Unruly: Taboo Underground Knowledge"),
|
895 |
+
html.Li("Internet: Knowledge of controversial/explicit web content"),
|
896 |
+
html.Li("Societal/Political: Awareness of contentious socio-political issues")
|
897 |
+
], style={'marginTop': '0px', 'marginBottom': '16px', 'marginLeft': '40px'})
|
898 |
+
], style={'marginBottom': '16px'}),
|
899 |
|
900 |
+
html.P([html.Strong("W/10:"), " Willingness/10. A more narrow subset of the UGI questions, solely focused on measuring how far a model can be pushed before going against its instructions or refusing to answer."], style={'marginBottom': '4px'}),
|
901 |
|
902 |
+
html.Details([
|
903 |
+
html.Summary("Types",
|
904 |
+
style={
|
905 |
+
'fontWeight': 'normal',
|
906 |
+
'fontSize': '1em',
|
907 |
+
'marginLeft': '20px',
|
908 |
+
'cursor': 'pointer'
|
909 |
+
}),
|
910 |
+
html.Ul([
|
911 |
+
html.Li("Direct: Measures if the model directly refuses to respond to certain prompts"),
|
912 |
+
html.Li("Adherence: Some models might not explicitly refuse to do something, though will still deviate from the instructions as a way to get out of doing it, or simply due to lack of instruction following capabilities")
|
913 |
+
], style={'marginTop': '0px', 'marginBottom': '16px', 'marginLeft': '40px'})
|
914 |
+
], style={'marginBottom': '16px'}),
|
915 |
|
916 |
+
html.P([
|
917 |
+
"A high UGI but low W/10 could mean for example that the model can provide a lot of accurate sensitive information, but will refuse to form the information into something it sees as offensive or against its rules.",
|
918 |
+
html.Br(),
|
919 |
+
html.Br()
|
920 |
+
]),
|
921 |
+
|
922 |
+
html.P([
|
923 |
+
html.Strong("Benchmarks not focused on censorship:"),
|
924 |
+
html.Div(style={'margin': '6px 0'}),
|
925 |
+
html.Strong("NatInt:"), " Natural Intelligence. A general knowledge quiz covering real-world subjects that llms are not commonly benchmarked on, such as pop culture trivia. This measures if the model understands a diverse range of topics, as opposed to over-training on textbook information and the types of questions commonly tested on benchmarks."
|
926 |
+
]),
|
927 |
|
928 |
html.P([html.Strong("Coding:"), " A simple 50 question quiz measuring how vast a model's programming knowledge is. Each question is worth 2 points."]),
|
929 |
|
|
|
1052 |
|
1053 |
# Add UGI category columns if selected
|
1054 |
if 'ugi_categories' in additional_columns:
|
1055 |
+
current_columns.extend(ugi_category_columns)
|
1056 |
+
|
1057 |
+
# Add W/10 column
|
1058 |
+
current_columns.extend(columnDefs[7:8]) # Add just the W/10 column
|
1059 |
+
|
1060 |
+
# Add W/10 type columns if selected
|
1061 |
+
if 'w10_types' in additional_columns:
|
1062 |
+
current_columns.extend(w10_type_columns)
|
1063 |
|
1064 |
+
# Add remaining base columns (NatInt, Coding, Political Lean)
|
1065 |
+
current_columns.extend(columnDefs[8:11])
|
1066 |
|
1067 |
# Add political columns if selected
|
1068 |
if 'political_axes' in additional_columns:
|