Update README.md
Browse files
README.md
CHANGED
@@ -93,43 +93,73 @@ Evaluation was conducted using [lm-evaluation-harness](https://github.com/Eleuth
|
|
93 |
</td>
|
94 |
</tr>
|
95 |
<tr>
|
96 |
-
<td>
|
|
|
|
|
|
|
|
|
97 |
</td>
|
98 |
-
<td>
|
|
|
|
|
|
|
|
|
99 |
</td>
|
100 |
-
<td>
|
|
|
|
|
101 |
</td>
|
102 |
<td>100.0%
|
103 |
</td>
|
104 |
</tr>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
105 |
<tr>
|
106 |
<td>Winogrande (5-shot)
|
107 |
</td>
|
108 |
-
<td>
|
109 |
</td>
|
110 |
-
<td>
|
111 |
</td>
|
112 |
-
<td>99.
|
113 |
</td>
|
114 |
</tr>
|
115 |
<tr>
|
116 |
<td>TruthfulQA (0-shot, mc2)
|
117 |
</td>
|
118 |
-
<td>
|
119 |
</td>
|
120 |
-
<td>
|
121 |
</td>
|
122 |
-
<td>99.
|
123 |
</td>
|
124 |
</tr>
|
125 |
<tr>
|
126 |
<td><strong>Average</strong>
|
127 |
</td>
|
128 |
-
<td><strong>
|
129 |
</td>
|
130 |
-
<td><strong>
|
131 |
</td>
|
132 |
-
<td><strong>
|
133 |
</td>
|
134 |
</tr>
|
135 |
</table>
|
|
|
93 |
</td>
|
94 |
</tr>
|
95 |
<tr>
|
96 |
+
<td>MMLU (5-shot)
|
97 |
+
</td>
|
98 |
+
<td>47.57
|
99 |
+
</td>
|
100 |
+
<td>47.81
|
101 |
</td>
|
102 |
+
<td>100.5%
|
103 |
+
</td>
|
104 |
+
</tr>
|
105 |
+
<tr>
|
106 |
+
<td>Arc Challenge (25-shot)
|
107 |
</td>
|
108 |
+
<td>34.90
|
109 |
+
</td>
|
110 |
+
<td>34.90
|
111 |
</td>
|
112 |
<td>100.0%
|
113 |
</td>
|
114 |
</tr>
|
115 |
+
<tr>
|
116 |
+
<td>GSM-8k (5-shot, strict-match)
|
117 |
+
</td>
|
118 |
+
<td>34.19
|
119 |
+
</td>
|
120 |
+
<td>33.51
|
121 |
+
</td>
|
122 |
+
<td>98.0%
|
123 |
+
</td>
|
124 |
+
</tr>
|
125 |
+
<tr>
|
126 |
+
<td>Hellaswag (10-shot)
|
127 |
+
</td>
|
128 |
+
<td>51.83
|
129 |
+
</td>
|
130 |
+
<td>51.78
|
131 |
+
</td>
|
132 |
+
<td>99.9%
|
133 |
+
</td>
|
134 |
+
</tr>
|
135 |
<tr>
|
136 |
<td>Winogrande (5-shot)
|
137 |
</td>
|
138 |
+
<td>55.80
|
139 |
</td>
|
140 |
+
<td>55.49
|
141 |
</td>
|
142 |
+
<td>99.4%
|
143 |
</td>
|
144 |
</tr>
|
145 |
<tr>
|
146 |
<td>TruthfulQA (0-shot, mc2)
|
147 |
</td>
|
148 |
+
<td>39.90
|
149 |
</td>
|
150 |
+
<td>39.71
|
151 |
</td>
|
152 |
+
<td>99.5%
|
153 |
</td>
|
154 |
</tr>
|
155 |
<tr>
|
156 |
<td><strong>Average</strong>
|
157 |
</td>
|
158 |
+
<td><strong>44.0</strong>
|
159 |
</td>
|
160 |
+
<td><strong>43.9</strong>
|
161 |
</td>
|
162 |
+
<td><strong>99.6%</strong>
|
163 |
</td>
|
164 |
</tr>
|
165 |
</table>
|