kpfadnis committed on
Commit 4594d15 · 1 Parent(s): d769391

fix (export): Minor fix to export with additional notebook to merge.

notebooks/merge_input_files.ipynb ADDED
@@ -0,0 +1,208 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "attachments": {},
5
+ "cell_type": "markdown",
6
+ "metadata": {
7
+ "collapsed": true,
8
+ "pycharm": {
9
+ "name": "#%% md\n"
10
+ }
11
+ },
12
+ "source": [
13
+ "# Merge input files\n",
14
+ "\n",
15
+ "### ✅ Prerequisites\n",
16
+ "\n",
17
+ "[Python 3.10](https://www.python.org/downloads/)\n"
18
+ ]
19
+ },
20
+ {
21
+ "cell_type": "markdown",
22
+ "metadata": {},
23
+ "source": [
24
+ "> [!CAUTION]\n",
25
+ "> Please make sure all input files are valid before trying to merge them. You can check validity of each file with a [`validate_input_file`](./validate_input_file.ipynb) notebook.\n",
26
+ "\n",
27
+ "\n",
28
+ "> [!IMPORTANT]\n",
29
+ "> Only common `tasks` across all input files and associated documents, models and evaluations are preserved in the resultant file.\n",
30
+ "\n",
31
+ "### Merge function"
32
+ ]
33
+ },
34
+ {
35
+ "cell_type": "code",
36
+ "execution_count": 13,
37
+ "metadata": {},
38
+ "outputs": [],
39
+ "source": [
40
+ "from typing import Dict, Set\n",
41
+ "import json\n",
42
+ "\n",
43
+ "\n",
44
+ "# =========================================================\n",
45
+ "# HELPER FUNCTIONS\n",
46
+ "# =========================================================\n",
47
+ "def read_json(filename: str, encoding=\"utf-8\"):\n",
48
+ " with open(filename, mode=\"r\", encoding=encoding) as fp:\n",
49
+ " return json.load(fp)\n",
50
+ "\n",
51
+ "\n",
52
+ "def write_json(filename: str, content: dict, encoding=\"utf-8\"):\n",
53
+ " with open(filename, mode=\"w\", encoding=encoding) as fp:\n",
54
+ " return json.dump(content, fp)\n",
55
+ "\n",
56
+ "\n",
57
+ "# =========================================================\n",
58
+ "# MAIN FUNCTION\n",
59
+ "# =========================================================\n",
60
+ "def merge(inputs: list[dict]) -> dict:\n",
61
+ " # Step 1: Return, if single JSON\n",
62
+ " if len(inputs) == 1:\n",
63
+ " return inputs[0]\n",
64
+ "\n",
65
+ " # Step 2: When multiple input JSONs\n",
66
+ " # Step 2.a: Initialize necessary variables\n",
67
+ " merged_tasks: Dict[str, dict] = {}\n",
68
+ " tasks_to_models: Dict[str, Set[str]] = {}\n",
69
+ " evaluations: Dict[str, dict] = {}\n",
70
+ " all_models = {}\n",
71
+ " all_filters = set()\n",
72
+ "\n",
73
+ " # Step 2.b: Iterate over each input JSON\n",
74
+ " for entry in inputs:\n",
75
+ " # Step 2.b.i: Add model to dictionary of all models, if not present already\n",
76
+ " for model in entry[\"models\"]:\n",
77
+ " if model[\"model_id\"] in all_models:\n",
78
+ " if model[\"name\"] != all_models[model[\"model_id\"]][\"name\"]:\n",
79
+ " print(\n",
80
+ " f\"Mismatched model information for model with id: ${model['model_id']}\"\n",
81
+ " )\n",
82
+ " else:\n",
83
+ " all_models[model[\"model_id\"]] = model\n",
84
+ "\n",
85
+ " # Step 2.b.ii: Add filters to set of all filter\n",
86
+ " if \"filters\" in entry and entry[\"filters\"]:\n",
87
+ " for filter in entry[\"filters\"]:\n",
88
+ " all_filters.add(filter)\n",
89
+ "\n",
90
+ " # Step 2.b.iii: Iterate over each evaluation\n",
91
+ " for evaluation in entry[\"evaluations\"]:\n",
92
+ " # Step 2.b.iii.*: Extend map of task IDs to model IDs based on evaluations\n",
93
+ " try:\n",
94
+ " tasks_to_models[evaluation[\"task_id\"]].add(evaluation[\"model_id\"])\n",
95
+ " except KeyError:\n",
96
+ " tasks_to_models[evaluation[\"task_id\"]] = set([evaluation[\"model_id\"]])\n",
97
+ "\n",
98
+ " # Step 2.b.iii.*: Extend evaluations map, if necessary\n",
99
+ " if (\n",
100
+ " f\"{evaluation['task_id']}<:SEP:>{evaluation['model_id']}\"\n",
101
+ " not in evaluations\n",
102
+ " ):\n",
103
+ " evaluations[\n",
104
+ " f\"{evaluation['task_id']}<:SEP:>{evaluation['model_id']}\"\n",
105
+ " ] = evaluation\n",
106
+ "\n",
107
+ " # Step 2.b.iv: Create merged tasks as follows\n",
108
+ " # 1. Merge comments for same task from different input JSONs\n",
109
+ " # 2. Merge flagged status for same task from different input JSONs (preserved flagged=True, if any of the input JSONs has it to be 'True')\n",
110
+ " for task in entry[\"tasks\"]:\n",
111
+ " if task[\"task_id\"] in merged_tasks:\n",
112
+ " if \"comments\" in task and task[\"comments\"]:\n",
113
+ " try:\n",
114
+ " merged_tasks[task[\"task_id\"]][\"comments\"].extend(\n",
115
+ " task[\"comments\"]\n",
116
+ " )\n",
117
+ " except KeyError:\n",
118
+ " merged_tasks[task[\"task_id\"]][\"comments\"] = [task[\"comments\"]]\n",
119
+ "\n",
120
+ " if \"flagged\" in task:\n",
121
+ " try:\n",
122
+ " merged_tasks[task[\"task_id\"]][\"flagged\"] = (\n",
123
+ " merged_tasks[task[\"task_id\"]][\"flagged\"] or task[\"flagged\"]\n",
124
+ " )\n",
125
+ " except KeyError:\n",
126
+ " merged_tasks[task[\"task_id\"]][\"flagged\"] = task[\"flagged\"]\n",
127
+ " else:\n",
128
+ " merged_tasks[task[\"task_id\"]] = task\n",
129
+ "\n",
130
+ " # Step 3: Find candidate models\n",
131
+ " # Criterion: A group of models which has evaluations for all tasks\n",
132
+ " candidate_models = {\n",
133
+ " model_id: all_models[model_id]\n",
134
+ " for model_id in set.intersection(*list(tasks_to_models.values()))\n",
135
+ " }\n",
136
+ "\n",
137
+ " # Step 4: Create potential filters\n",
138
+ " candidate_filters = all_filters\n",
139
+ " for task in merged_tasks.values():\n",
140
+ " candidate_filters = candidate_filters.intersection(task.keys())\n",
141
+ "\n",
142
+ " # Step 4: Return\n",
143
+ " if candidate_models:\n",
144
+ " return {\n",
145
+ " \"name\": f\"Merged from ${len(inputs)} files\",\n",
146
+ " \"filters\": list(candidate_filters),\n",
147
+ " \"models\": list(candidate_models.values()),\n",
148
+ " \"metrics\": inputs[0][\"metrics\"],\n",
149
+ " \"documents\": inputs[0][\"documents\"],\n",
150
+ " \"tasks\": inputs[0][\"tasks\"],\n",
151
+ " \"evaluations\": [\n",
152
+ " evaluations[f\"{task['task_id']}<:SEP:>{model_id}\"]\n",
153
+ " for task in inputs[0][\"tasks\"]\n",
154
+ " for model_id in candidate_models\n",
155
+ " ],\n",
156
+ " }\n",
157
+ " else:\n",
158
+ " print(\"Failed to find models with evaluations for all tasks.\")\n",
159
+ " return None\n",
160
+ "\n",
161
+ "\n",
162
+ "# =========================================================\n",
163
+ "# EXECUTE\n",
164
+ "# =========================================================\n",
165
+ "# Step 1: Load input files to be merged\n",
166
+ "inputs = [\n",
167
+ " read_json(\n",
168
+ " filename=\"<PATH TO INPUT JSON 1>\"\n",
169
+ " ),\n",
170
+ " read_json(\n",
171
+ " filename=\"<PATH TO INPUT JSON 2>\"\n",
172
+ " ),\n",
173
+ "]\n",
174
+ "\n",
175
+ "# Step 2: Run merging function\n",
176
+ "output = merge(inputs=inputs)\n",
177
+ "\n",
178
+ "# Step 3: Save merged output\n",
179
+ "if output:\n",
180
+ " write_json(\n",
181
+ " filename=\"<PATH TO MERGED FILE>\",\n",
182
+ " content=output,\n",
183
+ " )"
184
+ ]
185
+ }
186
+ ],
187
+ "metadata": {
188
+ "kernelspec": {
189
+ "display_name": "Python 3 (ipykernel)",
190
+ "language": "python",
191
+ "name": "python3"
192
+ },
193
+ "language_info": {
194
+ "codemirror_mode": {
195
+ "name": "ipython",
196
+ "version": 3
197
+ },
198
+ "file_extension": ".py",
199
+ "mimetype": "text/x-python",
200
+ "name": "python",
201
+ "nbconvert_exporter": "python",
202
+ "pygments_lexer": "ipython3",
203
+ "version": "3.10.13"
204
+ }
205
+ },
206
+ "nbformat": 4,
207
+ "nbformat_minor": 1
208
+ }
notebooks/validate_input_file.ipynb ADDED
@@ -0,0 +1,487 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "attachments": {},
5
+ "cell_type": "markdown",
6
+ "metadata": {
7
+ "collapsed": true,
8
+ "pycharm": {
9
+ "name": "#%% md\n"
10
+ }
11
+ },
12
+ "source": [
13
+ "# Validate analytics JSON\n",
14
+ "\n",
15
+ "### ✅ Prerequisites\n",
16
+ "\n",
17
+ "[Python 3.10](https://www.python.org/downloads/)\n"
18
+ ]
19
+ },
20
+ {
21
+ "cell_type": "code",
22
+ "execution_count": null,
23
+ "metadata": {},
24
+ "outputs": [],
25
+ "source": [
26
+ "from typing import Literal\n",
27
+ "import json\n",
28
+ "\n",
29
+ "def read_json(filename: str, encoding=\"utf-8\"):\n",
30
+ " with open(filename, mode=\"r\", encoding=encoding) as fp:\n",
31
+ " return json.load(fp)\n",
32
+ "\n",
33
+ "\n",
34
+ "def is_valid_model(model: dict) -> bool:\n",
35
+ " if \"model_id\" not in model:\n",
36
+ " raise ValueError(f\"Missing mandatory 'model_id' field in {model}\")\n",
37
+ " if \"name\" not in model:\n",
38
+ " raise ValueError(f\"Missing mandatory 'model_id' field in {model}\")\n",
39
+ " if \"owner\" not in model:\n",
40
+ " raise ValueError(f\"Missing mandatory 'model_id' field in {model}\")\n",
41
+ "\n",
42
+ " return True\n",
43
+ "\n",
44
+ "\n",
45
+ "def is_valid_metric(metric: dict) -> bool:\n",
46
+ " def is_valid_metric_value(metric_value: dict) -> bool:\n",
47
+ " # Validate \"value\" field\n",
48
+ " if \"value\" not in metric_value or not metric_value[\"value\"]:\n",
49
+ " raise ValueError(f\"Missing mandatory 'value' field in {metric_value}\")\n",
50
+ "\n",
51
+ " if not (\n",
52
+ " isinstance(metric_value[\"value\"], str)\n",
53
+ " or isinstance(metric_value[\"value\"], float)\n",
54
+ " or isinstance(metric_value[\"value\"], int)\n",
55
+ " ):\n",
56
+ " raise ValueError(\n",
57
+ " f\"Invalid type: {type(metric_value['value'])} for 'value' field in {metric_value}\"\n",
58
+ " )\n",
59
+ "\n",
60
+ " return True\n",
61
+ "\n",
62
+ " # Validate \"name\" field\n",
63
+ " if \"name\" not in metric:\n",
64
+ " raise ValueError(f\"Missing mandatory 'name' field in {metric}\")\n",
65
+ "\n",
66
+ " if not isinstance(metric[\"name\"], str):\n",
67
+ " raise ValueError(\n",
68
+ " f\"Invalid type: {type(metric['name'])} for 'name' field in {metric}\"\n",
69
+ " )\n",
70
+ "\n",
71
+ " # Validate \"author\" field\n",
72
+ " if \"author\" not in metric:\n",
73
+ " raise ValueError(f\"Missing mandatory 'name' field in {metric}\")\n",
74
+ "\n",
75
+ " if not isinstance(metric[\"author\"], str):\n",
76
+ " raise ValueError(\n",
77
+ " f\"Invalid type: {type(metric['author'])} for 'author' field in {metric}\"\n",
78
+ " )\n",
79
+ "\n",
80
+ " if metric[\"author\"] not in [\"human\", \"algorithm\"]:\n",
81
+ " raise ValueError(f\"Unsupported author: {metric['author']} in {metric}\")\n",
82
+ "\n",
83
+ " # Validate \"type\" field\n",
84
+ " if \"type\" not in metric:\n",
85
+ " raise ValueError(f\"Missing mandatory 'type' field in {metric}\")\n",
86
+ "\n",
87
+ " if metric[\"type\"] not in [\"categorical\", \"numerical\", \"text\"]:\n",
88
+ " raise ValueError(f\"Unsupported type: {metric['type']} in {metric}\")\n",
89
+ "\n",
90
+ " # Validate \"categorical\" type metric\n",
91
+ " if metric[\"type\"] == \"categorical\" and (\n",
92
+ " \"values\" not in metric or not metric[\"values\"]\n",
93
+ " ):\n",
94
+ " raise ValueError(\n",
95
+ " f\"Missing mandatory 'values' field for 'categorical' type metric in {metric}\"\n",
96
+ " )\n",
97
+ "\n",
98
+ " if metric[\"type\"] == \"categorical\" and not all(\n",
99
+ " [\n",
100
+ " is_valid_metric_value(metric_value=metric_value)\n",
101
+ " for metric_value in metric[\"values\"]\n",
102
+ " ]\n",
103
+ " ):\n",
104
+ " raise ValueError(\n",
105
+ " f\"Invalid metric values for 'categorical' type of metric in {metric}\"\n",
106
+ " )\n",
107
+ "\n",
108
+ " # Validate \"numerical\" type metric\n",
109
+ " if metric[\"type\"] == \"numerical\" and not (\n",
110
+ " \"range\" in metric or metric[\"range\"] or 2 <= len(metric[\"range\"]) > 3\n",
111
+ " ):\n",
112
+ " raise ValueError(\n",
113
+ " f\"Missing or invalid 'range' field for 'numerical' type of metric in {metric}\"\n",
114
+ " )\n",
115
+ "\n",
116
+ " # Validate \"aggregator\" field\n",
117
+ " if metric[\"type\"] != \"text\" and \"aggregator\" not in metric:\n",
118
+ " raise ValueError(f\"Missing mandatory 'aggregator' field in {metric}\")\n",
119
+ "\n",
120
+ " if metric[\"type\"] == \"numerical\" and metric[\"aggregator\"] != \"average\":\n",
121
+ " raise ValueError(\n",
122
+ " f\"Invalid 'aggregator' field for 'numerical' type of metric in {metric}\"\n",
123
+ " )\n",
124
+ "\n",
125
+ " # Validate 'display_name' field, if present\n",
126
+ " if \"display_name\" in metric and not isinstance(metric[\"display_name\"], str):\n",
127
+ " raise ValueError(\n",
128
+ " f\"Invalid type: {type(metric['display_name'])} for 'display_name' field in {metric}\"\n",
129
+ " )\n",
130
+ "\n",
131
+ " return True\n",
132
+ "\n",
133
+ "\n",
134
+ "def is_valid_document(document: dict) -> bool:\n",
135
+ " # Validate \"document_id\" field\n",
136
+ " if \"document_id\" not in document:\n",
137
+ " raise ValueError(f\"Missing mandatory 'document_id' field in {document}\")\n",
138
+ "\n",
139
+ " if not isinstance(document[\"document_id\"], str):\n",
140
+ " raise ValueError(\n",
141
+ " f\"Invalid type: {type(document['document_id'])} for 'document_id' field in {document}\"\n",
142
+ " )\n",
143
+ "\n",
144
+ " # Validate \"text\" field\n",
145
+ " if \"text\" not in document:\n",
146
+ " raise ValueError(f\"Missing mandatory 'text' field in {document}\")\n",
147
+ "\n",
148
+ " if not isinstance(document[\"text\"], str):\n",
149
+ " raise ValueError(\n",
150
+ " f\"Invalid type: {type(document['text'])} for 'text' field in {document}\"\n",
151
+ " )\n",
152
+ "\n",
153
+ " # Validate 'title' field, if present\n",
154
+ " if \"title\" in document and not isinstance(document[\"title\"], str):\n",
155
+ " raise ValueError(\n",
156
+ " f\"Invalid type: {type(document['title'])} for 'title' field in {document}\"\n",
157
+ " )\n",
158
+ "\n",
159
+ " # Validate 'url' field, if present\n",
160
+ " if \"url\" in document and not isinstance(document[\"url\"], str):\n",
161
+ " raise ValueError(\n",
162
+ " f\"Invalid type: {type(document['url'])} for 'url' field in {document}\"\n",
163
+ " )\n",
164
+ "\n",
165
+ " return True\n",
166
+ "\n",
167
+ "\n",
168
+ "def is_valid_task(task: dict) -> bool:\n",
169
+ " def is_valid_context(context: dict) -> bool:\n",
170
+ " # Validate \"document_id\" field\n",
171
+ " if \"document_id\" not in context:\n",
172
+ " raise ValueError(f\"Missing mandatory 'document_id' field in {context}\")\n",
173
+ "\n",
174
+ " if not isinstance(context[\"document_id\"], str):\n",
175
+ " raise ValueError(\n",
176
+ " f\"Invalid type: {type(context['document_id'])} for 'document_id' field in {context}\"\n",
177
+ " )\n",
178
+ "\n",
179
+ " return True\n",
180
+ "\n",
181
+ " # Validate \"task_id\" field\n",
182
+ " if \"task_id\" not in task:\n",
183
+ " raise ValueError(f\"Missing mandatory 'task_id' field in {task}\")\n",
184
+ "\n",
185
+ " if not isinstance(task[\"task_id\"], str):\n",
186
+ " raise ValueError(\n",
187
+ " f\"Invalid type: {type(task['task_id'])} for 'task_id' field in {task}\"\n",
188
+ " )\n",
189
+ "\n",
190
+ " # Validate \"task_type\" field\n",
191
+ " if \"task_type\" not in task:\n",
192
+ " raise ValueError(f\"Missing mandatory 'task_type' field in {task}\")\n",
193
+ "\n",
194
+ " if not isinstance(task[\"task_type\"], str):\n",
195
+ " raise ValueError(\n",
196
+ " f\"Invalid type: {type(task['task_type'])} for 'task_type' field in {task}\"\n",
197
+ " )\n",
198
+ "\n",
199
+ " if task[\"task_type\"] not in [\"question_answering\", \"conversation\", \"rag\", \"text_generation\", \"json_generation\"]:\n",
200
+ " raise ValueError(f\"Invalid task_type: {task['task_type']} in {task}\")\n",
201
+ "\n",
202
+ " # Validate `contexts` field\n",
203
+ " if not all([is_valid_context(context=context) for context in task[\"contexts\"]]):\n",
204
+ " raise ValueError(f\"Invalid context values in {task}\")\n",
205
+ "\n",
206
+ " return True\n",
207
+ "\n",
208
+ "\n",
209
+ "def is_valid_evaluation(\n",
210
+ " evaluation: dict, metrics: list[str], models: list[str]\n",
211
+ ") -> bool:\n",
212
+ " def is_valid_annotations(annotations: dict, metric: str) -> bool:\n",
213
+ " for annotator_id, rating in annotations.items():\n",
214
+ " if not isinstance(annotator_id, str):\n",
215
+ " raise ValueError(\n",
216
+ " f\"Invalid type: {type(annotator_id)} for 'annotator_id' in {annotations} for '{metric}' metric in evaluation with with task_id: {evaluation['task_id']} and model_id: {evaluation['model_id']}\"\n",
217
+ " )\n",
218
+ "\n",
219
+ " if not isinstance(rating, dict):\n",
220
+ " raise ValueError(\n",
221
+ " f\"Invalid type: {type(rating)} for 'rating' in {annotations} for '{metric}' metric in evaluation with with task_id: {evaluation['task_id']} and model_id: {evaluation['model_id']}\"\n",
222
+ " )\n",
223
+ "\n",
224
+ " # Validate \"task_id\" field\n",
225
+ " if \"value\" not in rating:\n",
226
+ " raise ValueError(\n",
227
+ " f\"Missing mandatory 'value' field in {rating} for '{metric}' metric in evaluation with with task_id: {evaluation['task_id']} and model_id: {evaluation['model_id']}\"\n",
228
+ " )\n",
229
+ "\n",
230
+ " if not (\n",
231
+ " isinstance(rating[\"value\"], str)\n",
232
+ " or isinstance(rating[\"value\"], float)\n",
233
+ " or isinstance(rating[\"value\"], int)\n",
234
+ " ):\n",
235
+ " raise ValueError(\n",
236
+ " f\"Invalid type: {type(rating['value'])} for 'value' in {rating} for '{metric}' metric in evaluation with with task_id: {evaluation['task_id']} and model_id: {evaluation['model_id']}\"\n",
237
+ " )\n",
238
+ "\n",
239
+ " return True\n",
240
+ "\n",
241
+ " # Validate \"task_id\" field\n",
242
+ " if \"task_id\" not in evaluation:\n",
243
+ " raise ValueError(f\"Missing mandatory 'task_id' field in {evaluation}\")\n",
244
+ "\n",
245
+ " if not isinstance(evaluation[\"task_id\"], str):\n",
246
+ " raise ValueError(\n",
247
+ " f\"Invalid type: {type(evaluation['task_id'])} for 'task_id' field in {evaluation}\"\n",
248
+ " )\n",
249
+ "\n",
250
+ " # Validate \"model_id\" field\n",
251
+ " if \"model_id\" not in evaluation:\n",
252
+ " raise ValueError(f\"Missing mandatory 'model_id' field in {evaluation}\")\n",
253
+ "\n",
254
+ " if not isinstance(evaluation[\"model_id\"], str):\n",
255
+ " raise ValueError(\n",
256
+ " f\"Invalid type: {type(evaluation['model_id'])} for 'model_id' field in {evaluation}\"\n",
257
+ " )\n",
258
+ "\n",
259
+ " if evaluation[\"model_id\"] not in models:\n",
260
+ " raise ValueError(\n",
261
+ " f\"Invalid model with model_id: {evaluation['model_id']} for evaluation with task_id: {evaluation['task_id']}\"\n",
262
+ " )\n",
263
+ "\n",
264
+ " # Validate \"model_response\" field\n",
265
+ " if \"task_id\" not in evaluation:\n",
266
+ " raise ValueError(f\"Missing mandatory 'model_response' field in {evaluation}\")\n",
267
+ "\n",
268
+ " if not isinstance(evaluation[\"model_response\"], str):\n",
269
+ " raise ValueError(\n",
270
+ " f\"Invalid type: {type(evaluation['model_response'])} for 'model_response' field in {evaluation}\"\n",
271
+ " )\n",
272
+ "\n",
273
+ " # Validate \"annotations\" field\n",
274
+ " if \"annotations\" not in evaluation:\n",
275
+ " raise ValueError(f\"Missing mandatory 'annotations' field in {evaluation}\")\n",
276
+ "\n",
277
+ " if not all(\n",
278
+ " is_valid_annotations(annotations=annotations, metric=metric)\n",
279
+ " for metric, annotations in evaluation[\"annotations\"].items()\n",
280
+ " ):\n",
281
+ " raise ValueError(\n",
282
+ " f\"Invalid annotations in evaluation with with task_id: {evaluation['task_id']} and model_id: {evaluation['model_id']}\"\n",
283
+ " )\n",
284
+ "\n",
285
+ " return True\n",
286
+ "\n",
287
+ "\n",
288
+ "def validate(data: dict, level: Literal[\"minimal\", \"aggresive\"] = \"minimal\") -> None:\n",
289
+ " # Validate \"models\" field\n",
290
+ " if \"models\" not in data:\n",
291
+ " raise ValueError(f\"Missing mandatory 'models' field in {data}\")\n",
292
+ "\n",
293
+ " if not all(is_valid_model(model) for model in data[\"models\"]):\n",
294
+ " raise ValueError(f\"Invalid model in {data['models']}\")\n",
295
+ "\n",
296
+ " # Validate \"metrics\" field\n",
297
+ " if \"metrics\" not in data:\n",
298
+ " raise ValueError(f\"Missing mandatory 'metrics' field in {data}\")\n",
299
+ "\n",
300
+ " if not all(is_valid_metric(metric) for metric in data[\"metrics\"]):\n",
301
+ " raise ValueError(f\"Invalid metric in {data['metrics']}\")\n",
302
+ "\n",
303
+ " # Validate \"documents\" field\n",
304
+ " if \"documents\" not in data:\n",
305
+ " raise ValueError(f\"Missing mandatory 'documents' field in {data}\")\n",
306
+ "\n",
307
+ " if not all(is_valid_document(document) for document in data[\"documents\"]):\n",
308
+ " raise ValueError(f\"Invalid document in {data['documents']}\")\n",
309
+ "\n",
310
+ " # Validate \"tasks\" field\n",
311
+ " if \"tasks\" not in data:\n",
312
+ " raise ValueError(f\"Missing mandatory 'tasks' field in {data}\")\n",
313
+ "\n",
314
+ " if not all(is_valid_task(task) for task in data[\"tasks\"]):\n",
315
+ " raise ValueError(f\"Invalid task in {data['tasks']}\")\n",
316
+ "\n",
317
+ " # Warn about duplicate task IDs\n",
318
+ " task_ids = set()\n",
319
+ " for task in data[\"tasks\"]:\n",
320
+ " task_id = task[\"task_id\"]\n",
321
+ " if task_id in task_ids:\n",
322
+ " print(f\"Duplicate task_id: {task_id} found in 'tasks' field\")\n",
323
+ " else:\n",
324
+ " task_ids.add(task_id)\n",
325
+ "\n",
326
+ " # Validate \"evaluations\" field\n",
327
+ " if \"evaluations\" not in data:\n",
328
+ " raise ValueError(f\"Missing mandatory 'evaluations' field in {data}\")\n",
329
+ "\n",
330
+ " applicable_metrics = [metric[\"name\"] for metric in data[\"metrics\"]]\n",
331
+ " applicable_models = [model[\"model_id\"] for model in data[\"models\"]]\n",
332
+ " if not all(\n",
333
+ " is_valid_evaluation(\n",
334
+ " evaluation, metrics=applicable_metrics, models=applicable_models\n",
335
+ " )\n",
336
+ " for evaluation in data[\"evaluations\"]\n",
337
+ " ):\n",
338
+ " raise ValueError(f\"Invalid evaluation in {data['evaluations']}\")\n",
339
+ "\n",
340
+ " # Validate evaluations exists for all task for all models with all metrics\n",
341
+ " evaluated_models_per_task = {}\n",
342
+ " evaluated_metrics_per_model_per_task = {}\n",
343
+ " for evaluation in data[\"evaluations\"]:\n",
344
+ " task_id = evaluation[\"task_id\"]\n",
345
+ " model_id = evaluation[\"model_id\"]\n",
346
+ " try:\n",
347
+ " evaluated_models_per_task[task_id].append(model_id)\n",
348
+ " except KeyError:\n",
349
+ " evaluated_models_per_task[task_id] = [model_id]\n",
350
+ "\n",
351
+ " for metric in evaluation[\"annotations\"].keys():\n",
352
+ " try:\n",
353
+ " evaluated_metrics_per_model_per_task[f\"{task_id}:++:{model_id}\"].append(\n",
354
+ " metric\n",
355
+ " )\n",
356
+ " except KeyError:\n",
357
+ " evaluated_metrics_per_model_per_task[f\"{task_id}:++:{model_id}\"] = [\n",
358
+ " metric\n",
359
+ " ]\n",
360
+ "\n",
361
+ " evaluated_task_ids = set(evaluated_models_per_task.keys())\n",
362
+ " if evaluated_task_ids != task_ids:\n",
363
+ " if len(evaluated_task_ids) > len(task_ids):\n",
364
+ " print(\n",
365
+ " f\"Evaluations found for following additional tasks: {evaluated_task_ids - task_ids}\"\n",
366
+ " )\n",
367
+ " elif len(task_ids) > len(evaluated_task_ids):\n",
368
+ " print(\n",
369
+ " f\"Missing evaluations following tasks: {task_ids - evaluated_task_ids}\"\n",
370
+ " )\n",
371
+ " else:\n",
372
+ " print(\n",
373
+ " f\"Missing evaluations following tasks: {task_ids - evaluated_task_ids}\"\n",
374
+ " )\n",
375
+ " print(\n",
376
+ " f\"Evaluations found for following additional tasks: {evaluated_task_ids - task_ids}\"\n",
377
+ " )\n",
378
+ "\n",
379
+ " evaluations_with_missing_models = {}\n",
380
+ " evaluations_with_additional_models = {}\n",
381
+ " for task_id, models in evaluated_models_per_task.items():\n",
382
+ " if set(models) != set(applicable_models):\n",
383
+ " if set(applicable_models) - set(models):\n",
384
+ " evaluations_with_missing_models[task_id] = set(applicable_models) - set(\n",
385
+ " models\n",
386
+ " )\n",
387
+ " elif set(models) - set(applicable_models):\n",
388
+ " evaluations_with_additional_models[task_id] = set(models) - set(\n",
389
+ " applicable_models\n",
390
+ " )\n",
391
+ "\n",
392
+ " if evaluations_with_missing_models:\n",
393
+ " for task_id, missing_models in evaluations_with_missing_models.items():\n",
394
+ " print(\n",
395
+ " f\"Missing following models: {missing_models} for task with task_id: {task_id}\"\n",
396
+ " )\n",
397
+ "\n",
398
+ " evaluations_per_model_with_missing_metrics = {}\n",
399
+ " evaluations_per_model_with_additional_metrics = {}\n",
400
+ " for key, metrics in evaluated_metrics_per_model_per_task.items():\n",
401
+ " if set(metrics) != set(applicable_metrics):\n",
402
+ " if set(applicable_metrics) - set(metrics):\n",
403
+ " evaluations_per_model_with_missing_metrics[key] = set(\n",
404
+ " applicable_metrics\n",
405
+ " ) - set(metrics)\n",
406
+ " elif set(metrics) - set(applicable_metrics):\n",
407
+ " evaluations_per_model_with_additional_metrics[key] = set(metrics) - set(\n",
408
+ " applicable_metrics\n",
409
+ " )\n",
410
+ "\n",
411
+ " if evaluations_per_model_with_missing_metrics:\n",
412
+ " for key, missing_metrics in evaluations_per_model_with_missing_metrics.items():\n",
413
+ " segments = key.split(\":++:\")\n",
414
+ " print(\n",
415
+ " f\"Missing following metrics: {missing_metrics} for task with task_id: {segments[0]} and model_id: {segments[1]}\"\n",
416
+ " )\n",
417
+ "\n",
418
+ " # Additional checks\n",
419
+ " if level == \"aggresive\":\n",
420
+ " if evaluations_with_additional_models:\n",
421
+ " print(\"====================================================\")\n",
422
+ " print(\"Evaluations with additional models\")\n",
423
+ " print(\"====================================================\")\n",
424
+ " for (\n",
425
+ " task_id,\n",
426
+ " additional_models,\n",
427
+ " ) in evaluations_with_additional_models.items():\n",
428
+ " print(f\"Task ID: {task_id}\\tAdditional models: {additional_models}\")\n",
429
+ "\n",
430
+ " if evaluations_per_model_with_additional_metrics:\n",
431
+ " print(\"====================================================\")\n",
432
+ " print(\"Evaluations with additional metrics\")\n",
433
+ " print(\"====================================================\")\n",
434
+ " for (\n",
435
+ " key,\n",
436
+ " additional_metrics,\n",
437
+ " ) in evaluations_per_model_with_additional_metrics.items():\n",
438
+ " segments = key.split(\":++:\")\n",
439
+ " print(\n",
440
+ " f\"Task ID: {segments[0]}\\tModel: {segments[1]}\\tAdditional metrics: {additional_metrics}\"\n",
441
+ " )"
442
+ ]
443
+ },
444
+ {
445
+ "cell_type": "markdown",
446
+ "metadata": {},
447
+ "source": [
448
+ "### Run validator\n"
449
+ ]
450
+ },
451
+ {
452
+ "cell_type": "code",
453
+ "execution_count": null,
454
+ "metadata": {},
455
+ "outputs": [],
456
+ "source": [
457
+ "validate(\n",
458
+ " data=read_json(\n",
459
+ " filename=\"<PATH_TO_INPUT_FILE>\"\n",
460
+ " ),\n",
461
+ " level=\"aggresive\",\n",
462
+ ")"
463
+ ]
464
+ }
465
+ ],
466
+ "metadata": {
467
+ "kernelspec": {
468
+ "display_name": "Python 3 (ipykernel)",
469
+ "language": "python",
470
+ "name": "python3"
471
+ },
472
+ "language_info": {
473
+ "codemirror_mode": {
474
+ "name": "ipython",
475
+ "version": 3
476
+ },
477
+ "file_extension": ".py",
478
+ "mimetype": "text/x-python",
479
+ "name": "python",
480
+ "nbconvert_exporter": "python",
481
+ "pygments_lexer": "ipython3",
482
+ "version": "3.10.13"
483
+ }
484
+ },
485
+ "nbformat": 4,
486
+ "nbformat_minor": 1
487
+ }
src/processor.ts CHANGED
@@ -18,7 +18,7 @@
18
 
19
  import { isEmpty, isNumber } from 'lodash';
20
  import { hash } from '@/src/utilities/strings';
21
-
22
  import {
23
  Data,
24
  MetricValue,
@@ -350,7 +350,15 @@ export function exportData(
350
  documents: data.documents,
351
  }),
352
  tasks: data.tasks,
353
- evaluations: data.evaluations,
354
  };
355
 
356
  // Step 1: If tasks are defined
@@ -401,9 +409,17 @@ export function exportData(
401
  documents: Array.from(relevantDocuments),
402
  }),
403
  tasks: tasks,
404
- evaluations: data.evaluations.filter((evaluation) =>
405
- relevantTaskIds.has(evaluation.taskId),
406
- ),
407
  };
408
  } else {
409
  // Step 1.b: Create an object to be exported by copying over tasks information
@@ -416,7 +432,15 @@ export function exportData(
416
  documents: data.documents,
417
  }),
418
  tasks: tasks,
419
- evaluations: data.evaluations,
420
  };
421
  }
422
  }
@@ -428,7 +452,7 @@ export function exportData(
428
  element.setAttribute(
429
  'href',
430
  'data:application/json;charset=utf-8, ' +
431
- encodeURIComponent(JSON.stringify(dataToExport)),
432
  );
433
  element.setAttribute('download', 'analytics.json');
434
 
 
18
 
19
  import { isEmpty, isNumber } from 'lodash';
20
  import { hash } from '@/src/utilities/strings';
21
+ import { snakeCaseKeys } from '@/src/utilities/objects';
22
  import {
23
  Data,
24
  MetricValue,
 
350
  documents: data.documents,
351
  }),
352
  tasks: data.tasks,
353
+ evaluations: data.evaluations.map((evaluation) => {
354
+ return {
355
+ taskId: evaluation.taskId,
356
+ modelId: evaluation.modelId,
357
+ modelResponse: evaluation.modelResponse,
358
+ annotations: evaluation.annotations,
359
+ ...(evaluation.contexts && { contexts: evaluation.contexts }),
360
+ };
361
+ }),
362
  };
363
 
364
  // Step 1: If tasks are defined
 
409
  documents: Array.from(relevantDocuments),
410
  }),
411
  tasks: tasks,
412
+ evaluations: data.evaluations
413
+ .filter((evaluation) => relevantTaskIds.has(evaluation.taskId))
414
+ .map((evaluation) => {
415
+ return {
416
+ taskId: evaluation.taskId,
417
+ modelId: evaluation.modelId,
418
+ modelResponse: evaluation.modelResponse,
419
+ annotations: evaluation.annotations,
420
+ ...(evaluation.contexts && { contexts: evaluation.contexts }),
421
+ };
422
+ }),
423
  };
424
  } else {
425
  // Step 1.b: Create an object to be exported by copying over tasks information
 
432
  documents: data.documents,
433
  }),
434
  tasks: tasks,
435
+ evaluations: data.evaluations.map((evaluation) => {
436
+ return {
437
+ taskId: evaluation.taskId,
438
+ modelId: evaluation.modelId,
439
+ modelResponse: evaluation.modelResponse,
440
+ annotations: evaluation.annotations,
441
+ ...(evaluation.contexts && { contexts: evaluation.contexts }),
442
+ };
443
+ }),
444
  };
445
  }
446
  }
 
452
  element.setAttribute(
453
  'href',
454
  'data:application/json;charset=utf-8, ' +
455
+ encodeURIComponent(JSON.stringify(snakeCaseKeys(dataToExport))),
456
  );
457
  element.setAttribute('download', 'analytics.json');
458
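For reference, a minimal sketch (not the project's actual types) of the per-evaluation mapping that exportData now applies before building the download; EvaluationLike and the uiState field are hypothetical stand-ins for the real TaskEvaluation type, which may carry extra UI-only state that should not be exported.

// Sketch only: mirrors the mapping added in exportData above.
interface EvaluationLike {
  taskId: string;
  modelId: string;
  modelResponse: string;
  annotations: Record<string, unknown>;
  contexts?: { documentId: string }[];
  uiState?: unknown; // hypothetical UI-only field that the export should drop
}

function toExportShape(evaluation: EvaluationLike) {
  // Keep only the fields the exported file needs; contexts is copied only when present.
  return {
    taskId: evaluation.taskId,
    modelId: evaluation.modelId,
    modelResponse: evaluation.modelResponse,
    annotations: evaluation.annotations,
    ...(evaluation.contexts && { contexts: evaluation.contexts }),
  };
}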
 
src/types.ts CHANGED
@@ -182,6 +182,7 @@ export interface Annotation {
182
  readonly timestamp?: number;
183
  readonly duration?: number;
184
  }
 
185
  export interface TaskEvaluation {
186
  readonly taskId: string;
187
  readonly modelId: string;
 
182
  readonly timestamp?: number;
183
  readonly duration?: number;
184
  }
185
+
186
  export interface TaskEvaluation {
187
  readonly taskId: string;
188
  readonly modelId: string;
src/utilities/objects.ts CHANGED
@@ -16,7 +16,7 @@
16
  *
17
  **/
18
 
19
- import { camelCase, isPlainObject, isArray, isEmpty } from 'lodash';
20
 
21
  export function camelCaseKeys(
22
  obj: { [key: string]: any },
@@ -52,6 +52,37 @@ export function camelCaseKeys(
52
  return obj;
53
  }
54

55
  function areArraysIntersecting(
56
  a: string | string[],
57
  b: string | string[],
 
16
  *
17
  **/
18
 
19
+ import { camelCase, snakeCase, isPlainObject, isArray, isEmpty } from 'lodash';
20
 
21
  export function camelCaseKeys(
22
  obj: { [key: string]: any },
 
52
  return obj;
53
  }
54
 
55
+ export function snakeCaseKeys(
56
+ obj: { [key: string]: any },
57
+ keys: string[] = [
58
+ 'taskId',
59
+ 'modelId',
60
+ 'modelResponse',
61
+ 'displayValue',
62
+ 'numericValue',
63
+ 'minValue',
64
+ 'maxValue',
65
+ 'taskType',
66
+ 'documentId',
67
+ 'displayName',
68
+ ],
69
+ ) {
70
+ if (isArray(obj)) {
71
+ return obj.map((v) => snakeCaseKeys(v));
72
+ } else if (isPlainObject(obj)) {
73
+ return Object.keys(obj).reduce(
74
+ (result, key) => ({
75
+ ...result,
76
+ ...(keys.includes(key)
77
+ ? { [snakeCase(key)]: snakeCaseKeys(obj[key]) }
78
+ : { [key]: snakeCaseKeys(obj[key]) }),
79
+ }),
80
+ {},
81
+ );
82
+ }
83
+ return obj;
84
+ }
85
+
86
  function areArraysIntersecting(
87
  a: string | string[],
88
  b: string | string[],
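For reference, a minimal usage sketch for the new snakeCaseKeys helper; the evaluation values below are illustrative only.

import { snakeCaseKeys } from '@/src/utilities/objects';

// Only whitelisted keys (taskId, modelId, modelResponse, ...) are converted;
// other keys, such as 'annotations' and the metric names nested inside it,
// are left unchanged while their values are still recursed into.
const exported = snakeCaseKeys({
  taskId: 'task-1',
  modelId: 'model-a',
  modelResponse: 'Hello world',
  annotations: { fluency: { value: 3 } },
});
// exported:
// {
//   task_id: 'task-1',
//   model_id: 'model-a',
//   model_response: 'Hello world',
//   annotations: { fluency: { value: 3 } }
// }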