diff --git a/.gitattributes b/.gitattributes
index a6344aac8c09253b3b630fb776ae94478aa0275b..c3e1b6e7a83c5205b789014871d63622599c38ad 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -33,3 +33,27 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+evaluation/generation/examples.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_1.jsonl filter=lfs diff=lfs merge=lfs -text
+evaluation/generation/examples.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_3.jsonl filter=lfs diff=lfs merge=lfs -text
+evaluation/generation/examples.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_2.jsonl filter=lfs diff=lfs merge=lfs -text
+evaluation/generation/examples.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl filter=lfs diff=lfs merge=lfs -text
+evaluation/generation/examples.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_3.jsonl filter=lfs diff=lfs merge=lfs -text
+evaluation/generation/examples.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_4.jsonl filter=lfs diff=lfs merge=lfs -text
+evaluation/generation/examples.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_5.jsonl filter=lfs diff=lfs merge=lfs -text
+evaluation/generation/examples.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_0.jsonl filter=lfs diff=lfs merge=lfs -text
+evaluation/generation/examples.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_4.jsonl filter=lfs diff=lfs merge=lfs -text
+evaluation/generation/examples.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_5.jsonl filter=lfs diff=lfs merge=lfs -text
+evaluation/generation/examples.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_3.jsonl filter=lfs diff=lfs merge=lfs -text
+evaluation/generation/examples.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_5.jsonl filter=lfs diff=lfs merge=lfs -text
+evaluation/generation/examples.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_2.jsonl filter=lfs diff=lfs merge=lfs -text
+evaluation/generation/examples.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_3.jsonl filter=lfs diff=lfs merge=lfs -text
+evaluation/generation/examples.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_1.jsonl filter=lfs diff=lfs merge=lfs -text
+evaluation/generation/examples.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_0.jsonl filter=lfs diff=lfs merge=lfs -text
+evaluation/generation/examples.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_2.jsonl filter=lfs diff=lfs merge=lfs -text
+evaluation/generation/examples.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_5.jsonl filter=lfs diff=lfs merge=lfs -text
+evaluation/generation/examples.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_0.jsonl filter=lfs diff=lfs merge=lfs -text
+evaluation/generation/examples.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_1.jsonl filter=lfs diff=lfs merge=lfs -text
+evaluation/generation/examples.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_2.jsonl filter=lfs diff=lfs merge=lfs -text
+evaluation/generation/examples.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_0.jsonl filter=lfs diff=lfs merge=lfs -text
+evaluation/generation/examples.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_1.jsonl filter=lfs diff=lfs merge=lfs -text
+evaluation/generation/examples.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_4.jsonl filter=lfs diff=lfs merge=lfs -text
diff --git a/evaluation/generation/agg.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_0.json b/evaluation/generation/agg.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_0.json
new file mode 100644
index 0000000000000000000000000000000000000000..d3b49c58efc3db325047363ab02eb36b39a4df4d
--- /dev/null
+++ b/evaluation/generation/agg.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_0.json
@@ -0,0 +1 @@
+{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.25518818499117035, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.020198929979072126}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.06424858701254539, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002540662604602708}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.24194520608398293, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.005061888078063388}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.08713006240052369, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0021163575152811052}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.029250897601530847, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0016159150314360084}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.11213489324648325, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0031431788617556264}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.03968682582435309, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0012864152483850072}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.061593275111588554, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0024488839830169046}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.2334118422350547, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0048787714459462665}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.08351980735691193, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001978263321627184}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.06070440310915625, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0024423712896334553}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.22684155273254836, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0046468714919551185}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.08198324173226955, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0019772453103279655}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep2/scratch/project_462000241/muennighoff/checkpoints_2b855boscartasky/transformers,use_accelerate=True,tokenizer=/pfs/lustrep2/scratch/project_462000241/muennighoff/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
\ No newline at end of file
diff --git a/evaluation/generation/agg.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_1.json b/evaluation/generation/agg.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_1.json
new file mode 100644
index 0000000000000000000000000000000000000000..279aa5562f6049ba0c21cb8bfe9a1efc83ac034c
--- /dev/null
+++ b/evaluation/generation/agg.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_1.json
@@ -0,0 +1 @@
+{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.37770333810014706, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.036043662376688344}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.08688444048786848, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0030316834829975468}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.25231836055300055, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004707834740663268}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.11029589045724046, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0026302378640336893}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.039888024685296904, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0019445292240548378}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.12320682798737367, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0031690473734229127}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.050696350493553845, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001601172935799986}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.08067935525490526, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0028191568459744356}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.2400568623297162, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004443711432592927}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.10289898300711169, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002349648102435476}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.081852061098128, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0028519921028834602}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.24140934346009968, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0044596632225301245}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.10416303503161863, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0024045282990398016}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep2/scratch/project_462000241/muennighoff/checkpoints_2b855boscartasky/transformers,use_accelerate=True,tokenizer=/pfs/lustrep2/scratch/project_462000241/muennighoff/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
\ No newline at end of file
diff --git a/evaluation/generation/agg.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_2.json b/evaluation/generation/agg.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_2.json
new file mode 100644
index 0000000000000000000000000000000000000000..050fb7686150a79b1f9d69f5eee7478a69a4fdca
--- /dev/null
+++ b/evaluation/generation/agg.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_2.json
@@ -0,0 +1 @@
+{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.3829018871876502, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.022682302500048476}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.09533308838612936, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0031918959249739934}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.275555629679834, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004727965781775394}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.12236842718754035, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0030225942233901805}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.04438815526628443, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0018839731929603098}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.13438684673727228, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0032179061201762227}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.05746066096268111, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0018708723435892534}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.08478065899787014, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0026474348060842394}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.25951028904964774, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004388247219710807}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.11131192366136156, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0025600729892227623}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.0870012622715152, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002779405775352401}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.2612564683466686, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004413760869764198}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.11327636810594607, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00265570207245623}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep2/scratch/project_462000241/muennighoff/checkpoints_2b855boscartasky/transformers,use_accelerate=True,tokenizer=/pfs/lustrep2/scratch/project_462000241/muennighoff/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
\ No newline at end of file
diff --git a/evaluation/generation/agg.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_3.json b/evaluation/generation/agg.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_3.json
new file mode 100644
index 0000000000000000000000000000000000000000..8cda3958caa93b6915a9aeb1a33f6f61ac9424d0
--- /dev/null
+++ b/evaluation/generation/agg.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_3.json
@@ -0,0 +1 @@
+{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.49438795846003614, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.03220480694116766}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.11102978394749591, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0037258981840235512}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.29324047704391387, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004929158066657188}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.13661508516896614, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0033865188061428593}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.05489349981744968, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002411444050666127}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.1460318403788494, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0034080826963901003}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.06675560716724324, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.002152759177258534}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.0983510025951489, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0031311578361653356}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.27396911503915883, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004527803475867941}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.12331965784710698, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0028547373168783547}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.10118232307247621, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0032732569349238874}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.2768278339741749, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004591647175413129}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.1258905544050262, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0029667713047718076}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep2/scratch/project_462000241/muennighoff/checkpoints_2b855boscartasky/transformers,use_accelerate=True,tokenizer=/pfs/lustrep2/scratch/project_462000241/muennighoff/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
\ No newline at end of file
diff --git a/evaluation/generation/agg.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_4.json b/evaluation/generation/agg.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_4.json
new file mode 100644
index 0000000000000000000000000000000000000000..395de540bdc94cb4417405c9db77c1d3990e0b6a
--- /dev/null
+++ b/evaluation/generation/agg.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_4.json
@@ -0,0 +1 @@
+{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.5097531919668136, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.037846557427194026}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.12660445313737514, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004069129304355684}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.30584134713679634, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004938665654397721}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.15051290928420963, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0035962245168251086}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.06380697763757867, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0025616190740023713}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.1531395402953802, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0033953087678648087}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.07435510796633933, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.002254816031736983}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.11139297723344309, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003440135146254942}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.28386785189457364, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.00447862149321582}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.13483336166859466, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0030388433299652576}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.11506907251192842, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0036096573854085593}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.287989283624223, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004567845010036214}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.13843888381409075, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0031866033556944046}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep2/scratch/project_462000241/muennighoff/checkpoints_2b855boscartasky/transformers,use_accelerate=True,tokenizer=/pfs/lustrep2/scratch/project_462000241/muennighoff/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
\ No newline at end of file
diff --git a/evaluation/generation/agg.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_5.json b/evaluation/generation/agg.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_5.json
new file mode 100644
index 0000000000000000000000000000000000000000..93733e2f1fb3b926ecf26a0a90e3f0597d678067
--- /dev/null
+++ b/evaluation/generation/agg.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_5.json
@@ -0,0 +1 @@
+{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.5493345438208933, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.03156507075706306}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.13145167799279456, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004208017486529403}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.31431356632933516, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004912309978055554}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.15523792842972664, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.003705043227083823}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.06682039356587426, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0026734322118555824}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.1593483911606193, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.003488245506035694}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.07729991696510335, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.002362198278661277}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.11521584250075316, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0035483893462582828}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.2911824630349328, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.00449934920077821}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.13867709585369095, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0031268375342873036}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.11975844765120187, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0037510579951047463}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.29608094482109, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00457219328093606}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.1428199679819932, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0032823766236227112}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep2/scratch/project_462000241/muennighoff/checkpoints_2b855boscartasky/transformers,use_accelerate=True,tokenizer=/pfs/lustrep2/scratch/project_462000241/muennighoff/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
\ No newline at end of file
diff --git a/evaluation/generation/agg.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_0.json b/evaluation/generation/agg.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_0.json
new file mode 100644
index 0000000000000000000000000000000000000000..1a130351ffbc82982aeb4d4e54b7093f7236b2a4
--- /dev/null
+++ b/evaluation/generation/agg.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_0.json
@@ -0,0 +1 @@
+{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.15983607856511065, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0025552324547594915}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.22751168503589744, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0030756037547284122}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.17176517291666785, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002329017824094656}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.03630672989355793, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0010196024456578525}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.052475014894037275, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001428315830366114}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.03932636998031157, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0009987340137274396}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.12519971826411405, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0019474345957730163}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.1828331784278453, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002460695997235263}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.13522236510382263, "fixed_answer_choice_list": null, "dataset_path": 
"GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001715199895595129}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.1479787200014008, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0024084994322680275}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.21009537033174597, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0028681840700951855}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.15863624899200707, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0021698433809270567}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 2.2908341082810773, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.08591697554573632}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep2/scratch/project_462000241/muennighoff/checkpoints_2b855boscartasky/transformers,use_accelerate=True,tokenizer=/pfs/lustrep2/scratch/project_462000241/muennighoff/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
\ No newline at end of file
diff --git a/evaluation/generation/agg.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_1.json b/evaluation/generation/agg.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_1.json
new file mode 100644
index 0000000000000000000000000000000000000000..a6cbf0b356075d211058d195b5d28aaffe350aa4
--- /dev/null
+++ b/evaluation/generation/agg.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_1.json
@@ -0,0 +1 @@
+{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.2074831448217252, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003524413251335538}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.1603567541864711, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002358874310291858}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.1500600406332044, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0019241989868486221}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.048193206661683564, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0020379873823770994}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.030329812370434826, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0011532237790344163}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.02928962923956642, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.000993776980137765}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.16593034599476272, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.002952779433549126}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.1263395938894177, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0018438888158069336}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.11776814806946785, "fixed_answer_choice_list": null, "dataset_path": 
"GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0014763519012599205}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.19732631307428722, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0034023899501457837}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.15136030413475798, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0021926321158690864}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.14183665338988743, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001800224756397767}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 1.8024864455011091, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.09003925624860501}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep2/scratch/project_462000241/muennighoff/checkpoints_2b855boscartasky/transformers,use_accelerate=True,tokenizer=/pfs/lustrep2/scratch/project_462000241/muennighoff/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
\ No newline at end of file
diff --git a/evaluation/generation/agg.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_2.json b/evaluation/generation/agg.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_2.json
new file mode 100644
index 0000000000000000000000000000000000000000..9830ec634a489eba63f9c7596121a1386ce1a738
--- /dev/null
+++ b/evaluation/generation/agg.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_2.json
@@ -0,0 +1 @@
+{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.28596147697474456, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0040250344618331995}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.1993639467154353, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002664319477833243}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.1947292213306491, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0021918313174583916}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.08090209451722337, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0024314880977671276}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.05022891340202324, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0013490243270254702}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.049895782057071895, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0012048654152258728}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.22915557741483644, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0033926337363906784}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.15804196664911024, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002108622273540943}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.15366314427734978, "fixed_answer_choice_list": null, "dataset_path": 
"GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0016931718188903863}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.2703085422322766, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003864247203591127}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.18781780308286916, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0025166845226557905}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.18325876600451932, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0020600409806209663}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 2.9479127015875357, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07796616677370145}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep2/scratch/project_462000241/muennighoff/checkpoints_2b855boscartasky/transformers,use_accelerate=True,tokenizer=/pfs/lustrep2/scratch/project_462000241/muennighoff/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
\ No newline at end of file
diff --git a/evaluation/generation/agg.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_3.json b/evaluation/generation/agg.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_3.json
new file mode 100644
index 0000000000000000000000000000000000000000..9fe1bce464c037bee4997bf17fed431a98d8bd1b
--- /dev/null
+++ b/evaluation/generation/agg.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_3.json
@@ -0,0 +1 @@
+{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.26271995584299296, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004350571081047754}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.17195910512668672, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0028930003693202434}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.17047407122919256, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0024381191437714608}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.07716157594080365, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0025215008865493967}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.04592292328627891, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0014011960521198414}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.04580407896668463, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001240136840680315}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.2129040576929948, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0036832280907527527}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.13748507108736507, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0023122364311355848}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.1358429496475672, "fixed_answer_choice_list": null, "dataset_path": 
"GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0019120675392480628}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.24858343528470517, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0041894189677485535}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.1614787885865035, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0027151405967413447}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.16017361729618249, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0022880671169300927}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 2.368296526557304, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06715896563436964}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep2/scratch/project_462000241/muennighoff/checkpoints_2b855boscartasky/transformers,use_accelerate=True,tokenizer=/pfs/lustrep2/scratch/project_462000241/muennighoff/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
\ No newline at end of file
diff --git a/evaluation/generation/agg.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_4.json b/evaluation/generation/agg.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_4.json
new file mode 100644
index 0000000000000000000000000000000000000000..4cec243a53f0c20bce96dfb4352f9118005f1d21
--- /dev/null
+++ b/evaluation/generation/agg.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_4.json
@@ -0,0 +1 @@
+{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.09226808587029187, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0035951645035662144}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.05378214914394185, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0021922619377159835}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.05527677687040673, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002075334915597304}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.028532578970617953, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0018587749480454441}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.01548143136364689, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0010135388288224974}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.015865483046525946, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0009374894411648104}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.07725545552877301, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0031119081702266804}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.04393223408477368, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0018056545763819988}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.04511311086771814, "fixed_answer_choice_list": null, "dataset_path": 
"GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0016959499204821453}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.08723167275286887, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003438050373646056}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.05010740359653005, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0020446842446827317}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.05162745766582887, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0019361443382161162}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 0.03702293319391888, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.008280750551799155}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep2/scratch/project_462000241/muennighoff/checkpoints_2b855boscartasky/transformers,use_accelerate=True,tokenizer=/pfs/lustrep2/scratch/project_462000241/muennighoff/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
\ No newline at end of file
diff --git a/evaluation/generation/agg.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_5.json b/evaluation/generation/agg.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_5.json
new file mode 100644
index 0000000000000000000000000000000000000000..e9f1b692fd1a555bf68a2decfbc41020d1f254fa
--- /dev/null
+++ b/evaluation/generation/agg.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_5.json
@@ -0,0 +1 @@
+{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.01624748312843043, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0017343102214544073}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.008973352017838115, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0009874748509305067}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.008715725437140403, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0008716993431018379}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.005494197830878659, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.000991214742335091}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.002558561959956041, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0003855003265324329}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.002496646429114283, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00036357062963770885}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.014045751930703699, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0015619179482028879}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.0075965284642917335, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.000846021103064854}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.0072939865630402565, "fixed_answer_choice_list": null, "dataset_path": 
"GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0007275988588633965}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.015553958884576207, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001678423693533492}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.008515152310636652, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0009413374007224053}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.008294111648957665, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0008329997418165786}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 6.329549235648946e-15, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 6.98643056969485e-14}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep2/scratch/project_462000241/muennighoff/checkpoints_2b855boscartasky/transformers,use_accelerate=True,tokenizer=/pfs/lustrep2/scratch/project_462000241/muennighoff/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
\ No newline at end of file
diff --git a/evaluation/generation/agg.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_0.json b/evaluation/generation/agg.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_0.json
new file mode 100644
index 0000000000000000000000000000000000000000..5dfd6795a4abb8b2bf99649305fe6a5e34de5e58
--- /dev/null
+++ b/evaluation/generation/agg.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_0.json
@@ -0,0 +1 @@
+{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 3.4385508172058055, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.04264456511068813}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.3745168009518315, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0054770341977520625}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.29958589610084396, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0033801849814915222}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.2767986999806955, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0032477671776355997}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.20297808777795995, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.005522822982953256}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.10216853633525805, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0019971383737075166}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.097088577058388, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0017064511604233982}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.29931908976323446, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.005159547193434527}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.2284780514712017, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0023634097297777477}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.20603167711573117, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002072804178131474}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.34463645738569776, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.005423162557411763}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.2657387698192473, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003165191193253911}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.2467760500253612, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00298962192866607}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep2/scratch/project_462000241/muennighoff/checkpoints_2b855boscartasky/transformers,use_accelerate=True,tokenizer=/pfs/lustrep2/scratch/project_462000241/muennighoff/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
\ No newline at end of file
diff --git a/evaluation/generation/agg.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_1.json b/evaluation/generation/agg.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_1.json
new file mode 100644
index 0000000000000000000000000000000000000000..1eca0681e4146a3c61cb871e464589fd0f8f7074
--- /dev/null
+++ b/evaluation/generation/agg.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_1.json
@@ -0,0 +1 @@
+{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 11.578509877350227, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.10981566633879294}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.5290497807915124, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0034031031148912877}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.42449982582372703, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0029796948529652516}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.4430699483900025, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0024212750351764543}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.24509073269483966, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0026669698385618284}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.19440129139428716, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0021684401687693156}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.2028107162678877, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0020310061478079093}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.38881065801490833, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0029605217311991657}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.31090092383731366, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002452100105085375}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.324119071682265, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0020761088086072486}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.43465610390683307, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0032381620072339475}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.348028905687337, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00275713194366831}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.36322533452022043, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002355969259137101}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep2/scratch/project_462000241/muennighoff/checkpoints_2b855boscartasky/transformers,use_accelerate=True,tokenizer=/pfs/lustrep2/scratch/project_462000241/muennighoff/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
\ No newline at end of file
diff --git a/evaluation/generation/agg.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_2.json b/evaluation/generation/agg.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_2.json
new file mode 100644
index 0000000000000000000000000000000000000000..b2c79423124483e69790ccefb8cbc92b99eae611
--- /dev/null
+++ b/evaluation/generation/agg.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_2.json
@@ -0,0 +1 @@
+{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 13.973742807059493, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.21933451006645593}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.5463581275257614, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0033109746053900106}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.4599780026221357, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002866451132503217}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.4734319432946387, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002277730687363045}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.26984908227570326, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0026581113600308783}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.22503111854291266, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0022246934402712576}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.23112076704320592, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.002029026468778622}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.40847493701179866, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0029328594326759366}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.3432828614570734, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0024768383332246274}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.35285117962076507, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002056984438278394}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.4597128692889795, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003192730444475765}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.38674562932968487, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002735034124705561}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.3979842693641427, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002298871731116783}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep2/scratch/project_462000241/muennighoff/checkpoints_2b855boscartasky/transformers,use_accelerate=True,tokenizer=/pfs/lustrep2/scratch/project_462000241/muennighoff/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
\ No newline at end of file
diff --git a/evaluation/generation/agg.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_3.json b/evaluation/generation/agg.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_3.json
new file mode 100644
index 0000000000000000000000000000000000000000..7c3a6dc82fcb764283aa7e2f2c3b44f051f35945
--- /dev/null
+++ b/evaluation/generation/agg.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_3.json
@@ -0,0 +1 @@
+{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 14.533287312181702, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.18656720619703301}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.5451265264934796, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003147288201601649}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.4635452060391057, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002820441604942112}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.478142190779417, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002235263328285793}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.2714328734244194, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002583914662030242}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.23027470724704344, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0022949497645592134}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.23634052735419925, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0020601993739155743}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.40976835157500135, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.002821032707227765}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.3479477017249916, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002470400553707357}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.3585015271693414, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002048646341232902}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.46121561084482127, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003066679903899239}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.3924920651465138, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0027314711241234945}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.404637798062588, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002300343226701871}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep2/scratch/project_462000241/muennighoff/checkpoints_2b855boscartasky/transformers,use_accelerate=True,tokenizer=/pfs/lustrep2/scratch/project_462000241/muennighoff/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
\ No newline at end of file
diff --git a/evaluation/generation/agg.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_4.json b/evaluation/generation/agg.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_4.json
new file mode 100644
index 0000000000000000000000000000000000000000..158e5f09e47ceb2e20027a9f8a308256c03a7b52
--- /dev/null
+++ b/evaluation/generation/agg.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_4.json
@@ -0,0 +1 @@
+{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 14.788126595166384, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.18690040009176595}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.5482226243244983, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0032124524377117303}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.46345359030928057, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0028329342309959572}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.48045922670165075, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002311569925954736}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.27446812632372075, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0026299321970613995}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.2307927377768704, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002303932783753952}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.23850438072833094, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.002104880750018495}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.41268229797839345, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.002885772570892295}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.34835409226615954, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0024733395312169725}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.36091724958116495, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0021212744636126225}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.46563464741217186, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0031424166236039795}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.39431106312311454, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0027657116569750795}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.4083787027633123, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002378318389395946}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep2/scratch/project_462000241/muennighoff/checkpoints_2b855boscartasky/transformers,use_accelerate=True,tokenizer=/pfs/lustrep2/scratch/project_462000241/muennighoff/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
\ No newline at end of file
diff --git a/evaluation/generation/agg.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_5.json b/evaluation/generation/agg.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_5.json
new file mode 100644
index 0000000000000000000000000000000000000000..a95513b68fffa51821ce7f700771689bbecd1d37
--- /dev/null
+++ b/evaluation/generation/agg.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_5.json
@@ -0,0 +1 @@
+{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 14.816898114308659, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.2578943709075645}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.5463965076188966, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0031073173050291266}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.464786934938444, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0027602961923400638}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.4813258409710542, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0022155647522670184}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.2741684381980198, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002611976915331882}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.23130694014233213, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002261839435649896}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.23912604624839506, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.002085017928004898}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.4127791623423059, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0027864194329252517}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.35037713824014954, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002413674972879658}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.3627614127879427, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002049692110741263}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.4665920267581706, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0030279579394666637}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.3970310588904633, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0026745772949146518}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.41096147149083767, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0022723564955560572}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep2/scratch/project_462000241/muennighoff/checkpoints_2b855boscartasky/transformers,use_accelerate=True,tokenizer=/pfs/lustrep2/scratch/project_462000241/muennighoff/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
\ No newline at end of file
diff --git a/evaluation/generation/agg.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_0.json b/evaluation/generation/agg.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_0.json
new file mode 100644
index 0000000000000000000000000000000000000000..17df8be1d9fcc555d15adefaf7c66d12201668ef
--- /dev/null
+++ b/evaluation/generation/agg.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_0.json
@@ -0,0 +1 @@
+{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.13515325881990387, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001988006386809129}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.3082267363959881, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004580722609646003}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.18486846991878378, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0026472370081744873}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.0272859934174315, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.000998594442193996}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.06551213025624243, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0024545590859311856}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.03791043043165935, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0013771627135441805}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.10194380290336473, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0014645033080314515}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.23262788209228538, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.003415563039987054}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.13935422276606838, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", 
"prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0019388171493750941}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.10728605862144046, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0016390380124941084}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.24560801323409917, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0038465518145899083}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.14683463366193603, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0021884358384104898}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 1.42436283894258, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0809652794130641}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep2/scratch/project_462000241/muennighoff/checkpoints_2b855boscartasky/transformers,use_accelerate=True,tokenizer=/pfs/lustrep2/scratch/project_462000241/muennighoff/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
\ No newline at end of file
diff --git a/evaluation/generation/agg.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_1.json b/evaluation/generation/agg.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_1.json
new file mode 100644
index 0000000000000000000000000000000000000000..550ba355d38f3d32e4b219d94dc978ccbec22099
--- /dev/null
+++ b/evaluation/generation/agg.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_1.json
@@ -0,0 +1 @@
+{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.18034557721867206, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003378966024270665}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.25165661054563454, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004219363793342944}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.19377928742866415, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0029318979888479386}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.03174129360340327, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0016980099731496725}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.04525380270312792, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0021325289082326157}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.03391705069355955, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001600827686299807}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.1348915392123015, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.002633227095382269}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.18737022577489543, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0031416529379445714}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.14433798493238226, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", 
"prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0022145818855320775}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.1387175358944472, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0026549430237383285}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.19658579721494118, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003531925441008354}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.14964383560266273, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002331396812470961}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 1.5072643038464826, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07581189432866778}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep2/scratch/project_462000241/muennighoff/checkpoints_2b855boscartasky/transformers,use_accelerate=True,tokenizer=/pfs/lustrep2/scratch/project_462000241/muennighoff/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
\ No newline at end of file
diff --git a/evaluation/generation/agg.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_2.json b/evaluation/generation/agg.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_2.json
new file mode 100644
index 0000000000000000000000000000000000000000..a5a7a5a3e791126c9bd19c707d69e0a21bf0b082
--- /dev/null
+++ b/evaluation/generation/agg.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_2.json
@@ -0,0 +1 @@
+{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.22068687589610897, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003789951247255284}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.25078858709670043, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0039415090299140245}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.21899687156092093, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0031785578491904183}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.04612059160046461, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00216246272019255}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.05136702902319479, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002315418839001681}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.04511500976844758, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001986723782931247}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.1666654493818604, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.002931741970841584}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.19015440175110393, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0030722371403492073}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.16528451206309597, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", 
"prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002448179539908828}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.1684803863889924, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0029515578981164007}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.19342460055959376, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0032586706636470127}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.16744440121325288, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0024991880630578747}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 2.135517530489009, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.09395977154683074}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep2/scratch/project_462000241/muennighoff/checkpoints_2b855boscartasky/transformers,use_accelerate=True,tokenizer=/pfs/lustrep2/scratch/project_462000241/muennighoff/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
\ No newline at end of file
diff --git a/evaluation/generation/agg.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_3.json b/evaluation/generation/agg.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_3.json
new file mode 100644
index 0000000000000000000000000000000000000000..0e1d5c261369348c364bad41220a6187a9a37f84
--- /dev/null
+++ b/evaluation/generation/agg.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_3.json
@@ -0,0 +1 @@
+{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.21755397454942801, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0040197643665198}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.24395196225361002, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0041819563633223285}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.21441285271309635, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0034688162829816867}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.04608502015312911, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0021286532637906437}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.05204692726094356, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002308596850566173}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.0455324464868029, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0019857349620838816}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.16247030281934602, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0031264236020145395}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.18178458588619653, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0031938440097234347}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.159773564475991, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", 
"prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002683043459624172}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.16495789870371907, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0031401120246905712}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.18683194228664585, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0034187194500331993}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.16297000957614646, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002742910817454915}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 2.306953746485347, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.08640815655966855}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep2/scratch/project_462000241/muennighoff/checkpoints_2b855boscartasky/transformers,use_accelerate=True,tokenizer=/pfs/lustrep2/scratch/project_462000241/muennighoff/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
\ No newline at end of file
diff --git a/evaluation/generation/agg.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_4.json b/evaluation/generation/agg.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_4.json
new file mode 100644
index 0000000000000000000000000000000000000000..5752947d731521da9d8a61e042e07a9f0e3246ff
--- /dev/null
+++ b/evaluation/generation/agg.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_4.json
@@ -0,0 +1 @@
+{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.05735338761210453, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0035071456355189675}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.059021496533637884, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0036712685384006774}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.05269608047169612, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.003098859012482462}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.010781643643676288, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0011947055672951296}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.01223156205613527, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0013826375879066437}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.010458434372266581, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0011178203055454239}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.04368080562194709, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.002762386413099355}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.04437717860143795, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0028397295075813945}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.03966278292494188, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", 
"prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0023832058820488743}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.04449065152016272, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002797505308089145}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.04583958323736499, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0029805207884232624}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.04062836833112145, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0024442501537786706}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 0.21661324901347828, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07403289458610106}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep2/scratch/project_462000241/muennighoff/checkpoints_2b855boscartasky/transformers,use_accelerate=True,tokenizer=/pfs/lustrep2/scratch/project_462000241/muennighoff/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
\ No newline at end of file
diff --git a/evaluation/generation/agg.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_5.json b/evaluation/generation/agg.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_5.json
new file mode 100644
index 0000000000000000000000000000000000000000..a63823a46177f89394c44da0c9e7b0c9bdd5203a
--- /dev/null
+++ b/evaluation/generation/agg.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_5.json
@@ -0,0 +1 @@
+{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.003544629362742575, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0009485977995739016}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.0037550498645595996, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0011352337360716696}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.0028901136641309804, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0007628453825537149}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.0006426177000561377, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00026640442181651204}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.0013262549819153596, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0006559432301667889}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.0007826161154573944, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0003313480806525455}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.002882853322968734, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0007611952222133634}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.003139944055164728, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0009749927515076235}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.002389689764554573, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, 
"subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0006340237380334276}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.002902333778437242, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0007672460026879215}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.0031769694879492582, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0010086213776490023}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.00241629675514098, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0006494890213851666}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 1.5974310709826073e-21, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 1.3100992922519676e-15}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep2/scratch/project_462000241/muennighoff/checkpoints_2b855boscartasky/transformers,use_accelerate=True,tokenizer=/pfs/lustrep2/scratch/project_462000241/muennighoff/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
\ No newline at end of file
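Each `agg.*.json` file above shares one schema: a `results` list with one record per metric (every record repeats the task/prompt metadata, and every value has a matching `*_stderr` field), plus a `config` block recording how the run was launched (`num_fewshot`, `batch_size`, `limit`, and so on). A minimal sketch for pulling the rouge2 F1 trend across fewshot settings out of these files; the glob pattern and directory come from the file names in this commit, everything else (working directory, printing format) is an assumption:

```python
import glob
import json

# Collect rouge2_fmeasure per fewshot setting from the aggregated gem_xsum
# result files added in this commit (assumes the repo root as cwd).
scores = {}
for path in glob.glob(
    "evaluation/generation/agg.checkpoints_2b855boscartasky_gem_xsum_*.json"
):
    with open(path) as f:
        data = json.load(f)
    shots = data["config"]["num_fewshot"]
    # Each record in "results" carries exactly one metric besides the shared
    # metadata; pick the record holding rouge2_fmeasure.
    for record in data["results"]:
        if "rouge2_fmeasure" in record:
            scores[shots] = record["rouge2_fmeasure"]

for shots in sorted(scores):
    print(f"{shots}-shot rouge2_fmeasure: {scores[shots]:.4f}")
```

Note that the `*_stderr` values were bootstrapped with only `bootstrap_iters: 10` (see each `config` block), so they should be read as rough estimates.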
diff --git a/evaluation/generation/examples.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_0.jsonl b/evaluation/generation/examples.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_0.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..a41a8a4c1acb8022efbfb60b43b30c7270084159
--- /dev/null
+++ b/evaluation/generation/examples.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_0.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2451355dbc6ecf61ecf6f46b5f1533a0f35e15062fefc4af6d5ace09dd81e825
+size 3982993
diff --git a/evaluation/generation/examples.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_1.jsonl b/evaluation/generation/examples.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_1.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..5a85dc2467ef025e9862187fb0e6049ac01592c1
--- /dev/null
+++ b/evaluation/generation/examples.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_1.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:23178510788e0243ab26492b29e3671ce8ca27db2d17e238a6c4bedf0ee22047
+size 4847367
diff --git a/evaluation/generation/examples.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_2.jsonl b/evaluation/generation/examples.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_2.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..4295d648f8ceaa61ea810839eeedf49a0170c24c
--- /dev/null
+++ b/evaluation/generation/examples.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_2.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e9aef74c3ffb6b3f4d2f16584e518b29506b7412fceef949dbac8fdd712638d8
+size 5811137
diff --git a/evaluation/generation/examples.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_3.jsonl b/evaluation/generation/examples.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_3.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..9b911ffbe9be177c86b054a4af8ef079203657ea
--- /dev/null
+++ b/evaluation/generation/examples.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_3.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c20731be143fe20dbbe77d164b4db9ca631e3da5e9f5ddcad4b12a512bcf5015
+size 6700805
diff --git a/evaluation/generation/examples.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_4.jsonl b/evaluation/generation/examples.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_4.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..27289802d6deb6ee2116d2cfe3aacfba24eceeaa
--- /dev/null
+++ b/evaluation/generation/examples.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_4.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5ca4818a55ccfccc013eadd26b08d5f7f44ceabb8f07e7435e50a1d715db8d84
+size 7580127
diff --git a/evaluation/generation/examples.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_5.jsonl b/evaluation/generation/examples.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_5.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..ce5b0d423f7b54af3573bd001d09db1c0a91b323
--- /dev/null
+++ b/evaluation/generation/examples.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_5.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f629a9199998eda2cf2bee34e15475fab7ea92a6727ef1cba82d8810878fc053
+size 8462227
diff --git a/evaluation/generation/examples.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_0.jsonl b/evaluation/generation/examples.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_0.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..a8d245f6fa6ba003a7c65db56d822592b20fa992
--- /dev/null
+++ b/evaluation/generation/examples.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_0.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8f25b37c336043fa5c942f06070dabae3eda2634075036d8bb7c76b4da22bc35
+size 7589559
diff --git a/evaluation/generation/examples.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_1.jsonl b/evaluation/generation/examples.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_1.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..214e4bc207bc16f43c710b3f03535642e11d9182
--- /dev/null
+++ b/evaluation/generation/examples.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_1.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9899f03ed89bbc592e12294b60c071b64190245407a3df24298ef4cb761bd01a
+size 12969553
diff --git a/evaluation/generation/examples.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_2.jsonl b/evaluation/generation/examples.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_2.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..389012d27858b97239a08fb2d528fa9862a43a83
--- /dev/null
+++ b/evaluation/generation/examples.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_2.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0564614129e53accb30341237f422a050c5f39b64c4ca37a0eeb3843f97b44ad
+size 18545638
diff --git a/evaluation/generation/examples.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_3.jsonl b/evaluation/generation/examples.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_3.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..b09a5b4026a0890bb513c2fe60f7c71aade15bce
--- /dev/null
+++ b/evaluation/generation/examples.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_3.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:22737aec22a24fe6567f7e22b9ab268bc577dbab3b2524725fbd71ee2a11ec0d
+size 24016258
diff --git a/evaluation/generation/examples.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_4.jsonl b/evaluation/generation/examples.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_4.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..4018ce40eadb01ff48fb3ba8792a2c904b00333d
--- /dev/null
+++ b/evaluation/generation/examples.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_4.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7bd56c4b2e1ff5acd018879af81d0ec33f603367a67c7f50f2ec8e4e50736060
+size 29355453
diff --git a/evaluation/generation/examples.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_5.jsonl b/evaluation/generation/examples.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_5.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..1aa5dc3aa9ee3f41dfd4fc347e810be6c82f0529
--- /dev/null
+++ b/evaluation/generation/examples.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_5.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8004d231e833bafa9d38c63bfbf62d1e9c6a0b982e16d3f716255f018b7dff53
+size 34782772
diff --git a/evaluation/generation/examples.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_0.jsonl b/evaluation/generation/examples.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_0.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..aa6222a010f18f1d9b42e22cd856b8986e60c119
--- /dev/null
+++ b/evaluation/generation/examples.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_0.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3bbac395e2abf05b8b7c689c78354e489494bf791daf5256784a57947cf7972e
+size 4217659
diff --git a/evaluation/generation/examples.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_1.jsonl b/evaluation/generation/examples.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_1.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..abe3637b815e9481d0343327113523960d894db1
--- /dev/null
+++ b/evaluation/generation/examples.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_1.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b8ac3cc5ae134bf0187190eb779207a918003fb8f379af03a88a22534796fff1
+size 5028135
diff --git a/evaluation/generation/examples.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_2.jsonl b/evaluation/generation/examples.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_2.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..44646e5534b19453444cabdd7215095b12a4c6fa
--- /dev/null
+++ b/evaluation/generation/examples.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_2.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8f092e42a367c56fbe377d777c6c18ef8a1928efdd88651077c33ed7927c3b43
+size 6122993
diff --git a/evaluation/generation/examples.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_3.jsonl b/evaluation/generation/examples.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_3.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..f21291b86c1efcbf055aba82ee3fd7f344339ee7
--- /dev/null
+++ b/evaluation/generation/examples.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_3.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9dd34d8eea7e2b960334cca235ea55528556eb83dbb2cd7744d9daba6fb751e4
+size 7200825
diff --git a/evaluation/generation/examples.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl b/evaluation/generation/examples.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..8289c1874190eff2191578ccc62fe3f6f178c5e9
--- /dev/null
+++ b/evaluation/generation/examples.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2296383a810f92aa74a80a07a7e5d67d4adac62484c7feecbc93531f70df0b1c
+size 8275191
diff --git a/evaluation/generation/examples.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_5.jsonl b/evaluation/generation/examples.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_5.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..dd0bd3503c6085ade28e02a9c4a45c7aa7fb6d85
--- /dev/null
+++ b/evaluation/generation/examples.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_5.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:04584777b039d65cac9d9d98855a403e6fe8a9f260a2650ce50bbfec5a7912e7
+size 9356862
diff --git a/evaluation/generation/examples.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_0.jsonl b/evaluation/generation/examples.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_0.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..ca34961958e6094f9edf4b736cd94f736714613b
--- /dev/null
+++ b/evaluation/generation/examples.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_0.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f9cfcf8eef08371fd49572807910296aa903cf721830dbc4df77e605be85ee2c
+size 2819469
diff --git a/evaluation/generation/examples.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_1.jsonl b/evaluation/generation/examples.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_1.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..bc476ad151cf5bcf8b0c0c8bf960b435efda3e8f
--- /dev/null
+++ b/evaluation/generation/examples.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_1.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9ea84d7e818dc8c98f494d44ac84ce557a5d34a2b6d9e6a8741071ba146694bc
+size 4982442
diff --git a/evaluation/generation/examples.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_2.jsonl b/evaluation/generation/examples.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_2.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..a03fbe35db94e1620e797de640c9d9412de4a9ee
--- /dev/null
+++ b/evaluation/generation/examples.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_2.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a2692fb1e98d5a9704d5ee926d8424d33b27a10b79ef4f8fad33afb476cc34e9
+size 7217364
diff --git a/evaluation/generation/examples.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_3.jsonl b/evaluation/generation/examples.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_3.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..d0b3efd82baaf56ab0fe6029fe16ce6de3bd6999
--- /dev/null
+++ b/evaluation/generation/examples.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_3.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f504bfb2a2b0015a174b6f17d7e792302307943f5b470cafba4d9630bff7a33b
+size 9495458
diff --git a/evaluation/generation/examples.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_4.jsonl b/evaluation/generation/examples.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_4.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..be8a7a030559dffe658e30b5269214fb713e78ea
--- /dev/null
+++ b/evaluation/generation/examples.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_4.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1e5d9474f2f265f545a5671d21013a7a7e0fbfe1aed37fb8e753f8475a10a649
+size 11637450
diff --git a/evaluation/generation/examples.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_5.jsonl b/evaluation/generation/examples.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_5.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..e895bccdffeec724f4f4c7930e4799f9be936d09
--- /dev/null
+++ b/evaluation/generation/examples.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_5.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:eed5c618b2c15d3ed7c38c0651658c75f055dee97f605df453c271d9e3769a41
+size 13865437
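The `examples.*.jsonl` files are tracked through Git LFS, so the diff records only the three-line pointer: the spec `version`, the `oid sha256:` content hash, and the payload `size` in bytes; the generations themselves live in LFS object storage under that hash. A small, purely illustrative sketch for reading a pointer without the `git lfs` CLI:

```python
def read_lfs_pointer(path: str) -> dict:
    """Parse a 3-line Git LFS pointer file into {version, oid, size}."""
    fields = {}
    with open(path) as f:
        for line in f:
            key, _, value = line.strip().partition(" ")
            fields[key] = value
    return {
        "version": fields["version"],
        "oid": fields["oid"].removeprefix("sha256:"),
        "size": int(fields["size"]),  # payload size in bytes
    }
```

For example, the pointer for examples.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_5.jsonl above reports a 13865437-byte payload.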
diff --git a/evaluation/generation/merged.csv b/evaluation/generation/merged.csv
new file mode 100644
index 0000000000000000000000000000000000000000..3331c0e795eab19509b38bb7723dec470c7a3e6d
--- /dev/null
+++ b/evaluation/generation/merged.csv
@@ -0,0 +1,53 @@
+dataset,fewshots,prompt,metric,value
+e2e_nlg_cleaned,0,generate_text_restaurant,rouge2_fmeasure,0.097088577058388
+e2e_nlg_cleaned,0,median,rouge2_fmeasure,0.097088577058388
+e2e_nlg_cleaned,1,generate_text_restaurant,rouge2_fmeasure,0.2028107162678877
+e2e_nlg_cleaned,1,median,rouge2_fmeasure,0.2028107162678877
+e2e_nlg_cleaned,2,generate_text_restaurant,rouge2_fmeasure,0.23112076704320592
+e2e_nlg_cleaned,2,median,rouge2_fmeasure,0.23112076704320592
+e2e_nlg_cleaned,3,generate_text_restaurant,rouge2_fmeasure,0.23634052735419925
+e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.23634052735419925
+e2e_nlg_cleaned,4,generate_text_restaurant,rouge2_fmeasure,0.23850438072833094
+e2e_nlg_cleaned,4,median,rouge2_fmeasure,0.23850438072833094
+e2e_nlg_cleaned,5,generate_text_restaurant,rouge2_fmeasure,0.23912604624839506
+e2e_nlg_cleaned,5,median,rouge2_fmeasure,0.23912604624839506
+e2e_nlg_cleaned,5,average,multiple,0.20749850245006782
+gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.03791043043165935
+gem_xsum,0,median,rouge2_fmeasure,0.03791043043165935
+gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.03391705069355955
+gem_xsum,1,median,rouge2_fmeasure,0.03391705069355955
+gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.04511500976844758
+gem_xsum,2,median,rouge2_fmeasure,0.04511500976844758
+gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.0455324464868029
+gem_xsum,3,median,rouge2_fmeasure,0.0455324464868029
+gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.010458434372266581
+gem_xsum,4,median,rouge2_fmeasure,0.010458434372266581
+gem_xsum,5,article_DOC_summary,rouge2_fmeasure,0.0007826161154573944
+gem_xsum,5,median,rouge2_fmeasure,0.0007826161154573944
+gem_xsum,5,average,multiple,0.02895266464469889
+web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.03968682582435309
+web_nlg_en,0,median,rouge2_fmeasure,0.03968682582435309
+web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.050696350493553845
+web_nlg_en,1,median,rouge2_fmeasure,0.050696350493553845
+web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.05746066096268111
+web_nlg_en,2,median,rouge2_fmeasure,0.05746066096268111
+web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.06675560716724324
+web_nlg_en,3,median,rouge2_fmeasure,0.06675560716724324
+web_nlg_en,4,PALM_prompt,rouge2_fmeasure,0.07435510796633933
+web_nlg_en,4,median,rouge2_fmeasure,0.07435510796633933
+web_nlg_en,5,PALM_prompt,rouge2_fmeasure,0.07729991696510335
+web_nlg_en,5,median,rouge2_fmeasure,0.07729991696510335
+web_nlg_en,5,average,multiple,0.06104241156321233
+wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.03932636998031157
+wiki_lingua_en,0,median,rouge2_fmeasure,0.03932636998031157
+wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.02928962923956642
+wiki_lingua_en,1,median,rouge2_fmeasure,0.02928962923956642
+wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.049895782057071895
+wiki_lingua_en,2,median,rouge2_fmeasure,0.049895782057071895
+wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.04580407896668463
+wiki_lingua_en,3,median,rouge2_fmeasure,0.04580407896668463
+wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.015865483046525946
+wiki_lingua_en,4,median,rouge2_fmeasure,0.015865483046525946
+wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.002496646429114283
+wiki_lingua_en,5,median,rouge2_fmeasure,0.002496646429114283
+wiki_lingua_en,5,average,multiple,0.030446331619879124
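In merged.csv, each task contributes one `rouge2_fmeasure` row per prompt and fewshot setting, a `median` row across prompts (identical to the prompt row here, since each task was evaluated with a single prompt), and a closing `average` row holding the arithmetic mean of the six per-fewshot medians; for gem_xsum, mean(0.03791, 0.03392, 0.04512, 0.04553, 0.01046, 0.00078) ≈ 0.0289527, matching the row above. A stdlib-only sketch that recomputes those averages (the relative path is an assumption):

```python
import csv
from collections import defaultdict
from statistics import mean

# Recompute each task's "average" row as the mean of its per-fewshot medians.
medians = defaultdict(list)
with open("evaluation/generation/merged.csv") as f:
    for row in csv.DictReader(f):
        if row["prompt"] == "median":
            medians[row["dataset"]].append(float(row["value"]))

for dataset, values in sorted(medians.items()):
    print(dataset, mean(values))  # e.g. gem_xsum -> 0.02895266464469889
```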
diff --git a/evaluation/generation/merged.json b/evaluation/generation/merged.json
new file mode 100644
index 0000000000000000000000000000000000000000..5385baaada544a386d2284a7aa9fab2b6fb4c218
--- /dev/null
+++ b/evaluation/generation/merged.json
@@ -0,0 +1 @@
+{"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.25518818499117035, "bleu_stderr": 0.020198929979072126, "rouge1_fmeasure": 0.08713006240052369, "rouge1_fmeasure_stderr": 0.0021163575152811052, "rouge1_precision": 0.06424858701254539, "rouge1_precision_stderr": 0.002540662604602708, "rouge1_recall": 0.24194520608398293, "rouge1_recall_stderr": 0.005061888078063388, "rouge2_fmeasure": 0.03968682582435309, "rouge2_fmeasure_stderr": 0.0012864152483850072, "rouge2_precision": 0.029250897601530847, "rouge2_precision_stderr": 0.0016159150314360084, "rouge2_recall": 0.11213489324648325, "rouge2_recall_stderr": 0.0031431788617556264, "rougeL_fmeasure": 0.08351980735691193, "rougeL_fmeasure_stderr": 0.001978263321627184, "rougeL_precision": 0.061593275111588554, "rougeL_precision_stderr": 0.0024488839830169046, "rougeL_recall": 0.2334118422350547, "rougeL_recall_stderr": 0.0048787714459462665, "rougeLsum_fmeasure": 0.08198324173226955, "rougeLsum_fmeasure_stderr": 0.0019772453103279655, "rougeLsum_precision": 0.06070440310915625, "rougeLsum_precision_stderr": 0.0024423712896334553, "rougeLsum_recall": 0.22684155273254836, "rougeLsum_recall_stderr": 0.0046468714919551185}}, "1": {"PALM_prompt": {"bleu": 0.37770333810014706, "bleu_stderr": 0.036043662376688344, "rouge1_fmeasure": 0.11029589045724046, "rouge1_fmeasure_stderr": 0.0026302378640336893, "rouge1_precision": 0.08688444048786848, "rouge1_precision_stderr": 0.0030316834829975468, "rouge1_recall": 0.25231836055300055, "rouge1_recall_stderr": 0.004707834740663268, "rouge2_fmeasure": 0.050696350493553845, "rouge2_fmeasure_stderr": 0.001601172935799986, "rouge2_precision": 0.039888024685296904, "rouge2_precision_stderr": 0.0019445292240548378, "rouge2_recall": 0.12320682798737367, "rouge2_recall_stderr": 0.0031690473734229127, "rougeL_fmeasure": 0.10289898300711169, "rougeL_fmeasure_stderr": 0.002349648102435476, "rougeL_precision": 0.08067935525490526, "rougeL_precision_stderr": 0.0028191568459744356, "rougeL_recall": 0.2400568623297162, "rougeL_recall_stderr": 0.004443711432592927, "rougeLsum_fmeasure": 0.10416303503161863, "rougeLsum_fmeasure_stderr": 0.0024045282990398016, "rougeLsum_precision": 0.081852061098128, "rougeLsum_precision_stderr": 0.0028519921028834602, "rougeLsum_recall": 0.24140934346009968, "rougeLsum_recall_stderr": 0.0044596632225301245}}, "2": {"PALM_prompt": {"bleu": 0.3829018871876502, "bleu_stderr": 0.022682302500048476, "rouge1_fmeasure": 0.12236842718754035, "rouge1_fmeasure_stderr": 0.0030225942233901805, "rouge1_precision": 0.09533308838612936, "rouge1_precision_stderr": 0.0031918959249739934, "rouge1_recall": 0.275555629679834, "rouge1_recall_stderr": 0.004727965781775394, "rouge2_fmeasure": 0.05746066096268111, "rouge2_fmeasure_stderr": 0.0018708723435892534, "rouge2_precision": 0.04438815526628443, "rouge2_precision_stderr": 0.0018839731929603098, "rouge2_recall": 0.13438684673727228, "rouge2_recall_stderr": 0.0032179061201762227, "rougeL_fmeasure": 0.11131192366136156, "rougeL_fmeasure_stderr": 0.0025600729892227623, "rougeL_precision": 0.08478065899787014, "rougeL_precision_stderr": 0.0026474348060842394, "rougeL_recall": 0.25951028904964774, "rougeL_recall_stderr": 0.004388247219710807, "rougeLsum_fmeasure": 0.11327636810594607, "rougeLsum_fmeasure_stderr": 0.00265570207245623, "rougeLsum_precision": 0.0870012622715152, "rougeLsum_precision_stderr": 0.002779405775352401, "rougeLsum_recall": 0.2612564683466686, "rougeLsum_recall_stderr": 0.004413760869764198}}, "3": {"PALM_prompt": {"bleu": 
0.49438795846003614, "bleu_stderr": 0.03220480694116766, "rouge1_fmeasure": 0.13661508516896614, "rouge1_fmeasure_stderr": 0.0033865188061428593, "rouge1_precision": 0.11102978394749591, "rouge1_precision_stderr": 0.0037258981840235512, "rouge1_recall": 0.29324047704391387, "rouge1_recall_stderr": 0.004929158066657188, "rouge2_fmeasure": 0.06675560716724324, "rouge2_fmeasure_stderr": 0.002152759177258534, "rouge2_precision": 0.05489349981744968, "rouge2_precision_stderr": 0.002411444050666127, "rouge2_recall": 0.1460318403788494, "rouge2_recall_stderr": 0.0034080826963901003, "rougeL_fmeasure": 0.12331965784710698, "rougeL_fmeasure_stderr": 0.0028547373168783547, "rougeL_precision": 0.0983510025951489, "rougeL_precision_stderr": 0.0031311578361653356, "rougeL_recall": 0.27396911503915883, "rougeL_recall_stderr": 0.004527803475867941, "rougeLsum_fmeasure": 0.1258905544050262, "rougeLsum_fmeasure_stderr": 0.0029667713047718076, "rougeLsum_precision": 0.10118232307247621, "rougeLsum_precision_stderr": 0.0032732569349238874, "rougeLsum_recall": 0.2768278339741749, "rougeLsum_recall_stderr": 0.004591647175413129}}, "4": {"PALM_prompt": {"bleu": 0.5097531919668136, "bleu_stderr": 0.037846557427194026, "rouge1_fmeasure": 0.15051290928420963, "rouge1_fmeasure_stderr": 0.0035962245168251086, "rouge1_precision": 0.12660445313737514, "rouge1_precision_stderr": 0.004069129304355684, "rouge1_recall": 0.30584134713679634, "rouge1_recall_stderr": 0.004938665654397721, "rouge2_fmeasure": 0.07435510796633933, "rouge2_fmeasure_stderr": 0.002254816031736983, "rouge2_precision": 0.06380697763757867, "rouge2_precision_stderr": 0.0025616190740023713, "rouge2_recall": 0.1531395402953802, "rouge2_recall_stderr": 0.0033953087678648087, "rougeL_fmeasure": 0.13483336166859466, "rougeL_fmeasure_stderr": 0.0030388433299652576, "rougeL_precision": 0.11139297723344309, "rougeL_precision_stderr": 0.003440135146254942, "rougeL_recall": 0.28386785189457364, "rougeL_recall_stderr": 0.00447862149321582, "rougeLsum_fmeasure": 0.13843888381409075, "rougeLsum_fmeasure_stderr": 0.0031866033556944046, "rougeLsum_precision": 0.11506907251192842, "rougeLsum_precision_stderr": 0.0036096573854085593, "rougeLsum_recall": 0.287989283624223, "rougeLsum_recall_stderr": 0.004567845010036214}}, "5": {"PALM_prompt": {"bleu": 0.5493345438208933, "bleu_stderr": 0.03156507075706306, "rouge1_fmeasure": 0.15523792842972664, "rouge1_fmeasure_stderr": 0.003705043227083823, "rouge1_precision": 0.13145167799279456, "rouge1_precision_stderr": 0.004208017486529403, "rouge1_recall": 0.31431356632933516, "rouge1_recall_stderr": 0.004912309978055554, "rouge2_fmeasure": 0.07729991696510335, "rouge2_fmeasure_stderr": 0.002362198278661277, "rouge2_precision": 0.06682039356587426, "rouge2_precision_stderr": 0.0026734322118555824, "rouge2_recall": 0.1593483911606193, "rouge2_recall_stderr": 0.003488245506035694, "rougeL_fmeasure": 0.13867709585369095, "rougeL_fmeasure_stderr": 0.0031268375342873036, "rougeL_precision": 0.11521584250075316, "rougeL_precision_stderr": 0.0035483893462582828, "rougeL_recall": 0.2911824630349328, "rougeL_recall_stderr": 0.00449934920077821, "rougeLsum_fmeasure": 0.1428199679819932, "rougeLsum_fmeasure_stderr": 0.0032823766236227112, "rougeLsum_precision": 0.11975844765120187, "rougeLsum_precision_stderr": 0.0037510579951047463, "rougeLsum_recall": 0.29608094482109, "rougeLsum_recall_stderr": 0.00457219328093606}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 2.2908341082810773, "bleu_stderr": 0.08591697554573632, 
"rouge1_fmeasure": 0.17176517291666785, "rouge1_fmeasure_stderr": 0.002329017824094656, "rouge1_precision": 0.15983607856511065, "rouge1_precision_stderr": 0.0025552324547594915, "rouge1_recall": 0.22751168503589744, "rouge1_recall_stderr": 0.0030756037547284122, "rouge2_fmeasure": 0.03932636998031157, "rouge2_fmeasure_stderr": 0.0009987340137274396, "rouge2_precision": 0.03630672989355793, "rouge2_precision_stderr": 0.0010196024456578525, "rouge2_recall": 0.052475014894037275, "rouge2_recall_stderr": 0.001428315830366114, "rougeL_fmeasure": 0.13522236510382263, "rougeL_fmeasure_stderr": 0.001715199895595129, "rougeL_precision": 0.12519971826411405, "rougeL_precision_stderr": 0.0019474345957730163, "rougeL_recall": 0.1828331784278453, "rougeL_recall_stderr": 0.002460695997235263, "rougeLsum_fmeasure": 0.15863624899200707, "rougeLsum_fmeasure_stderr": 0.0021698433809270567, "rougeLsum_precision": 0.1479787200014008, "rougeLsum_precision_stderr": 0.0024084994322680275, "rougeLsum_recall": 0.21009537033174597, "rougeLsum_recall_stderr": 0.0028681840700951855}}, "1": {"tldr_en": {"bleu": 1.8024864455011091, "bleu_stderr": 0.09003925624860501, "rouge1_fmeasure": 0.1500600406332044, "rouge1_fmeasure_stderr": 0.0019241989868486221, "rouge1_precision": 0.2074831448217252, "rouge1_precision_stderr": 0.003524413251335538, "rouge1_recall": 0.1603567541864711, "rouge1_recall_stderr": 0.002358874310291858, "rouge2_fmeasure": 0.02928962923956642, "rouge2_fmeasure_stderr": 0.000993776980137765, "rouge2_precision": 0.048193206661683564, "rouge2_precision_stderr": 0.0020379873823770994, "rouge2_recall": 0.030329812370434826, "rouge2_recall_stderr": 0.0011532237790344163, "rougeL_fmeasure": 0.11776814806946785, "rougeL_fmeasure_stderr": 0.0014763519012599205, "rougeL_precision": 0.16593034599476272, "rougeL_precision_stderr": 0.002952779433549126, "rougeL_recall": 0.1263395938894177, "rougeL_recall_stderr": 0.0018438888158069336, "rougeLsum_fmeasure": 0.14183665338988743, "rougeLsum_fmeasure_stderr": 0.001800224756397767, "rougeLsum_precision": 0.19732631307428722, "rougeLsum_precision_stderr": 0.0034023899501457837, "rougeLsum_recall": 0.15136030413475798, "rougeLsum_recall_stderr": 0.0021926321158690864}}, "2": {"tldr_en": {"bleu": 2.9479127015875357, "bleu_stderr": 0.07796616677370145, "rouge1_fmeasure": 0.1947292213306491, "rouge1_fmeasure_stderr": 0.0021918313174583916, "rouge1_precision": 0.28596147697474456, "rouge1_precision_stderr": 0.0040250344618331995, "rouge1_recall": 0.1993639467154353, "rouge1_recall_stderr": 0.002664319477833243, "rouge2_fmeasure": 0.049895782057071895, "rouge2_fmeasure_stderr": 0.0012048654152258728, "rouge2_precision": 0.08090209451722337, "rouge2_precision_stderr": 0.0024314880977671276, "rouge2_recall": 0.05022891340202324, "rouge2_recall_stderr": 0.0013490243270254702, "rougeL_fmeasure": 0.15366314427734978, "rougeL_fmeasure_stderr": 0.0016931718188903863, "rougeL_precision": 0.22915557741483644, "rougeL_precision_stderr": 0.0033926337363906784, "rougeL_recall": 0.15804196664911024, "rougeL_recall_stderr": 0.002108622273540943, "rougeLsum_fmeasure": 0.18325876600451932, "rougeLsum_fmeasure_stderr": 0.0020600409806209663, "rougeLsum_precision": 0.2703085422322766, "rougeLsum_precision_stderr": 0.003864247203591127, "rougeLsum_recall": 0.18781780308286916, "rougeLsum_recall_stderr": 0.0025166845226557905}}, "3": {"tldr_en": {"bleu": 2.368296526557304, "bleu_stderr": 0.06715896563436964, "rouge1_fmeasure": 0.17047407122919256, "rouge1_fmeasure_stderr": 
0.0024381191437714608, "rouge1_precision": 0.26271995584299296, "rouge1_precision_stderr": 0.004350571081047754, "rouge1_recall": 0.17195910512668672, "rouge1_recall_stderr": 0.0028930003693202434, "rouge2_fmeasure": 0.04580407896668463, "rouge2_fmeasure_stderr": 0.001240136840680315, "rouge2_precision": 0.07716157594080365, "rouge2_precision_stderr": 0.0025215008865493967, "rouge2_recall": 0.04592292328627891, "rouge2_recall_stderr": 0.0014011960521198414, "rougeL_fmeasure": 0.1358429496475672, "rougeL_fmeasure_stderr": 0.0019120675392480628, "rougeL_precision": 0.2129040576929948, "rougeL_precision_stderr": 0.0036832280907527527, "rougeL_recall": 0.13748507108736507, "rougeL_recall_stderr": 0.0023122364311355848, "rougeLsum_fmeasure": 0.16017361729618249, "rougeLsum_fmeasure_stderr": 0.0022880671169300927, "rougeLsum_precision": 0.24858343528470517, "rougeLsum_precision_stderr": 0.0041894189677485535, "rougeLsum_recall": 0.1614787885865035, "rougeLsum_recall_stderr": 0.0027151405967413447}}, "4": {"tldr_en": {"bleu": 0.03702293319391888, "bleu_stderr": 0.008280750551799155, "rouge1_fmeasure": 0.05527677687040673, "rouge1_fmeasure_stderr": 0.002075334915597304, "rouge1_precision": 0.09226808587029187, "rouge1_precision_stderr": 0.0035951645035662144, "rouge1_recall": 0.05378214914394185, "rouge1_recall_stderr": 0.0021922619377159835, "rouge2_fmeasure": 0.015865483046525946, "rouge2_fmeasure_stderr": 0.0009374894411648104, "rouge2_precision": 0.028532578970617953, "rouge2_precision_stderr": 0.0018587749480454441, "rouge2_recall": 0.01548143136364689, "rouge2_recall_stderr": 0.0010135388288224974, "rougeL_fmeasure": 0.04511311086771814, "rougeL_fmeasure_stderr": 0.0016959499204821453, "rougeL_precision": 0.07725545552877301, "rougeL_precision_stderr": 0.0031119081702266804, "rougeL_recall": 0.04393223408477368, "rougeL_recall_stderr": 0.0018056545763819988, "rougeLsum_fmeasure": 0.05162745766582887, "rougeLsum_fmeasure_stderr": 0.0019361443382161162, "rougeLsum_precision": 0.08723167275286887, "rougeLsum_precision_stderr": 0.003438050373646056, "rougeLsum_recall": 0.05010740359653005, "rougeLsum_recall_stderr": 0.0020446842446827317}}, "5": {"tldr_en": {"bleu": 6.329549235648946e-15, "bleu_stderr": 6.98643056969485e-14, "rouge1_fmeasure": 0.008715725437140403, "rouge1_fmeasure_stderr": 0.0008716993431018379, "rouge1_precision": 0.01624748312843043, "rouge1_precision_stderr": 0.0017343102214544073, "rouge1_recall": 0.008973352017838115, "rouge1_recall_stderr": 0.0009874748509305067, "rouge2_fmeasure": 0.002496646429114283, "rouge2_fmeasure_stderr": 0.00036357062963770885, "rouge2_precision": 0.005494197830878659, "rouge2_precision_stderr": 0.000991214742335091, "rouge2_recall": 0.002558561959956041, "rouge2_recall_stderr": 0.0003855003265324329, "rougeL_fmeasure": 0.0072939865630402565, "rougeL_fmeasure_stderr": 0.0007275988588633965, "rougeL_precision": 0.014045751930703699, "rougeL_precision_stderr": 0.0015619179482028879, "rougeL_recall": 0.0075965284642917335, "rougeL_recall_stderr": 0.000846021103064854, "rougeLsum_fmeasure": 0.008294111648957665, "rougeLsum_fmeasure_stderr": 0.0008329997418165786, "rougeLsum_precision": 0.015553958884576207, "rougeLsum_precision_stderr": 0.001678423693533492, "rougeLsum_recall": 0.008515152310636652, "rougeLsum_recall_stderr": 0.0009413374007224053}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 3.4385508172058055, "bleu_stderr": 0.04264456511068813, "rouge1_fmeasure": 0.2767986999806955, "rouge1_fmeasure_stderr": 
0.0032477671776355997, "rouge1_precision": 0.3745168009518315, "rouge1_precision_stderr": 0.0054770341977520625, "rouge1_recall": 0.29958589610084396, "rouge1_recall_stderr": 0.0033801849814915222, "rouge2_fmeasure": 0.097088577058388, "rouge2_fmeasure_stderr": 0.0017064511604233982, "rouge2_precision": 0.20297808777795995, "rouge2_precision_stderr": 0.005522822982953256, "rouge2_recall": 0.10216853633525805, "rouge2_recall_stderr": 0.0019971383737075166, "rougeL_fmeasure": 0.20603167711573117, "rougeL_fmeasure_stderr": 0.002072804178131474, "rougeL_precision": 0.29931908976323446, "rougeL_precision_stderr": 0.005159547193434527, "rougeL_recall": 0.2284780514712017, "rougeL_recall_stderr": 0.0023634097297777477, "rougeLsum_fmeasure": 0.2467760500253612, "rougeLsum_fmeasure_stderr": 0.00298962192866607, "rougeLsum_precision": 0.34463645738569776, "rougeLsum_precision_stderr": 0.005423162557411763, "rougeLsum_recall": 0.2657387698192473, "rougeLsum_recall_stderr": 0.003165191193253911}}, "1": {"generate_text_restaurant": {"bleu": 11.578509877350227, "bleu_stderr": 0.10981566633879294, "rouge1_fmeasure": 0.4430699483900025, "rouge1_fmeasure_stderr": 0.0024212750351764543, "rouge1_precision": 0.5290497807915124, "rouge1_precision_stderr": 0.0034031031148912877, "rouge1_recall": 0.42449982582372703, "rouge1_recall_stderr": 0.0029796948529652516, "rouge2_fmeasure": 0.2028107162678877, "rouge2_fmeasure_stderr": 0.0020310061478079093, "rouge2_precision": 0.24509073269483966, "rouge2_precision_stderr": 0.0026669698385618284, "rouge2_recall": 0.19440129139428716, "rouge2_recall_stderr": 0.0021684401687693156, "rougeL_fmeasure": 0.324119071682265, "rougeL_fmeasure_stderr": 0.0020761088086072486, "rougeL_precision": 0.38881065801490833, "rougeL_precision_stderr": 0.0029605217311991657, "rougeL_recall": 0.31090092383731366, "rougeL_recall_stderr": 0.002452100105085375, "rougeLsum_fmeasure": 0.36322533452022043, "rougeLsum_fmeasure_stderr": 0.002355969259137101, "rougeLsum_precision": 0.43465610390683307, "rougeLsum_precision_stderr": 0.0032381620072339475, "rougeLsum_recall": 0.348028905687337, "rougeLsum_recall_stderr": 0.00275713194366831}}, "2": {"generate_text_restaurant": {"bleu": 13.973742807059493, "bleu_stderr": 0.21933451006645593, "rouge1_fmeasure": 0.4734319432946387, "rouge1_fmeasure_stderr": 0.002277730687363045, "rouge1_precision": 0.5463581275257614, "rouge1_precision_stderr": 0.0033109746053900106, "rouge1_recall": 0.4599780026221357, "rouge1_recall_stderr": 0.002866451132503217, "rouge2_fmeasure": 0.23112076704320592, "rouge2_fmeasure_stderr": 0.002029026468778622, "rouge2_precision": 0.26984908227570326, "rouge2_precision_stderr": 0.0026581113600308783, "rouge2_recall": 0.22503111854291266, "rouge2_recall_stderr": 0.0022246934402712576, "rougeL_fmeasure": 0.35285117962076507, "rougeL_fmeasure_stderr": 0.002056984438278394, "rougeL_precision": 0.40847493701179866, "rougeL_precision_stderr": 0.0029328594326759366, "rougeL_recall": 0.3432828614570734, "rougeL_recall_stderr": 0.0024768383332246274, "rougeLsum_fmeasure": 0.3979842693641427, "rougeLsum_fmeasure_stderr": 0.002298871731116783, "rougeLsum_precision": 0.4597128692889795, "rougeLsum_precision_stderr": 0.003192730444475765, "rougeLsum_recall": 0.38674562932968487, "rougeLsum_recall_stderr": 0.002735034124705561}}, "3": {"generate_text_restaurant": {"bleu": 14.533287312181702, "bleu_stderr": 0.18656720619703301, "rouge1_fmeasure": 0.478142190779417, "rouge1_fmeasure_stderr": 0.002235263328285793, "rouge1_precision": 
0.5451265264934796, "rouge1_precision_stderr": 0.003147288201601649, "rouge1_recall": 0.4635452060391057, "rouge1_recall_stderr": 0.002820441604942112, "rouge2_fmeasure": 0.23634052735419925, "rouge2_fmeasure_stderr": 0.0020601993739155743, "rouge2_precision": 0.2714328734244194, "rouge2_precision_stderr": 0.002583914662030242, "rouge2_recall": 0.23027470724704344, "rouge2_recall_stderr": 0.0022949497645592134, "rougeL_fmeasure": 0.3585015271693414, "rougeL_fmeasure_stderr": 0.002048646341232902, "rougeL_precision": 0.40976835157500135, "rougeL_precision_stderr": 0.002821032707227765, "rougeL_recall": 0.3479477017249916, "rougeL_recall_stderr": 0.002470400553707357, "rougeLsum_fmeasure": 0.404637798062588, "rougeLsum_fmeasure_stderr": 0.002300343226701871, "rougeLsum_precision": 0.46121561084482127, "rougeLsum_precision_stderr": 0.003066679903899239, "rougeLsum_recall": 0.3924920651465138, "rougeLsum_recall_stderr": 0.0027314711241234945}}, "4": {"generate_text_restaurant": {"bleu": 14.788126595166384, "bleu_stderr": 0.18690040009176595, "rouge1_fmeasure": 0.48045922670165075, "rouge1_fmeasure_stderr": 0.002311569925954736, "rouge1_precision": 0.5482226243244983, "rouge1_precision_stderr": 0.0032124524377117303, "rouge1_recall": 0.46345359030928057, "rouge1_recall_stderr": 0.0028329342309959572, "rouge2_fmeasure": 0.23850438072833094, "rouge2_fmeasure_stderr": 0.002104880750018495, "rouge2_precision": 0.27446812632372075, "rouge2_precision_stderr": 0.0026299321970613995, "rouge2_recall": 0.2307927377768704, "rouge2_recall_stderr": 0.002303932783753952, "rougeL_fmeasure": 0.36091724958116495, "rougeL_fmeasure_stderr": 0.0021212744636126225, "rougeL_precision": 0.41268229797839345, "rougeL_precision_stderr": 0.002885772570892295, "rougeL_recall": 0.34835409226615954, "rougeL_recall_stderr": 0.0024733395312169725, "rougeLsum_fmeasure": 0.4083787027633123, "rougeLsum_fmeasure_stderr": 0.002378318389395946, "rougeLsum_precision": 0.46563464741217186, "rougeLsum_precision_stderr": 0.0031424166236039795, "rougeLsum_recall": 0.39431106312311454, "rougeLsum_recall_stderr": 0.0027657116569750795}}, "5": {"generate_text_restaurant": {"bleu": 14.816898114308659, "bleu_stderr": 0.2578943709075645, "rouge1_fmeasure": 0.4813258409710542, "rouge1_fmeasure_stderr": 0.0022155647522670184, "rouge1_precision": 0.5463965076188966, "rouge1_precision_stderr": 0.0031073173050291266, "rouge1_recall": 0.464786934938444, "rouge1_recall_stderr": 0.0027602961923400638, "rouge2_fmeasure": 0.23912604624839506, "rouge2_fmeasure_stderr": 0.002085017928004898, "rouge2_precision": 0.2741684381980198, "rouge2_precision_stderr": 0.002611976915331882, "rouge2_recall": 0.23130694014233213, "rouge2_recall_stderr": 0.002261839435649896, "rougeL_fmeasure": 0.3627614127879427, "rougeL_fmeasure_stderr": 0.002049692110741263, "rougeL_precision": 0.4127791623423059, "rougeL_precision_stderr": 0.0027864194329252517, "rougeL_recall": 0.35037713824014954, "rougeL_recall_stderr": 0.002413674972879658, "rougeLsum_fmeasure": 0.41096147149083767, "rougeLsum_fmeasure_stderr": 0.0022723564955560572, "rougeLsum_precision": 0.4665920267581706, "rougeLsum_precision_stderr": 0.0030279579394666637, "rougeLsum_recall": 0.3970310588904633, "rougeLsum_recall_stderr": 0.0026745772949146518}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 1.42436283894258, "bleu_stderr": 0.0809652794130641, "rouge1_fmeasure": 0.18486846991878378, "rouge1_fmeasure_stderr": 0.0026472370081744873, "rouge1_precision": 0.13515325881990387, "rouge1_precision_stderr": 
0.001988006386809129, "rouge1_recall": 0.3082267363959881, "rouge1_recall_stderr": 0.004580722609646003, "rouge2_fmeasure": 0.03791043043165935, "rouge2_fmeasure_stderr": 0.0013771627135441805, "rouge2_precision": 0.0272859934174315, "rouge2_precision_stderr": 0.000998594442193996, "rouge2_recall": 0.06551213025624243, "rouge2_recall_stderr": 0.0024545590859311856, "rougeL_fmeasure": 0.13935422276606838, "rougeL_fmeasure_stderr": 0.0019388171493750941, "rougeL_precision": 0.10194380290336473, "rougeL_precision_stderr": 0.0014645033080314515, "rougeL_recall": 0.23262788209228538, "rougeL_recall_stderr": 0.003415563039987054, "rougeLsum_fmeasure": 0.14683463366193603, "rougeLsum_fmeasure_stderr": 0.0021884358384104898, "rougeLsum_precision": 0.10728605862144046, "rougeLsum_precision_stderr": 0.0016390380124941084, "rougeLsum_recall": 0.24560801323409917, "rougeLsum_recall_stderr": 0.0038465518145899083}}, "1": {"article_DOC_summary": {"bleu": 1.5072643038464826, "bleu_stderr": 0.07581189432866778, "rouge1_fmeasure": 0.19377928742866415, "rouge1_fmeasure_stderr": 0.0029318979888479386, "rouge1_precision": 0.18034557721867206, "rouge1_precision_stderr": 0.003378966024270665, "rouge1_recall": 0.25165661054563454, "rouge1_recall_stderr": 0.004219363793342944, "rouge2_fmeasure": 0.03391705069355955, "rouge2_fmeasure_stderr": 0.001600827686299807, "rouge2_precision": 0.03174129360340327, "rouge2_precision_stderr": 0.0016980099731496725, "rouge2_recall": 0.04525380270312792, "rouge2_recall_stderr": 0.0021325289082326157, "rougeL_fmeasure": 0.14433798493238226, "rougeL_fmeasure_stderr": 0.0022145818855320775, "rougeL_precision": 0.1348915392123015, "rougeL_precision_stderr": 0.002633227095382269, "rougeL_recall": 0.18737022577489543, "rougeL_recall_stderr": 0.0031416529379445714, "rougeLsum_fmeasure": 0.14964383560266273, "rougeLsum_fmeasure_stderr": 0.002331396812470961, "rougeLsum_precision": 0.1387175358944472, "rougeLsum_precision_stderr": 0.0026549430237383285, "rougeLsum_recall": 0.19658579721494118, "rougeLsum_recall_stderr": 0.003531925441008354}}, "2": {"article_DOC_summary": {"bleu": 2.135517530489009, "bleu_stderr": 0.09395977154683074, "rouge1_fmeasure": 0.21899687156092093, "rouge1_fmeasure_stderr": 0.0031785578491904183, "rouge1_precision": 0.22068687589610897, "rouge1_precision_stderr": 0.003789951247255284, "rouge1_recall": 0.25078858709670043, "rouge1_recall_stderr": 0.0039415090299140245, "rouge2_fmeasure": 0.04511500976844758, "rouge2_fmeasure_stderr": 0.001986723782931247, "rouge2_precision": 0.04612059160046461, "rouge2_precision_stderr": 0.00216246272019255, "rouge2_recall": 0.05136702902319479, "rouge2_recall_stderr": 0.002315418839001681, "rougeL_fmeasure": 0.16528451206309597, "rougeL_fmeasure_stderr": 0.002448179539908828, "rougeL_precision": 0.1666654493818604, "rougeL_precision_stderr": 0.002931741970841584, "rougeL_recall": 0.19015440175110393, "rougeL_recall_stderr": 0.0030722371403492073, "rougeLsum_fmeasure": 0.16744440121325288, "rougeLsum_fmeasure_stderr": 0.0024991880630578747, "rougeLsum_precision": 0.1684803863889924, "rougeLsum_precision_stderr": 0.0029515578981164007, "rougeLsum_recall": 0.19342460055959376, "rougeLsum_recall_stderr": 0.0032586706636470127}}, "3": {"article_DOC_summary": {"bleu": 2.306953746485347, "bleu_stderr": 0.08640815655966855, "rouge1_fmeasure": 0.21441285271309635, "rouge1_fmeasure_stderr": 0.0034688162829816867, "rouge1_precision": 0.21755397454942801, "rouge1_precision_stderr": 0.0040197643665198, "rouge1_recall": 0.24395196225361002, 
"rouge1_recall_stderr": 0.0041819563633223285, "rouge2_fmeasure": 0.0455324464868029, "rouge2_fmeasure_stderr": 0.0019857349620838816, "rouge2_precision": 0.04608502015312911, "rouge2_precision_stderr": 0.0021286532637906437, "rouge2_recall": 0.05204692726094356, "rouge2_recall_stderr": 0.002308596850566173, "rougeL_fmeasure": 0.159773564475991, "rougeL_fmeasure_stderr": 0.002683043459624172, "rougeL_precision": 0.16247030281934602, "rougeL_precision_stderr": 0.0031264236020145395, "rougeL_recall": 0.18178458588619653, "rougeL_recall_stderr": 0.0031938440097234347, "rougeLsum_fmeasure": 0.16297000957614646, "rougeLsum_fmeasure_stderr": 0.002742910817454915, "rougeLsum_precision": 0.16495789870371907, "rougeLsum_precision_stderr": 0.0031401120246905712, "rougeLsum_recall": 0.18683194228664585, "rougeLsum_recall_stderr": 0.0034187194500331993}}, "4": {"article_DOC_summary": {"bleu": 0.21661324901347828, "bleu_stderr": 0.07403289458610106, "rouge1_fmeasure": 0.05269608047169612, "rouge1_fmeasure_stderr": 0.003098859012482462, "rouge1_precision": 0.05735338761210453, "rouge1_precision_stderr": 0.0035071456355189675, "rouge1_recall": 0.059021496533637884, "rouge1_recall_stderr": 0.0036712685384006774, "rouge2_fmeasure": 0.010458434372266581, "rouge2_fmeasure_stderr": 0.0011178203055454239, "rouge2_precision": 0.010781643643676288, "rouge2_precision_stderr": 0.0011947055672951296, "rouge2_recall": 0.01223156205613527, "rouge2_recall_stderr": 0.0013826375879066437, "rougeL_fmeasure": 0.03966278292494188, "rougeL_fmeasure_stderr": 0.0023832058820488743, "rougeL_precision": 0.04368080562194709, "rougeL_precision_stderr": 0.002762386413099355, "rougeL_recall": 0.04437717860143795, "rougeL_recall_stderr": 0.0028397295075813945, "rougeLsum_fmeasure": 0.04062836833112145, "rougeLsum_fmeasure_stderr": 0.0024442501537786706, "rougeLsum_precision": 0.04449065152016272, "rougeLsum_precision_stderr": 0.002797505308089145, "rougeLsum_recall": 0.04583958323736499, "rougeLsum_recall_stderr": 0.0029805207884232624}}, "5": {"article_DOC_summary": {"bleu": 1.5974310709826073e-21, "bleu_stderr": 1.3100992922519676e-15, "rouge1_fmeasure": 0.0028901136641309804, "rouge1_fmeasure_stderr": 0.0007628453825537149, "rouge1_precision": 0.003544629362742575, "rouge1_precision_stderr": 0.0009485977995739016, "rouge1_recall": 0.0037550498645595996, "rouge1_recall_stderr": 0.0011352337360716696, "rouge2_fmeasure": 0.0007826161154573944, "rouge2_fmeasure_stderr": 0.0003313480806525455, "rouge2_precision": 0.0006426177000561377, "rouge2_precision_stderr": 0.00026640442181651204, "rouge2_recall": 0.0013262549819153596, "rouge2_recall_stderr": 0.0006559432301667889, "rougeL_fmeasure": 0.002389689764554573, "rougeL_fmeasure_stderr": 0.0006340237380334276, "rougeL_precision": 0.002882853322968734, "rougeL_precision_stderr": 0.0007611952222133634, "rougeL_recall": 0.003139944055164728, "rougeL_recall_stderr": 0.0009749927515076235, "rougeLsum_fmeasure": 0.00241629675514098, "rougeLsum_fmeasure_stderr": 0.0006494890213851666, "rougeLsum_precision": 0.002902333778437242, "rougeLsum_precision_stderr": 0.0007672460026879215, "rougeLsum_recall": 0.0031769694879492582, "rougeLsum_recall_stderr": 0.0010086213776490023}}}}
\ No newline at end of file
diff --git a/evaluation/generation/slim.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_0.json b/evaluation/generation/slim.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_0.json
new file mode 100644
index 0000000000000000000000000000000000000000..899a67f6e48e1566509caec6f138cbf3da616f0a
--- /dev/null
+++ b/evaluation/generation/slim.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_0.json
@@ -0,0 +1,133 @@
+{
+  "results": [
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "bleu": 0.25518818499117035,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "bleu_stderr": 0.020198929979072126
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rouge1_precision": 0.06424858701254539,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge1_precision_stderr": 0.002540662604602708
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rouge1_recall": 0.24194520608398293,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge1_recall_stderr": 0.005061888078063388
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rouge1_fmeasure": 0.08713006240052369,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge1_fmeasure_stderr": 0.0021163575152811052
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rouge2_precision": 0.029250897601530847,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge2_precision_stderr": 0.0016159150314360084
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rouge2_recall": 0.11213489324648325,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge2_recall_stderr": 0.0031431788617556264
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rouge2_fmeasure": 0.03968682582435309,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge2_fmeasure_stderr": 0.0012864152483850072
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rougeL_precision": 0.061593275111588554,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeL_precision_stderr": 0.0024488839830169046
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rougeL_recall": 0.2334118422350547,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeL_recall_stderr": 0.0048787714459462665
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rougeL_fmeasure": 0.08351980735691193,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeL_fmeasure_stderr": 0.001978263321627184
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rougeLsum_precision": 0.06070440310915625,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeLsum_precision_stderr": 0.0024423712896334553
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rougeLsum_recall": 0.22684155273254836,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeLsum_recall_stderr": 0.0046468714919551185
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rougeLsum_fmeasure": 0.08198324173226955,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeLsum_fmeasure_stderr": 0.0019772453103279655
+    }
+  ],
+  "config": {
+    "model": "hf-causal",
+    "model_args": "pretrained=/pfs/lustrep2/scratch/project_462000241/muennighoff/checkpoints_2b855boscartasky/transformers,use_accelerate=True,tokenizer=/pfs/lustrep2/scratch/project_462000241/muennighoff/gpt2,dtype=bfloat16",
+    "task_args": "",
+    "num_fewshot": 0,
+    "batch_size": 4,
+    "device": "cuda",
+    "use_cache": false,
+    "limit": 3000,
+    "bootstrap_iters": 10,
+    "seed": 1234
+  }
+}
\ No newline at end of file
diff --git a/evaluation/generation/slim.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_1.json b/evaluation/generation/slim.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_1.json
new file mode 100644
index 0000000000000000000000000000000000000000..8afd18884ccbf3cd9859773d9bf4677059b90559
--- /dev/null
+++ b/evaluation/generation/slim.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_1.json
@@ -0,0 +1,133 @@
+{
+  "results": [
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "bleu": 0.37770333810014706,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "bleu_stderr": 0.036043662376688344
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rouge1_precision": 0.08688444048786848,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge1_precision_stderr": 0.0030316834829975468
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rouge1_recall": 0.25231836055300055,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge1_recall_stderr": 0.004707834740663268
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rouge1_fmeasure": 0.11029589045724046,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge1_fmeasure_stderr": 0.0026302378640336893
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rouge2_precision": 0.039888024685296904,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge2_precision_stderr": 0.0019445292240548378
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rouge2_recall": 0.12320682798737367,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge2_recall_stderr": 0.0031690473734229127
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rouge2_fmeasure": 0.050696350493553845,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge2_fmeasure_stderr": 0.001601172935799986
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rougeL_precision": 0.08067935525490526,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeL_precision_stderr": 0.0028191568459744356
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rougeL_recall": 0.2400568623297162,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeL_recall_stderr": 0.004443711432592927
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rougeL_fmeasure": 0.10289898300711169,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeL_fmeasure_stderr": 0.002349648102435476
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rougeLsum_precision": 0.081852061098128,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeLsum_precision_stderr": 0.0028519921028834602
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rougeLsum_recall": 0.24140934346009968,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeLsum_recall_stderr": 0.0044596632225301245
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rougeLsum_fmeasure": 0.10416303503161863,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeLsum_fmeasure_stderr": 0.0024045282990398016
+    }
+  ],
+  "config": {
+    "model": "hf-causal",
+    "model_args": "pretrained=/pfs/lustrep2/scratch/project_462000241/muennighoff/checkpoints_2b855boscartasky/transformers,use_accelerate=True,tokenizer=/pfs/lustrep2/scratch/project_462000241/muennighoff/gpt2,dtype=bfloat16",
+    "task_args": "",
+    "num_fewshot": 1,
+    "batch_size": 4,
+    "device": "cuda",
+    "use_cache": false,
+    "limit": 3000,
+    "bootstrap_iters": 10,
+    "seed": 1234
+  }
+}
\ No newline at end of file
diff --git a/evaluation/generation/slim.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_2.json b/evaluation/generation/slim.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_2.json
new file mode 100644
index 0000000000000000000000000000000000000000..b0faab72aba03c845e002ef24656cf188f7acc13
--- /dev/null
+++ b/evaluation/generation/slim.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_2.json
@@ -0,0 +1,133 @@
+{
+  "results": [
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "bleu": 0.3829018871876502,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "bleu_stderr": 0.022682302500048476
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rouge1_precision": 0.09533308838612936,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge1_precision_stderr": 0.0031918959249739934
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rouge1_recall": 0.275555629679834,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge1_recall_stderr": 0.004727965781775394
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rouge1_fmeasure": 0.12236842718754035,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge1_fmeasure_stderr": 0.0030225942233901805
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rouge2_precision": 0.04438815526628443,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge2_precision_stderr": 0.0018839731929603098
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rouge2_recall": 0.13438684673727228,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge2_recall_stderr": 0.0032179061201762227
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rouge2_fmeasure": 0.05746066096268111,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge2_fmeasure_stderr": 0.0018708723435892534
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rougeL_precision": 0.08478065899787014,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeL_precision_stderr": 0.0026474348060842394
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rougeL_recall": 0.25951028904964774,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeL_recall_stderr": 0.004388247219710807
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rougeL_fmeasure": 0.11131192366136156,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeL_fmeasure_stderr": 0.0025600729892227623
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rougeLsum_precision": 0.0870012622715152,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeLsum_precision_stderr": 0.002779405775352401
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rougeLsum_recall": 0.2612564683466686,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeLsum_recall_stderr": 0.004413760869764198
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rougeLsum_fmeasure": 0.11327636810594607,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeLsum_fmeasure_stderr": 0.00265570207245623
+    }
+  ],
+  "config": {
+    "model": "hf-causal",
+    "model_args": "pretrained=/pfs/lustrep2/scratch/project_462000241/muennighoff/checkpoints_2b855boscartasky/transformers,use_accelerate=True,tokenizer=/pfs/lustrep2/scratch/project_462000241/muennighoff/gpt2,dtype=bfloat16",
+    "task_args": "",
+    "num_fewshot": 2,
+    "batch_size": 4,
+    "device": "cuda",
+    "use_cache": false,
+    "limit": 3000,
+    "bootstrap_iters": 10,
+    "seed": 1234
+  }
+}
\ No newline at end of file
diff --git a/evaluation/generation/slim.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_3.json b/evaluation/generation/slim.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_3.json
new file mode 100644
index 0000000000000000000000000000000000000000..733c35e748c07149884abf8b8509dcc1b10a7742
--- /dev/null
+++ b/evaluation/generation/slim.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_3.json
@@ -0,0 +1,133 @@
+{
+  "results": [
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "bleu": 0.49438795846003614,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "bleu_stderr": 0.03220480694116766
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rouge1_precision": 0.11102978394749591,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge1_precision_stderr": 0.0037258981840235512
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rouge1_recall": 0.29324047704391387,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge1_recall_stderr": 0.004929158066657188
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rouge1_fmeasure": 0.13661508516896614,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge1_fmeasure_stderr": 0.0033865188061428593
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rouge2_precision": 0.05489349981744968,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge2_precision_stderr": 0.002411444050666127
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rouge2_recall": 0.1460318403788494,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge2_recall_stderr": 0.0034080826963901003
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rouge2_fmeasure": 0.06675560716724324,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge2_fmeasure_stderr": 0.002152759177258534
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rougeL_precision": 0.0983510025951489,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeL_precision_stderr": 0.0031311578361653356
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rougeL_recall": 0.27396911503915883,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeL_recall_stderr": 0.004527803475867941
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rougeL_fmeasure": 0.12331965784710698,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeL_fmeasure_stderr": 0.0028547373168783547
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rougeLsum_precision": 0.10118232307247621,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeLsum_precision_stderr": 0.0032732569349238874
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rougeLsum_recall": 0.2768278339741749,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeLsum_recall_stderr": 0.004591647175413129
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rougeLsum_fmeasure": 0.1258905544050262,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeLsum_fmeasure_stderr": 0.0029667713047718076
+    }
+  ],
+  "config": {
+    "model": "hf-causal",
+    "model_args": "pretrained=/pfs/lustrep2/scratch/project_462000241/muennighoff/checkpoints_2b855boscartasky/transformers,use_accelerate=True,tokenizer=/pfs/lustrep2/scratch/project_462000241/muennighoff/gpt2,dtype=bfloat16",
+    "task_args": "",
+    "num_fewshot": 3,
+    "batch_size": 4,
+    "device": "cuda",
+    "use_cache": false,
+    "limit": 3000,
+    "bootstrap_iters": 10,
+    "seed": 1234
+  }
+}
\ No newline at end of file
diff --git a/evaluation/generation/slim.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_4.json b/evaluation/generation/slim.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_4.json
new file mode 100644
index 0000000000000000000000000000000000000000..8b1ee2893eacf1b51b9e83f9079eee79304017dd
--- /dev/null
+++ b/evaluation/generation/slim.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_4.json
@@ -0,0 +1,133 @@
+{
+  "results": [
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "bleu": 0.5097531919668136,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "bleu_stderr": 0.037846557427194026
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rouge1_precision": 0.12660445313737514,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge1_precision_stderr": 0.004069129304355684
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rouge1_recall": 0.30584134713679634,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge1_recall_stderr": 0.004938665654397721
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rouge1_fmeasure": 0.15051290928420963,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge1_fmeasure_stderr": 0.0035962245168251086
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rouge2_precision": 0.06380697763757867,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge2_precision_stderr": 0.0025616190740023713
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rouge2_recall": 0.1531395402953802,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge2_recall_stderr": 0.0033953087678648087
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rouge2_fmeasure": 0.07435510796633933,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge2_fmeasure_stderr": 0.002254816031736983
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rougeL_precision": 0.11139297723344309,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeL_precision_stderr": 0.003440135146254942
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rougeL_recall": 0.28386785189457364,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeL_recall_stderr": 0.00447862149321582
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rougeL_fmeasure": 0.13483336166859466,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeL_fmeasure_stderr": 0.0030388433299652576
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rougeLsum_precision": 0.11506907251192842,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeLsum_precision_stderr": 0.0036096573854085593
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rougeLsum_recall": 0.287989283624223,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeLsum_recall_stderr": 0.004567845010036214
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rougeLsum_fmeasure": 0.13843888381409075,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeLsum_fmeasure_stderr": 0.0031866033556944046
+    }
+  ],
+  "config": {
+    "model": "hf-causal",
+    "model_args": "pretrained=/pfs/lustrep2/scratch/project_462000241/muennighoff/checkpoints_2b855boscartasky/transformers,use_accelerate=True,tokenizer=/pfs/lustrep2/scratch/project_462000241/muennighoff/gpt2,dtype=bfloat16",
+    "task_args": "",
+    "num_fewshot": 4,
+    "batch_size": 4,
+    "device": "cuda",
+    "use_cache": false,
+    "limit": 3000,
+    "bootstrap_iters": 10,
+    "seed": 1234
+  }
+}
\ No newline at end of file
diff --git a/evaluation/generation/slim.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_5.json b/evaluation/generation/slim.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_5.json
new file mode 100644
index 0000000000000000000000000000000000000000..24fb9e869f68c2529fc19ba9c81d51fefff438a7
--- /dev/null
+++ b/evaluation/generation/slim.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_5.json
@@ -0,0 +1,133 @@
+{
+  "results": [
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "bleu": 0.5493345438208933,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "bleu_stderr": 0.03156507075706306
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rouge1_precision": 0.13145167799279456,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge1_precision_stderr": 0.004208017486529403
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rouge1_recall": 0.31431356632933516,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge1_recall_stderr": 0.004912309978055554
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rouge1_fmeasure": 0.15523792842972664,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge1_fmeasure_stderr": 0.003705043227083823
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rouge2_precision": 0.06682039356587426,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge2_precision_stderr": 0.0026734322118555824
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rouge2_recall": 0.1593483911606193,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge2_recall_stderr": 0.003488245506035694
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rouge2_fmeasure": 0.07729991696510335,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge2_fmeasure_stderr": 0.002362198278661277
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rougeL_precision": 0.11521584250075316,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeL_precision_stderr": 0.0035483893462582828
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rougeL_recall": 0.2911824630349328,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeL_recall_stderr": 0.00449934920077821
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rougeL_fmeasure": 0.13867709585369095,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeL_fmeasure_stderr": 0.0031268375342873036
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rougeLsum_precision": 0.11975844765120187,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeLsum_precision_stderr": 0.0037510579951047463
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rougeLsum_recall": 0.29608094482109,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeLsum_recall_stderr": 0.00457219328093606
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rougeLsum_fmeasure": 0.1428199679819932,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeLsum_fmeasure_stderr": 0.0032823766236227112
+    }
+  ],
+  "config": {
+    "model": "hf-causal",
+    "model_args": "pretrained=/pfs/lustrep2/scratch/project_462000241/muennighoff/checkpoints_2b855boscartasky/transformers,use_accelerate=True,tokenizer=/pfs/lustrep2/scratch/project_462000241/muennighoff/gpt2,dtype=bfloat16",
+    "task_args": "",
+    "num_fewshot": 5,
+    "batch_size": 4,
+    "device": "cuda",
+    "use_cache": false,
+    "limit": 3000,
+    "bootstrap_iters": 10,
+    "seed": 1234
+  }
+}
\ No newline at end of file
diff --git a/evaluation/generation/slim.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_0.json b/evaluation/generation/slim.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_0.json
new file mode 100644
index 0000000000000000000000000000000000000000..fead82bca5365812570f1a4223a4e01966afd9b8
--- /dev/null
+++ b/evaluation/generation/slim.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_0.json
@@ -0,0 +1,133 @@
+{
+  "results": [
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rouge1_precision": 0.15983607856511065,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge1_precision_stderr": 0.0025552324547594915
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rouge1_recall": 0.22751168503589744,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge1_recall_stderr": 0.0030756037547284122
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rouge1_fmeasure": 0.17176517291666785,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge1_fmeasure_stderr": 0.002329017824094656
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rouge2_precision": 0.03630672989355793,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge2_precision_stderr": 0.0010196024456578525
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rouge2_recall": 0.052475014894037275,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge2_recall_stderr": 0.001428315830366114
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rouge2_fmeasure": 0.03932636998031157,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge2_fmeasure_stderr": 0.0009987340137274396
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rougeL_precision": 0.12519971826411405,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeL_precision_stderr": 0.0019474345957730163
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rougeL_recall": 0.1828331784278453,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeL_recall_stderr": 0.002460695997235263
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rougeL_fmeasure": 0.13522236510382263,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeL_fmeasure_stderr": 0.001715199895595129
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rougeLsum_precision": 0.1479787200014008,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeLsum_precision_stderr": 0.0024084994322680275
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rougeLsum_recall": 0.21009537033174597,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeLsum_recall_stderr": 0.0028681840700951855
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rougeLsum_fmeasure": 0.15863624899200707,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeLsum_fmeasure_stderr": 0.0021698433809270567
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "bleu": 2.2908341082810773,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "bleu_stderr": 0.08591697554573632
+    }
+  ],
+  "config": {
+    "model": "hf-causal",
+    "model_args": "pretrained=/pfs/lustrep2/scratch/project_462000241/muennighoff/checkpoints_2b855boscartasky/transformers,use_accelerate=True,tokenizer=/pfs/lustrep2/scratch/project_462000241/muennighoff/gpt2,dtype=bfloat16",
+    "task_args": "",
+    "num_fewshot": 0,
+    "batch_size": 4,
+    "device": "cuda",
+    "use_cache": false,
+    "limit": 3000,
+    "bootstrap_iters": 10,
+    "seed": 1234
+  }
+}
\ No newline at end of file
diff --git a/evaluation/generation/slim.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_1.json b/evaluation/generation/slim.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_1.json
new file mode 100644
index 0000000000000000000000000000000000000000..71a202cf9f6c2e23bb5470a14503427d59bc2754
--- /dev/null
+++ b/evaluation/generation/slim.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_1.json
@@ -0,0 +1,133 @@
+{
+  "results": [
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rouge1_precision": 0.2074831448217252,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge1_precision_stderr": 0.003524413251335538
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rouge1_recall": 0.1603567541864711,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge1_recall_stderr": 0.002358874310291858
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rouge1_fmeasure": 0.1500600406332044,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge1_fmeasure_stderr": 0.0019241989868486221
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rouge2_precision": 0.048193206661683564,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge2_precision_stderr": 0.0020379873823770994
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rouge2_recall": 0.030329812370434826,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge2_recall_stderr": 0.0011532237790344163
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rouge2_fmeasure": 0.02928962923956642,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge2_fmeasure_stderr": 0.000993776980137765
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rougeL_precision": 0.16593034599476272,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeL_precision_stderr": 0.002952779433549126
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rougeL_recall": 0.1263395938894177,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeL_recall_stderr": 0.0018438888158069336
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rougeL_fmeasure": 0.11776814806946785,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeL_fmeasure_stderr": 0.0014763519012599205
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rougeLsum_precision": 0.19732631307428722,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeLsum_precision_stderr": 0.0034023899501457837
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rougeLsum_recall": 0.15136030413475798,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeLsum_recall_stderr": 0.0021926321158690864
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rougeLsum_fmeasure": 0.14183665338988743,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeLsum_fmeasure_stderr": 0.001800224756397767
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "bleu": 1.8024864455011091,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "bleu_stderr": 0.09003925624860501
+    }
+  ],
+  "config": {
+    "model": "hf-causal",
+    "model_args": "pretrained=/pfs/lustrep2/scratch/project_462000241/muennighoff/checkpoints_2b855boscartasky/transformers,use_accelerate=True,tokenizer=/pfs/lustrep2/scratch/project_462000241/muennighoff/gpt2,dtype=bfloat16",
+    "task_args": "",
+    "num_fewshot": 1,
+    "batch_size": 4,
+    "device": "cuda",
+    "use_cache": false,
+    "limit": 3000,
+    "bootstrap_iters": 10,
+    "seed": 1234
+  }
+}
\ No newline at end of file
diff --git a/evaluation/generation/slim.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_2.json b/evaluation/generation/slim.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_2.json
new file mode 100644
index 0000000000000000000000000000000000000000..c5a422ddef163dd0bf03c24a9ef7b70f3fffbe94
--- /dev/null
+++ b/evaluation/generation/slim.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_2.json
@@ -0,0 +1,133 @@
+{
+  "results": [
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rouge1_precision": 0.28596147697474456,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge1_precision_stderr": 0.0040250344618331995
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rouge1_recall": 0.1993639467154353,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge1_recall_stderr": 0.002664319477833243
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rouge1_fmeasure": 0.1947292213306491,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge1_fmeasure_stderr": 0.0021918313174583916
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rouge2_precision": 0.08090209451722337,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge2_precision_stderr": 0.0024314880977671276
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rouge2_recall": 0.05022891340202324,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge2_recall_stderr": 0.0013490243270254702
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rouge2_fmeasure": 0.049895782057071895,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge2_fmeasure_stderr": 0.0012048654152258728
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rougeL_precision": 0.22915557741483644,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeL_precision_stderr": 0.0033926337363906784
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rougeL_recall": 0.15804196664911024,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeL_recall_stderr": 0.002108622273540943
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rougeL_fmeasure": 0.15366314427734978,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeL_fmeasure_stderr": 0.0016931718188903863
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rougeLsum_precision": 0.2703085422322766,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeLsum_precision_stderr": 0.003864247203591127
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rougeLsum_recall": 0.18781780308286916,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeLsum_recall_stderr": 0.0025166845226557905
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rougeLsum_fmeasure": 0.18325876600451932,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeLsum_fmeasure_stderr": 0.0020600409806209663
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "bleu": 2.9479127015875357,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "bleu_stderr": 0.07796616677370145
+    }
+  ],
+  "config": {
+    "model": "hf-causal",
+    "model_args": "pretrained=/pfs/lustrep2/scratch/project_462000241/muennighoff/checkpoints_2b855boscartasky/transformers,use_accelerate=True,tokenizer=/pfs/lustrep2/scratch/project_462000241/muennighoff/gpt2,dtype=bfloat16",
+    "task_args": "",
+    "num_fewshot": 2,
+    "batch_size": 4,
+    "device": "cuda",
+    "use_cache": false,
+    "limit": 3000,
+    "bootstrap_iters": 10,
+    "seed": 1234
+  }
+}
\ No newline at end of file
diff --git a/evaluation/generation/slim.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_3.json b/evaluation/generation/slim.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_3.json
new file mode 100644
index 0000000000000000000000000000000000000000..dd15f737ad78402fc7ae994d0c319a85a3dfd4cd
--- /dev/null
+++ b/evaluation/generation/slim.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_3.json
@@ -0,0 +1,133 @@
+{
+  "results": [
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rouge1_precision": 0.26271995584299296,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge1_precision_stderr": 0.004350571081047754
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rouge1_recall": 0.17195910512668672,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge1_recall_stderr": 0.0028930003693202434
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rouge1_fmeasure": 0.17047407122919256,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge1_fmeasure_stderr": 0.0024381191437714608
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rouge2_precision": 0.07716157594080365,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge2_precision_stderr": 0.0025215008865493967
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rouge2_recall": 0.04592292328627891,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge2_recall_stderr": 0.0014011960521198414
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rouge2_fmeasure": 0.04580407896668463,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge2_fmeasure_stderr": 0.001240136840680315
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rougeL_precision": 0.2129040576929948,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeL_precision_stderr": 0.0036832280907527527
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rougeL_recall": 0.13748507108736507,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeL_recall_stderr": 0.0023122364311355848
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rougeL_fmeasure": 0.1358429496475672,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeL_fmeasure_stderr": 0.0019120675392480628
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rougeLsum_precision": 0.24858343528470517,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeLsum_precision_stderr": 0.0041894189677485535
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rougeLsum_recall": 0.1614787885865035,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeLsum_recall_stderr": 0.0027151405967413447
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rougeLsum_fmeasure": 0.16017361729618249,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeLsum_fmeasure_stderr": 0.0022880671169300927
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "bleu": 2.368296526557304,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "bleu_stderr": 0.06715896563436964
+    }
+  ],
+  "config": {
+    "model": "hf-causal",
+    "model_args": "pretrained=/pfs/lustrep2/scratch/project_462000241/muennighoff/checkpoints_2b855boscartasky/transformers,use_accelerate=True,tokenizer=/pfs/lustrep2/scratch/project_462000241/muennighoff/gpt2,dtype=bfloat16",
+    "task_args": "",
+    "num_fewshot": 3,
+    "batch_size": 4,
+    "device": "cuda",
+    "use_cache": false,
+    "limit": 3000,
+    "bootstrap_iters": 10,
+    "seed": 1234
+  }
+}
\ No newline at end of file
diff --git a/evaluation/generation/slim.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_4.json b/evaluation/generation/slim.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_4.json
new file mode 100644
index 0000000000000000000000000000000000000000..48cec1ae8894e63fec45fd9a03f72c502a5ac575
--- /dev/null
+++ b/evaluation/generation/slim.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_4.json
@@ -0,0 +1,133 @@
+{
+  "results": [
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rouge1_precision": 0.09226808587029187,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge1_precision_stderr": 0.0035951645035662144
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rouge1_recall": 0.05378214914394185,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge1_recall_stderr": 0.0021922619377159835
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rouge1_fmeasure": 0.05527677687040673,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge1_fmeasure_stderr": 0.002075334915597304
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rouge2_precision": 0.028532578970617953,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge2_precision_stderr": 0.0018587749480454441
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rouge2_recall": 0.01548143136364689,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge2_recall_stderr": 0.0010135388288224974
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rouge2_fmeasure": 0.015865483046525946,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge2_fmeasure_stderr": 0.0009374894411648104
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rougeL_precision": 0.07725545552877301,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeL_precision_stderr": 0.0031119081702266804
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rougeL_recall": 0.04393223408477368,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeL_recall_stderr": 0.0018056545763819988
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rougeL_fmeasure": 0.04511311086771814,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeL_fmeasure_stderr": 0.0016959499204821453
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rougeLsum_precision": 0.08723167275286887,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeLsum_precision_stderr": 0.003438050373646056
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rougeLsum_recall": 0.05010740359653005,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeLsum_recall_stderr": 0.0020446842446827317
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rougeLsum_fmeasure": 0.05162745766582887,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeLsum_fmeasure_stderr": 0.0019361443382161162
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "bleu": 0.03702293319391888,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "bleu_stderr": 0.008280750551799155
+    }
+  ],
+  "config": {
+    "model": "hf-causal",
+    "model_args": "pretrained=/pfs/lustrep2/scratch/project_462000241/muennighoff/checkpoints_2b855boscartasky/transformers,use_accelerate=True,tokenizer=/pfs/lustrep2/scratch/project_462000241/muennighoff/gpt2,dtype=bfloat16",
+    "task_args": "",
+    "num_fewshot": 4,
+    "batch_size": 4,
+    "device": "cuda",
+    "use_cache": false,
+    "limit": 3000,
+    "bootstrap_iters": 10,
+    "seed": 1234
+  }
+}
\ No newline at end of file
diff --git a/evaluation/generation/slim.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_5.json b/evaluation/generation/slim.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_5.json
new file mode 100644
index 0000000000000000000000000000000000000000..25a16d859abdd87d41b0b77f99ec6495bb4327e7
--- /dev/null
+++ b/evaluation/generation/slim.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_5.json
@@ -0,0 +1,133 @@
+{
+  "results": [
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rouge1_precision": 0.01624748312843043,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge1_precision_stderr": 0.0017343102214544073
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rouge1_recall": 0.008973352017838115,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge1_recall_stderr": 0.0009874748509305067
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rouge1_fmeasure": 0.008715725437140403,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge1_fmeasure_stderr": 0.0008716993431018379
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rouge2_precision": 0.005494197830878659,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge2_precision_stderr": 0.000991214742335091
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rouge2_recall": 0.002558561959956041,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge2_recall_stderr": 0.0003855003265324329
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rouge2_fmeasure": 0.002496646429114283,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge2_fmeasure_stderr": 0.00036357062963770885
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rougeL_precision": 0.014045751930703699,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeL_precision_stderr": 0.0015619179482028879
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rougeL_recall": 0.0075965284642917335,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeL_recall_stderr": 0.000846021103064854
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rougeL_fmeasure": 0.0072939865630402565,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeL_fmeasure_stderr": 0.0007275988588633965
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rougeLsum_precision": 0.015553958884576207,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeLsum_precision_stderr": 0.001678423693533492
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rougeLsum_recall": 0.008515152310636652,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeLsum_recall_stderr": 0.0009413374007224053
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rougeLsum_fmeasure": 0.008294111648957665,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeLsum_fmeasure_stderr": 0.0008329997418165786
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "bleu": 6.329549235648946e-15,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "bleu_stderr": 6.98643056969485e-14
+    }
+  ],
+  "config": {
+    "model": "hf-causal",
+    "model_args": "pretrained=/pfs/lustrep2/scratch/project_462000241/muennighoff/checkpoints_2b855boscartasky/transformers,use_accelerate=True,tokenizer=/pfs/lustrep2/scratch/project_462000241/muennighoff/gpt2,dtype=bfloat16",
+    "task_args": "",
+    "num_fewshot": 5,
+    "batch_size": 4,
+    "device": "cuda",
+    "use_cache": false,
+    "limit": 3000,
+    "bootstrap_iters": 10,
+    "seed": 1234
+  }
+}
\ No newline at end of file
diff --git a/evaluation/generation/slim.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_0.json b/evaluation/generation/slim.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_0.json
new file mode 100644
index 0000000000000000000000000000000000000000..8fe31219456c71826bcb7efde3640348f94db2c4
--- /dev/null
+++ b/evaluation/generation/slim.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_0.json
@@ -0,0 +1,133 @@
+{
+  "results": [
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "bleu": 3.4385508172058055,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "bleu_stderr": 0.04264456511068813
+    },
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "rouge1_precision": 0.3745168009518315,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "rouge1_precision_stderr": 0.0054770341977520625
+    },
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "rouge1_recall": 0.29958589610084396,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "rouge1_recall_stderr": 0.0033801849814915222
+    },
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "rouge1_fmeasure": 0.2767986999806955,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "rouge1_fmeasure_stderr": 0.0032477671776355997
+    },
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "rouge2_precision": 0.20297808777795995,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "rouge2_precision_stderr": 0.005522822982953256
+    },
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "rouge2_recall": 0.10216853633525805,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "rouge2_recall_stderr": 0.0019971383737075166
+    },
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "rouge2_fmeasure": 0.097088577058388,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "rouge2_fmeasure_stderr": 0.0017064511604233982
+    },
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "rougeL_precision": 0.29931908976323446,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "rougeL_precision_stderr": 0.005159547193434527
+    },
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "rougeL_recall": 0.2284780514712017,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "rougeL_recall_stderr": 0.0023634097297777477
+    },
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "rougeL_fmeasure": 0.20603167711573117,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "rougeL_fmeasure_stderr": 0.002072804178131474
+    },
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "rougeLsum_precision": 0.34463645738569776,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "rougeLsum_precision_stderr": 0.005423162557411763
+    },
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "rougeLsum_recall": 0.2657387698192473,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "rougeLsum_recall_stderr": 0.003165191193253911
+    },
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "rougeLsum_fmeasure": 0.2467760500253612,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "rougeLsum_fmeasure_stderr": 0.00298962192866607
+    }
+  ],
+  "config": {
+    "model": "hf-causal",
+    "model_args": "pretrained=/pfs/lustrep2/scratch/project_462000241/muennighoff/checkpoints_2b855boscartasky/transformers,use_accelerate=True,tokenizer=/pfs/lustrep2/scratch/project_462000241/muennighoff/gpt2,dtype=bfloat16",
+    "task_args": "",
+    "num_fewshot": 0,
+    "batch_size": 4,
+    "device": "cuda",
+    "use_cache": false,
+    "limit": 3000,
+    "bootstrap_iters": 10,
+    "seed": 1234
+  }
+}
\ No newline at end of file
diff --git a/evaluation/generation/slim.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_1.json b/evaluation/generation/slim.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_1.json
new file mode 100644
index 0000000000000000000000000000000000000000..71694612287b1ba8742d8900b6a2c13d140c75c1
--- /dev/null
+++ b/evaluation/generation/slim.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_1.json
@@ -0,0 +1,133 @@
+{
+  "results": [
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "bleu": 11.578509877350227,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "bleu_stderr": 0.10981566633879294
+    },
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "rouge1_precision": 0.5290497807915124,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "rouge1_precision_stderr": 0.0034031031148912877
+    },
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "rouge1_recall": 0.42449982582372703,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "rouge1_recall_stderr": 0.0029796948529652516
+    },
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "rouge1_fmeasure": 0.4430699483900025,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "rouge1_fmeasure_stderr": 0.0024212750351764543
+    },
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "rouge2_precision": 0.24509073269483966,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "rouge2_precision_stderr": 0.0026669698385618284
+    },
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "rouge2_recall": 0.19440129139428716,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "rouge2_recall_stderr": 0.0021684401687693156
+    },
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "rouge2_fmeasure": 0.2028107162678877,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "rouge2_fmeasure_stderr": 0.0020310061478079093
+    },
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "rougeL_precision": 0.38881065801490833,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "rougeL_precision_stderr": 0.0029605217311991657
+    },
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "rougeL_recall": 0.31090092383731366,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "rougeL_recall_stderr": 0.002452100105085375
+    },
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "rougeL_fmeasure": 0.324119071682265,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "rougeL_fmeasure_stderr": 0.0020761088086072486
+    },
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "rougeLsum_precision": 0.43465610390683307,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "rougeLsum_precision_stderr": 0.0032381620072339475
+    },
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "rougeLsum_recall": 0.348028905687337,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "rougeLsum_recall_stderr": 0.00275713194366831
+    },
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "rougeLsum_fmeasure": 0.36322533452022043,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "rougeLsum_fmeasure_stderr": 0.002355969259137101
+    }
+  ],
+  "config": {
+    "model": "hf-causal",
+    "model_args": "pretrained=/pfs/lustrep2/scratch/project_462000241/muennighoff/checkpoints_2b855boscartasky/transformers,use_accelerate=True,tokenizer=/pfs/lustrep2/scratch/project_462000241/muennighoff/gpt2,dtype=bfloat16",
+    "task_args": "",
+    "num_fewshot": 1,
+    "batch_size": 4,
+    "device": "cuda",
+    "use_cache": false,
+    "limit": 3000,
+    "bootstrap_iters": 10,
+    "seed": 1234
+  }
+}
\ No newline at end of file
diff --git a/evaluation/generation/slim.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_2.json b/evaluation/generation/slim.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_2.json
new file mode 100644
index 0000000000000000000000000000000000000000..0aa21797d0bdce7a245a7acfb74eefcb0c0f7994
--- /dev/null
+++ b/evaluation/generation/slim.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_2.json
@@ -0,0 +1,133 @@
+{
+  "results": [
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "bleu": 13.973742807059493,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "bleu_stderr": 0.21933451006645593
+    },
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "rouge1_precision": 0.5463581275257614,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "rouge1_precision_stderr": 0.0033109746053900106
+    },
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "rouge1_recall": 0.4599780026221357,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "rouge1_recall_stderr": 0.002866451132503217
+    },
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "rouge1_fmeasure": 0.4734319432946387,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "rouge1_fmeasure_stderr": 0.002277730687363045
+    },
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "rouge2_precision": 0.26984908227570326,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "rouge2_precision_stderr": 0.0026581113600308783
+    },
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "rouge2_recall": 0.22503111854291266,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "rouge2_recall_stderr": 0.0022246934402712576
+    },
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "rouge2_fmeasure": 0.23112076704320592,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "rouge2_fmeasure_stderr": 0.002029026468778622
+    },
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "rougeL_precision": 0.40847493701179866,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "rougeL_precision_stderr": 0.0029328594326759366
+    },
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "rougeL_recall": 0.3432828614570734,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "rougeL_recall_stderr": 0.0024768383332246274
+    },
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "rougeL_fmeasure": 0.35285117962076507,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "rougeL_fmeasure_stderr": 0.002056984438278394
+    },
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "rougeLsum_precision": 0.4597128692889795,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "rougeLsum_precision_stderr": 0.003192730444475765
+    },
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "rougeLsum_recall": 0.38674562932968487,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "rougeLsum_recall_stderr": 0.002735034124705561
+    },
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "rougeLsum_fmeasure": 0.3979842693641427,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "rougeLsum_fmeasure_stderr": 0.002298871731116783
+    }
+  ],
+  "config": {
+    "model": "hf-causal",
+    "model_args": "pretrained=/pfs/lustrep2/scratch/project_462000241/muennighoff/checkpoints_2b855boscartasky/transformers,use_accelerate=True,tokenizer=/pfs/lustrep2/scratch/project_462000241/muennighoff/gpt2,dtype=bfloat16",
+    "task_args": "",
+    "num_fewshot": 2,
+    "batch_size": 4,
+    "device": "cuda",
+    "use_cache": false,
+    "limit": 3000,
+    "bootstrap_iters": 10,
+    "seed": 1234
+  }
+}
\ No newline at end of file
diff --git a/evaluation/generation/slim.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_3.json b/evaluation/generation/slim.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_3.json
new file mode 100644
index 0000000000000000000000000000000000000000..f329a888a64b20389cdc9a64a333233f30f9a5db
--- /dev/null
+++ b/evaluation/generation/slim.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_3.json
@@ -0,0 +1,133 @@
+{
+  "results": [
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "bleu": 14.533287312181702,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "bleu_stderr": 0.18656720619703301
+    },
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "rouge1_precision": 0.5451265264934796,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "rouge1_precision_stderr": 0.003147288201601649
+    },
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "rouge1_recall": 0.4635452060391057,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "rouge1_recall_stderr": 0.002820441604942112
+    },
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "rouge1_fmeasure": 0.478142190779417,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "rouge1_fmeasure_stderr": 0.002235263328285793
+    },
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "rouge2_precision": 0.2714328734244194,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "rouge2_precision_stderr": 0.002583914662030242
+    },
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "rouge2_recall": 0.23027470724704344,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "rouge2_recall_stderr": 0.0022949497645592134
+    },
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "rouge2_fmeasure": 0.23634052735419925,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "rouge2_fmeasure_stderr": 0.0020601993739155743
+    },
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "rougeL_precision": 0.40976835157500135,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "rougeL_precision_stderr": 0.002821032707227765
+    },
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "rougeL_recall": 0.3479477017249916,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "rougeL_recall_stderr": 0.002470400553707357
+    },
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "rougeL_fmeasure": 0.3585015271693414,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "rougeL_fmeasure_stderr": 0.002048646341232902
+    },
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "rougeLsum_precision": 0.46121561084482127,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "rougeLsum_precision_stderr": 0.003066679903899239
+    },
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "rougeLsum_recall": 0.3924920651465138,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "rougeLsum_recall_stderr": 0.0027314711241234945
+    },
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "rougeLsum_fmeasure": 0.404637798062588,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "rougeLsum_fmeasure_stderr": 0.002300343226701871
+    }
+  ],
+  "config": {
+    "model": "hf-causal",
+    "model_args": "pretrained=/pfs/lustrep2/scratch/project_462000241/muennighoff/checkpoints_2b855boscartasky/transformers,use_accelerate=True,tokenizer=/pfs/lustrep2/scratch/project_462000241/muennighoff/gpt2,dtype=bfloat16",
+    "task_args": "",
+    "num_fewshot": 3,
+    "batch_size": 4,
+    "device": "cuda",
+    "use_cache": false,
+    "limit": 3000,
+    "bootstrap_iters": 10,
+    "seed": 1234
+  }
+}
\ No newline at end of file
diff --git a/evaluation/generation/slim.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_4.json b/evaluation/generation/slim.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_4.json
new file mode 100644
index 0000000000000000000000000000000000000000..e974a06875bff953f656d2b43f738cb831ea77f5
--- /dev/null
+++ b/evaluation/generation/slim.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_4.json
@@ -0,0 +1,133 @@
+{
+  "results": [
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "bleu": 14.788126595166384,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "bleu_stderr": 0.18690040009176595
+    },
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "rouge1_precision": 0.5482226243244983,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "rouge1_precision_stderr": 0.0032124524377117303
+    },
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "rouge1_recall": 0.46345359030928057,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "rouge1_recall_stderr": 0.0028329342309959572
+    },
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "rouge1_fmeasure": 0.48045922670165075,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "rouge1_fmeasure_stderr": 0.002311569925954736
+    },
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "rouge2_precision": 0.27446812632372075,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "rouge2_precision_stderr": 0.0026299321970613995
+    },
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "rouge2_recall": 0.2307927377768704,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "rouge2_recall_stderr": 0.002303932783753952
+    },
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "rouge2_fmeasure": 0.23850438072833094,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "rouge2_fmeasure_stderr": 0.002104880750018495
+    },
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "rougeL_precision": 0.41268229797839345,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "rougeL_precision_stderr": 0.002885772570892295
+    },
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "rougeL_recall": 0.34835409226615954,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "rougeL_recall_stderr": 0.0024733395312169725
+    },
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "rougeL_fmeasure": 0.36091724958116495,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "rougeL_fmeasure_stderr": 0.0021212744636126225
+    },
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "rougeLsum_precision": 0.46563464741217186,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "rougeLsum_precision_stderr": 0.0031424166236039795
+    },
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "rougeLsum_recall": 0.39431106312311454,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "rougeLsum_recall_stderr": 0.0027657116569750795
+    },
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "rougeLsum_fmeasure": 0.4083787027633123,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "rougeLsum_fmeasure_stderr": 0.002378318389395946
+    }
+  ],
+  "config": {
+    "model": "hf-causal",
+    "model_args": "pretrained=/pfs/lustrep2/scratch/project_462000241/muennighoff/checkpoints_2b855boscartasky/transformers,use_accelerate=True,tokenizer=/pfs/lustrep2/scratch/project_462000241/muennighoff/gpt2,dtype=bfloat16",
+    "task_args": "",
+    "num_fewshot": 4,
+    "batch_size": 4,
+    "device": "cuda",
+    "use_cache": false,
+    "limit": 3000,
+    "bootstrap_iters": 10,
+    "seed": 1234
+  }
+}
\ No newline at end of file
diff --git a/evaluation/generation/slim.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_5.json b/evaluation/generation/slim.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_5.json
new file mode 100644
index 0000000000000000000000000000000000000000..35669a61e5dcbda114beee81bf391da1cab44e6e
--- /dev/null
+++ b/evaluation/generation/slim.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_5.json
@@ -0,0 +1,133 @@
+{
+  "results": [
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "bleu": 14.816898114308659,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "bleu_stderr": 0.2578943709075645
+    },
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "rouge1_precision": 0.5463965076188966,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "rouge1_precision_stderr": 0.0031073173050291266
+    },
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "rouge1_recall": 0.464786934938444,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "rouge1_recall_stderr": 0.0027602961923400638
+    },
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "rouge1_fmeasure": 0.4813258409710542,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "rouge1_fmeasure_stderr": 0.0022155647522670184
+    },
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "rouge2_precision": 0.2741684381980198,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "rouge2_precision_stderr": 0.002611976915331882
+    },
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "rouge2_recall": 0.23130694014233213,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "rouge2_recall_stderr": 0.002261839435649896
+    },
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "rouge2_fmeasure": 0.23912604624839506,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "rouge2_fmeasure_stderr": 0.002085017928004898
+    },
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "rougeL_precision": 0.4127791623423059,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "rougeL_precision_stderr": 0.0027864194329252517
+    },
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "rougeL_recall": 0.35037713824014954,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "rougeL_recall_stderr": 0.002413674972879658
+    },
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "rougeL_fmeasure": 0.3627614127879427,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "rougeL_fmeasure_stderr": 0.002049692110741263
+    },
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "rougeLsum_precision": 0.4665920267581706,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "rougeLsum_precision_stderr": 0.0030279579394666637
+    },
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "rougeLsum_recall": 0.3970310588904633,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "rougeLsum_recall_stderr": 0.0026745772949146518
+    },
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "rougeLsum_fmeasure": 0.41096147149083767,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "rougeLsum_fmeasure_stderr": 0.0022723564955560572
+    }
+  ],
+  "config": {
+    "model": "hf-causal",
+    "model_args": "pretrained=/pfs/lustrep2/scratch/project_462000241/muennighoff/checkpoints_2b855boscartasky/transformers,use_accelerate=True,tokenizer=/pfs/lustrep2/scratch/project_462000241/muennighoff/gpt2,dtype=bfloat16",
+    "task_args": "",
+    "num_fewshot": 5,
+    "batch_size": 4,
+    "device": "cuda",
+    "use_cache": false,
+    "limit": 3000,
+    "bootstrap_iters": 10,
+    "seed": 1234
+  }
+}
\ No newline at end of file
diff --git a/evaluation/generation/slim.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_0.json b/evaluation/generation/slim.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_0.json
new file mode 100644
index 0000000000000000000000000000000000000000..5bce7e6da54e0e349d63ba5520e026765dc5f3c0
--- /dev/null
+++ b/evaluation/generation/slim.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_0.json
@@ -0,0 +1,133 @@
+{
+  "results": [
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rouge1_precision": 0.13515325881990387,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rouge1_precision_stderr": 0.001988006386809129
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rouge1_recall": 0.3082267363959881,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rouge1_recall_stderr": 0.004580722609646003
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rouge1_fmeasure": 0.18486846991878378,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rouge1_fmeasure_stderr": 0.0026472370081744873
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rouge2_precision": 0.0272859934174315,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rouge2_precision_stderr": 0.000998594442193996
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rouge2_recall": 0.06551213025624243,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rouge2_recall_stderr": 0.0024545590859311856
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rouge2_fmeasure": 0.03791043043165935,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rouge2_fmeasure_stderr": 0.0013771627135441805
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rougeL_precision": 0.10194380290336473,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rougeL_precision_stderr": 0.0014645033080314515
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rougeL_recall": 0.23262788209228538,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rougeL_recall_stderr": 0.003415563039987054
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rougeL_fmeasure": 0.13935422276606838,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rougeL_fmeasure_stderr": 0.0019388171493750941
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rougeLsum_precision": 0.10728605862144046,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rougeLsum_precision_stderr": 0.0016390380124941084
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rougeLsum_recall": 0.24560801323409917,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rougeLsum_recall_stderr": 0.0038465518145899083
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rougeLsum_fmeasure": 0.14683463366193603,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rougeLsum_fmeasure_stderr": 0.0021884358384104898
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "bleu": 1.42436283894258,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "bleu_stderr": 0.0809652794130641
+    }
+  ],
+  "config": {
+    "model": "hf-causal",
+    "model_args": "pretrained=/pfs/lustrep2/scratch/project_462000241/muennighoff/checkpoints_2b855boscartasky/transformers,use_accelerate=True,tokenizer=/pfs/lustrep2/scratch/project_462000241/muennighoff/gpt2,dtype=bfloat16",
+    "task_args": "",
+    "num_fewshot": 0,
+    "batch_size": 4,
+    "device": "cuda",
+    "use_cache": false,
+    "limit": 3000,
+    "bootstrap_iters": 10,
+    "seed": 1234
+  }
+}
\ No newline at end of file
diff --git a/evaluation/generation/slim.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_1.json b/evaluation/generation/slim.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_1.json
new file mode 100644
index 0000000000000000000000000000000000000000..99f08ebb856784932f6b2731b5b50690578d72d5
--- /dev/null
+++ b/evaluation/generation/slim.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_1.json
@@ -0,0 +1,133 @@
+{
+  "results": [
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rouge1_precision": 0.18034557721867206,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rouge1_precision_stderr": 0.003378966024270665
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rouge1_recall": 0.25165661054563454,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rouge1_recall_stderr": 0.004219363793342944
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rouge1_fmeasure": 0.19377928742866415,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rouge1_fmeasure_stderr": 0.0029318979888479386
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rouge2_precision": 0.03174129360340327,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rouge2_precision_stderr": 0.0016980099731496725
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rouge2_recall": 0.04525380270312792,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rouge2_recall_stderr": 0.0021325289082326157
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rouge2_fmeasure": 0.03391705069355955,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rouge2_fmeasure_stderr": 0.001600827686299807
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rougeL_precision": 0.1348915392123015,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rougeL_precision_stderr": 0.002633227095382269
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rougeL_recall": 0.18737022577489543,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rougeL_recall_stderr": 0.0031416529379445714
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rougeL_fmeasure": 0.14433798493238226,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rougeL_fmeasure_stderr": 0.0022145818855320775
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rougeLsum_precision": 0.1387175358944472,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rougeLsum_precision_stderr": 0.0026549430237383285
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rougeLsum_recall": 0.19658579721494118,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rougeLsum_recall_stderr": 0.003531925441008354
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rougeLsum_fmeasure": 0.14964383560266273,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rougeLsum_fmeasure_stderr": 0.002331396812470961
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "bleu": 1.5072643038464826,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "bleu_stderr": 0.07581189432866778
+    }
+  ],
+  "config": {
+    "model": "hf-causal",
+    "model_args": "pretrained=/pfs/lustrep2/scratch/project_462000241/muennighoff/checkpoints_2b855boscartasky/transformers,use_accelerate=True,tokenizer=/pfs/lustrep2/scratch/project_462000241/muennighoff/gpt2,dtype=bfloat16",
+    "task_args": "",
+    "num_fewshot": 1,
+    "batch_size": 4,
+    "device": "cuda",
+    "use_cache": false,
+    "limit": 3000,
+    "bootstrap_iters": 10,
+    "seed": 1234
+  }
+}
\ No newline at end of file
diff --git a/evaluation/generation/slim.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_2.json b/evaluation/generation/slim.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_2.json
new file mode 100644
index 0000000000000000000000000000000000000000..c60b9194a317b4913205810ab78c9e902fbf116d
--- /dev/null
+++ b/evaluation/generation/slim.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_2.json
@@ -0,0 +1,133 @@
+{
+  "results": [
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rouge1_precision": 0.22068687589610897,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rouge1_precision_stderr": 0.003789951247255284
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rouge1_recall": 0.25078858709670043,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rouge1_recall_stderr": 0.0039415090299140245
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rouge1_fmeasure": 0.21899687156092093,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rouge1_fmeasure_stderr": 0.0031785578491904183
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rouge2_precision": 0.04612059160046461,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rouge2_precision_stderr": 0.00216246272019255
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rouge2_recall": 0.05136702902319479,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rouge2_recall_stderr": 0.002315418839001681
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rouge2_fmeasure": 0.04511500976844758,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rouge2_fmeasure_stderr": 0.001986723782931247
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rougeL_precision": 0.1666654493818604,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rougeL_precision_stderr": 0.002931741970841584
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rougeL_recall": 0.19015440175110393,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rougeL_recall_stderr": 0.0030722371403492073
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rougeL_fmeasure": 0.16528451206309597,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rougeL_fmeasure_stderr": 0.002448179539908828
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rougeLsum_precision": 0.1684803863889924,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rougeLsum_precision_stderr": 0.0029515578981164007
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rougeLsum_recall": 0.19342460055959376,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rougeLsum_recall_stderr": 0.0032586706636470127
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rougeLsum_fmeasure": 0.16744440121325288,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rougeLsum_fmeasure_stderr": 0.0024991880630578747
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "bleu": 2.135517530489009,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "bleu_stderr": 0.09395977154683074
+    }
+  ],
+  "config": {
+    "model": "hf-causal",
+    "model_args": "pretrained=/pfs/lustrep2/scratch/project_462000241/muennighoff/checkpoints_2b855boscartasky/transformers,use_accelerate=True,tokenizer=/pfs/lustrep2/scratch/project_462000241/muennighoff/gpt2,dtype=bfloat16",
+    "task_args": "",
+    "num_fewshot": 2,
+    "batch_size": 4,
+    "device": "cuda",
+    "use_cache": false,
+    "limit": 3000,
+    "bootstrap_iters": 10,
+    "seed": 1234
+  }
+}
\ No newline at end of file
diff --git a/evaluation/generation/slim.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_3.json b/evaluation/generation/slim.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_3.json
new file mode 100644
index 0000000000000000000000000000000000000000..f82ebb590125edc0cc7a1ab75f12a91f5b7bcb8e
--- /dev/null
+++ b/evaluation/generation/slim.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_3.json
@@ -0,0 +1,133 @@
+{
+  "results": [
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rouge1_precision": 0.21755397454942801,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rouge1_precision_stderr": 0.0040197643665198
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rouge1_recall": 0.24395196225361002,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rouge1_recall_stderr": 0.0041819563633223285
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rouge1_fmeasure": 0.21441285271309635,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rouge1_fmeasure_stderr": 0.0034688162829816867
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rouge2_precision": 0.04608502015312911,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rouge2_precision_stderr": 0.0021286532637906437
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rouge2_recall": 0.05204692726094356,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rouge2_recall_stderr": 0.002308596850566173
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rouge2_fmeasure": 0.0455324464868029,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rouge2_fmeasure_stderr": 0.0019857349620838816
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rougeL_precision": 0.16247030281934602,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rougeL_precision_stderr": 0.0031264236020145395
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rougeL_recall": 0.18178458588619653,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rougeL_recall_stderr": 0.0031938440097234347
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rougeL_fmeasure": 0.159773564475991,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rougeL_fmeasure_stderr": 0.002683043459624172
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rougeLsum_precision": 0.16495789870371907,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rougeLsum_precision_stderr": 0.0031401120246905712
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rougeLsum_recall": 0.18683194228664585,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rougeLsum_recall_stderr": 0.0034187194500331993
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rougeLsum_fmeasure": 0.16297000957614646,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rougeLsum_fmeasure_stderr": 0.002742910817454915
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "bleu": 2.306953746485347,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "bleu_stderr": 0.08640815655966855
+    }
+  ],
+  "config": {
+    "model": "hf-causal",
+    "model_args": "pretrained=/pfs/lustrep2/scratch/project_462000241/muennighoff/checkpoints_2b855boscartasky/transformers,use_accelerate=True,tokenizer=/pfs/lustrep2/scratch/project_462000241/muennighoff/gpt2,dtype=bfloat16",
+    "task_args": "",
+    "num_fewshot": 3,
+    "batch_size": 4,
+    "device": "cuda",
+    "use_cache": false,
+    "limit": 3000,
+    "bootstrap_iters": 10,
+    "seed": 1234
+  }
+}
\ No newline at end of file
diff --git a/evaluation/generation/slim.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_4.json b/evaluation/generation/slim.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_4.json
new file mode 100644
index 0000000000000000000000000000000000000000..6a0d18a5608752f9037eeebf2b2492affbad2c6a
--- /dev/null
+++ b/evaluation/generation/slim.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_4.json
@@ -0,0 +1,133 @@
+{
+  "results": [
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rouge1_precision": 0.05735338761210453,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rouge1_precision_stderr": 0.0035071456355189675
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rouge1_recall": 0.059021496533637884,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rouge1_recall_stderr": 0.0036712685384006774
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rouge1_fmeasure": 0.05269608047169612,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rouge1_fmeasure_stderr": 0.003098859012482462
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rouge2_precision": 0.010781643643676288,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rouge2_precision_stderr": 0.0011947055672951296
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rouge2_recall": 0.01223156205613527,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rouge2_recall_stderr": 0.0013826375879066437
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rouge2_fmeasure": 0.010458434372266581,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rouge2_fmeasure_stderr": 0.0011178203055454239
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rougeL_precision": 0.04368080562194709,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rougeL_precision_stderr": 0.002762386413099355
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rougeL_recall": 0.04437717860143795,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rougeL_recall_stderr": 0.0028397295075813945
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rougeL_fmeasure": 0.03966278292494188,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rougeL_fmeasure_stderr": 0.0023832058820488743
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rougeLsum_precision": 0.04449065152016272,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rougeLsum_precision_stderr": 0.002797505308089145
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rougeLsum_recall": 0.04583958323736499,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rougeLsum_recall_stderr": 0.0029805207884232624
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rougeLsum_fmeasure": 0.04062836833112145,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rougeLsum_fmeasure_stderr": 0.0024442501537786706
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "bleu": 0.21661324901347828,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "bleu_stderr": 0.07403289458610106
+    }
+  ],
+  "config": {
+    "model": "hf-causal",
+    "model_args": "pretrained=/pfs/lustrep2/scratch/project_462000241/muennighoff/checkpoints_2b855boscartasky/transformers,use_accelerate=True,tokenizer=/pfs/lustrep2/scratch/project_462000241/muennighoff/gpt2,dtype=bfloat16",
+    "task_args": "",
+    "num_fewshot": 4,
+    "batch_size": 4,
+    "device": "cuda",
+    "use_cache": false,
+    "limit": 3000,
+    "bootstrap_iters": 10,
+    "seed": 1234
+  }
+}
\ No newline at end of file
diff --git a/evaluation/generation/slim.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_5.json b/evaluation/generation/slim.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_5.json
new file mode 100644
index 0000000000000000000000000000000000000000..c9539330998c2c3b9435add83fb2063c78aff9d5
--- /dev/null
+++ b/evaluation/generation/slim.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_5.json
@@ -0,0 +1,133 @@
+{
+  "results": [
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rouge1_precision": 0.003544629362742575,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rouge1_precision_stderr": 0.0009485977995739016
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rouge1_recall": 0.0037550498645595996,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rouge1_recall_stderr": 0.0011352337360716696
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rouge1_fmeasure": 0.0028901136641309804,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rouge1_fmeasure_stderr": 0.0007628453825537149
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rouge2_precision": 0.0006426177000561377,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rouge2_precision_stderr": 0.00026640442181651204
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rouge2_recall": 0.0013262549819153596,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rouge2_recall_stderr": 0.0006559432301667889
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rouge2_fmeasure": 0.0007826161154573944,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rouge2_fmeasure_stderr": 0.0003313480806525455
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rougeL_precision": 0.002882853322968734,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rougeL_precision_stderr": 0.0007611952222133634
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rougeL_recall": 0.003139944055164728,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rougeL_recall_stderr": 0.0009749927515076235
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rougeL_fmeasure": 0.002389689764554573,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rougeL_fmeasure_stderr": 0.0006340237380334276
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rougeLsum_precision": 0.002902333778437242,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rougeLsum_precision_stderr": 0.0007672460026879215
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rougeLsum_recall": 0.0031769694879492582,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rougeLsum_recall_stderr": 0.0010086213776490023
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rougeLsum_fmeasure": 0.00241629675514098,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rougeLsum_fmeasure_stderr": 0.0006494890213851666
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "bleu": 1.5974310709826073e-21,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "bleu_stderr": 1.3100992922519676e-15
+    }
+  ],
+  "config": {
+    "model": "hf-causal",
+    "model_args": "pretrained=/pfs/lustrep2/scratch/project_462000241/muennighoff/checkpoints_2b855boscartasky/transformers,use_accelerate=True,tokenizer=/pfs/lustrep2/scratch/project_462000241/muennighoff/gpt2,dtype=bfloat16",
+    "task_args": "",
+    "num_fewshot": 5,
+    "batch_size": 4,
+    "device": "cuda",
+    "use_cache": false,
+    "limit": 3000,
+    "bootstrap_iters": 10,
+    "seed": 1234
+  }
+}
\ No newline at end of file
diff --git a/evaluation/rankeval/checkpoints_2b855boscartasky_0.csv b/evaluation/rankeval/checkpoints_2b855boscartasky_0.csv
new file mode 100644
index 0000000000000000000000000000000000000000..40eeffb69152a5662d6fa8201f6078c63efd9fa9
--- /dev/null
+++ b/evaluation/rankeval/checkpoints_2b855boscartasky_0.csv
@@ -0,0 +1,21 @@
+task,metric,value,err,version
+anli_r1,acc,0.335,0.01493311749093257,0
+anli_r2,acc,0.336,0.014944140233795023,0
+anli_r3,acc,0.3575,0.013840921245257796,0
+arc_challenge,acc,0.23378839590443687,0.012368225378507142,0
+arc_challenge,acc_norm,0.2525597269624573,0.012696728980207704,0
+arc_easy,acc,0.5353535353535354,0.010234104543411433,0
+arc_easy,acc_norm,0.48484848484848486,0.010255071794531502,0
+boolq,acc,0.6039755351681957,0.008553881336813417,1
+cb,acc,0.39285714285714285,0.0658538889806635,1
+cb,f1,0.2829428067523306,,1
+copa,acc,0.67,0.04725815626252607,0
+hellaswag,acc,0.35012945628360886,0.004760354191370866,0
+hellaswag,acc_norm,0.41983668591913964,0.004925233680511587,0
+piqa,acc,0.6871599564744287,0.010817714425701107,0
+piqa,acc_norm,0.6920565832426551,0.010770892367463671,0
+rte,acc,0.44765342960288806,0.02993107036293953,0
+sciq,acc,0.805,0.012535235623319327,0
+sciq,acc_norm,0.722,0.014174516461485254,0
+storycloze_2016,acc,0.6312132549438803,0.011157191913955236,0
+winogrande,acc,0.5019731649565904,0.014052376259225632,0
diff --git a/evaluation/rankeval/checkpoints_2b855boscartasky_0.json b/evaluation/rankeval/checkpoints_2b855boscartasky_0.json
new file mode 100644
index 0000000000000000000000000000000000000000..b47b7895b76906580cad7f0fd71b4f44191b3b04
--- /dev/null
+++ b/evaluation/rankeval/checkpoints_2b855boscartasky_0.json
@@ -0,0 +1,87 @@
+{
+    "results": {
+        "anli_r1": {
+            "acc": 0.335,
+            "acc_stderr": 0.01493311749093257
+        },
+        "anli_r2": {
+            "acc": 0.336,
+            "acc_stderr": 0.014944140233795023
+        },
+        "anli_r3": {
+            "acc": 0.3575,
+            "acc_stderr": 0.013840921245257796
+        },
+        "cb": {
+            "acc": 0.39285714285714285,
+            "acc_stderr": 0.0658538889806635,
+            "f1": 0.2829428067523306
+        },
+        "copa": {
+            "acc": 0.67,
+            "acc_stderr": 0.04725815626252607
+        },
+        "hellaswag": {
+            "acc": 0.35012945628360886,
+            "acc_stderr": 0.004760354191370866,
+            "acc_norm": 0.41983668591913964,
+            "acc_norm_stderr": 0.004925233680511587
+        },
+        "rte": {
+            "acc": 0.44765342960288806,
+            "acc_stderr": 0.02993107036293953
+        },
+        "winogrande": {
+            "acc": 0.5019731649565904,
+            "acc_stderr": 0.014052376259225632
+        },
+        "storycloze_2016": {
+            "acc": 0.6312132549438803,
+            "acc_stderr": 0.011157191913955236
+        },
+        "boolq": {
+            "acc": 0.6039755351681957,
+            "acc_stderr": 0.008553881336813417
+        },
+        "arc_easy": {
+            "acc": 0.5353535353535354,
+            "acc_stderr": 0.010234104543411433,
+            "acc_norm": 0.48484848484848486,
+            "acc_norm_stderr": 0.010255071794531502
+        },
+        "arc_challenge": {
+            "acc": 0.23378839590443687,
+            "acc_stderr": 0.012368225378507142,
+            "acc_norm": 0.2525597269624573,
+            "acc_norm_stderr": 0.012696728980207704
+        },
+        "sciq": {
+            "acc": 0.805,
+            "acc_stderr": 0.012535235623319327,
+            "acc_norm": 0.722,
+            "acc_norm_stderr": 0.014174516461485254
+        },
+        "piqa": {
+            "acc": 0.6871599564744287,
+            "acc_stderr": 0.010817714425701107,
+            "acc_norm": 0.6920565832426551,
+            "acc_norm_stderr": 0.010770892367463671
+        }
+    },
+    "versions": {
+        "anli_r1": 0,
+        "anli_r2": 0,
+        "anli_r3": 0,
+        "cb": 1,
+        "copa": 0,
+        "hellaswag": 0,
+        "rte": 0,
+        "winogrande": 0,
+        "storycloze_2016": 0,
+        "boolq": 1,
+        "arc_easy": 0,
+        "arc_challenge": 0,
+        "sciq": 0,
+        "piqa": 0
+    }
+}
\ No newline at end of file
diff --git a/evaluation/rankeval/checkpoints_2b855boscartasky_1.csv b/evaluation/rankeval/checkpoints_2b855boscartasky_1.csv
new file mode 100644
index 0000000000000000000000000000000000000000..3ed25f23cf18368759ce53a0fc8891c7b1d098cf
--- /dev/null
+++ b/evaluation/rankeval/checkpoints_2b855boscartasky_1.csv
@@ -0,0 +1,21 @@
+task,metric,value,err,version
+anli_r1,acc,0.347,0.015060472031706624,0
+anli_r2,acc,0.325,0.014818724459095527,0
+anli_r3,acc,0.36083333333333334,0.013869180252444864,0
+arc_challenge,acc,0.257679180887372,0.012780770562768405,0
+arc_challenge,acc_norm,0.28071672354948807,0.013131238126975574,0
+arc_easy,acc,0.5542929292929293,0.01019911818332299,0
+arc_easy,acc_norm,0.5223063973063973,0.010249568404555652,0
+boolq,acc,0.5929663608562691,0.008592562887068871,1
+cb,acc,0.4642857142857143,0.06724777654937658,1
+cb,f1,0.2842465753424657,,1
+copa,acc,0.62,0.04878317312145632,0
+hellaswag,acc,0.3514240191196973,0.0047643939851110305,0
+hellaswag,acc_norm,0.42859988050189207,0.004938643787869535,0
+piqa,acc,0.6871599564744287,0.010817714425701104,0
+piqa,acc_norm,0.6828073993471164,0.01085815545438087,0
+rte,acc,0.4548736462093863,0.029973636495415252,0
+sciq,acc,0.873,0.010534798620855743,0
+sciq,acc_norm,0.854,0.011171786285496497,0
+storycloze_2016,acc,0.6264029930518439,0.011186849693644694,0
+winogrande,acc,0.5122336227308603,0.01404827882040562,0
diff --git a/evaluation/rankeval/checkpoints_2b855boscartasky_1.json b/evaluation/rankeval/checkpoints_2b855boscartasky_1.json
new file mode 100644
index 0000000000000000000000000000000000000000..c1a210921c98e3d62bd6c75bfed0c47aec42e6de
--- /dev/null
+++ b/evaluation/rankeval/checkpoints_2b855boscartasky_1.json
@@ -0,0 +1,87 @@
+{
+    "results": {
+        "anli_r1": {
+            "acc": 0.347,
+            "acc_stderr": 0.015060472031706624
+        },
+        "anli_r2": {
+            "acc": 0.325,
+            "acc_stderr": 0.014818724459095527
+        },
+        "anli_r3": {
+            "acc": 0.36083333333333334,
+            "acc_stderr": 0.013869180252444864
+        },
+        "cb": {
+            "acc": 0.4642857142857143,
+            "acc_stderr": 0.06724777654937658,
+            "f1": 0.2842465753424657
+        },
+        "copa": {
+            "acc": 0.62,
+            "acc_stderr": 0.04878317312145632
+        },
+        "hellaswag": {
+            "acc": 0.3514240191196973,
+            "acc_stderr": 0.0047643939851110305,
+            "acc_norm": 0.42859988050189207,
+            "acc_norm_stderr": 0.004938643787869535
+        },
+        "rte": {
+            "acc": 0.4548736462093863,
+            "acc_stderr": 0.029973636495415252
+        },
+        "winogrande": {
+            "acc": 0.5122336227308603,
+            "acc_stderr": 0.01404827882040562
+        },
+        "storycloze_2016": {
+            "acc": 0.6264029930518439,
+            "acc_stderr": 0.011186849693644694
+        },
+        "boolq": {
+            "acc": 0.5929663608562691,
+            "acc_stderr": 0.008592562887068871
+        },
+        "arc_easy": {
+            "acc": 0.5542929292929293,
+            "acc_stderr": 0.01019911818332299,
+            "acc_norm": 0.5223063973063973,
+            "acc_norm_stderr": 0.010249568404555652
+        },
+        "arc_challenge": {
+            "acc": 0.257679180887372,
+            "acc_stderr": 0.012780770562768405,
+            "acc_norm": 0.28071672354948807,
+            "acc_norm_stderr": 0.013131238126975574
+        },
+        "sciq": {
+            "acc": 0.873,
+            "acc_stderr": 0.010534798620855743,
+            "acc_norm": 0.854,
+            "acc_norm_stderr": 0.011171786285496497
+        },
+        "piqa": {
+            "acc": 0.6871599564744287,
+            "acc_stderr": 0.010817714425701104,
+            "acc_norm": 0.6828073993471164,
+            "acc_norm_stderr": 0.01085815545438087
+        }
+    },
+    "versions": {
+        "anli_r1": 0,
+        "anli_r2": 0,
+        "anli_r3": 0,
+        "cb": 1,
+        "copa": 0,
+        "hellaswag": 0,
+        "rte": 0,
+        "winogrande": 0,
+        "storycloze_2016": 0,
+        "boolq": 1,
+        "arc_easy": 0,
+        "arc_challenge": 0,
+        "sciq": 0,
+        "piqa": 0
+    }
+}
\ No newline at end of file
diff --git a/evaluation/rankeval/checkpoints_2b855boscartasky_2.csv b/evaluation/rankeval/checkpoints_2b855boscartasky_2.csv
new file mode 100644
index 0000000000000000000000000000000000000000..67be0eac6a99dfb2c7ba7b865a5e91df29765951
--- /dev/null
+++ b/evaluation/rankeval/checkpoints_2b855boscartasky_2.csv
@@ -0,0 +1,21 @@
+task,metric,value,err,version
+anli_r1,acc,0.327,0.014842213153411245,0
+anli_r2,acc,0.338,0.01496596071022449,0
+anli_r3,acc,0.3358333333333333,0.01363926119093289,0
+arc_challenge,acc,0.25,0.012653835621466646,0
+arc_challenge,acc_norm,0.28071672354948807,0.013131238126975576,0
+arc_easy,acc,0.5547138047138047,0.010198171137873871,0
+arc_easy,acc_norm,0.5568181818181818,0.010193324837773498,0
+boolq,acc,0.590519877675841,0.008600549751320923,1
+cb,acc,0.48214285714285715,0.0673769750864465,1
+cb,f1,0.28584474885844746,,1
+copa,acc,0.66,0.04760952285695237,0
+hellaswag,acc,0.35152360087631945,0.004764703145680279,0
+hellaswag,acc_norm,0.42620991834295957,0.004935143791573811,0
+piqa,acc,0.690424374319913,0.010786656752183345,0
+piqa,acc_norm,0.6849836779107725,0.010838072746240653,0
+rte,acc,0.47653429602888087,0.030063300411902652,0
+sciq,acc,0.88,0.010281328012747398,0
+sciq,acc_norm,0.873,0.010534798620855759,0
+storycloze_2016,acc,0.6264029930518439,0.011186849693644694,0
+winogrande,acc,0.5138121546961326,0.014047122916440419,0
diff --git a/evaluation/rankeval/checkpoints_2b855boscartasky_2.json b/evaluation/rankeval/checkpoints_2b855boscartasky_2.json
new file mode 100644
index 0000000000000000000000000000000000000000..467a181cfa26bc618cde2a7242a55f0e7b9a2331
--- /dev/null
+++ b/evaluation/rankeval/checkpoints_2b855boscartasky_2.json
@@ -0,0 +1,87 @@
+{
+    "results": {
+        "anli_r1": {
+            "acc": 0.327,
+            "acc_stderr": 0.014842213153411245
+        },
+        "anli_r2": {
+            "acc": 0.338,
+            "acc_stderr": 0.01496596071022449
+        },
+        "anli_r3": {
+            "acc": 0.3358333333333333,
+            "acc_stderr": 0.01363926119093289
+        },
+        "cb": {
+            "acc": 0.48214285714285715,
+            "acc_stderr": 0.0673769750864465,
+            "f1": 0.28584474885844746
+        },
+        "copa": {
+            "acc": 0.66,
+            "acc_stderr": 0.04760952285695237
+        },
+        "hellaswag": {
+            "acc": 0.35152360087631945,
+            "acc_stderr": 0.004764703145680279,
+            "acc_norm": 0.42620991834295957,
+            "acc_norm_stderr": 0.004935143791573811
+        },
+        "rte": {
+            "acc": 0.47653429602888087,
+            "acc_stderr": 0.030063300411902652
+        },
+        "winogrande": {
+            "acc": 0.5138121546961326,
+            "acc_stderr": 0.014047122916440419
+        },
+        "storycloze_2016": {
+            "acc": 0.6264029930518439,
+            "acc_stderr": 0.011186849693644694
+        },
+        "boolq": {
+            "acc": 0.590519877675841,
+            "acc_stderr": 0.008600549751320923
+        },
+        "arc_easy": {
+            "acc": 0.5547138047138047,
+            "acc_stderr": 0.010198171137873871,
+            "acc_norm": 0.5568181818181818,
+            "acc_norm_stderr": 0.010193324837773498
+        },
+        "arc_challenge": {
+            "acc": 0.25,
+            "acc_stderr": 0.012653835621466646,
+            "acc_norm": 0.28071672354948807,
+            "acc_norm_stderr": 0.013131238126975576
+        },
+        "sciq": {
+            "acc": 0.88,
+            "acc_stderr": 0.010281328012747398,
+            "acc_norm": 0.873,
+            "acc_norm_stderr": 0.010534798620855759
+        },
+        "piqa": {
+            "acc": 0.690424374319913,
+            "acc_stderr": 0.010786656752183345,
+            "acc_norm": 0.6849836779107725,
+            "acc_norm_stderr": 0.010838072746240653
+        }
+    },
+    "versions": {
+        "anli_r1": 0,
+        "anli_r2": 0,
+        "anli_r3": 0,
+        "cb": 1,
+        "copa": 0,
+        "hellaswag": 0,
+        "rte": 0,
+        "winogrande": 0,
+        "storycloze_2016": 0,
+        "boolq": 1,
+        "arc_easy": 0,
+        "arc_challenge": 0,
+        "sciq": 0,
+        "piqa": 0
+    }
+}
\ No newline at end of file
diff --git a/evaluation/rankeval/checkpoints_2b855boscartasky_3.csv b/evaluation/rankeval/checkpoints_2b855boscartasky_3.csv
new file mode 100644
index 0000000000000000000000000000000000000000..090da2ebd2424447fb4d12ae6338c960d2d2343e
--- /dev/null
+++ b/evaluation/rankeval/checkpoints_2b855boscartasky_3.csv
@@ -0,0 +1,21 @@
+task,metric,value,err,version
+anli_r1,acc,0.318,0.014734079309311901,0
+anli_r2,acc,0.365,0.015231776226264902,0
+anli_r3,acc,0.3491666666666667,0.013767075395077252,0
+arc_challenge,acc,0.25341296928327645,0.012710896778378606,0
+arc_challenge,acc_norm,0.2815699658703072,0.013143376735009026,0
+arc_easy,acc,0.5618686868686869,0.010180937100600071,0
+arc_easy,acc_norm,0.5568181818181818,0.010193324837773495,0
+boolq,acc,0.5801223241590214,0.00863204550478174,1
+cb,acc,0.44642857142857145,0.06703189227942398,1
+cb,f1,0.2316017316017316,,1
+copa,acc,0.69,0.04648231987117316,0
+hellaswag,acc,0.3505277833100976,0.00476160130325889,0
+hellaswag,acc_norm,0.4260107548297152,0.004934846809827193,0
+piqa,acc,0.6931447225244831,0.010760295070580368,0
+piqa,acc_norm,0.6877040261153428,0.010812581599154424,0
+rte,acc,0.4620938628158845,0.030009848912529113,0
+sciq,acc,0.882,0.010206869264381795,0
+sciq,acc_norm,0.875,0.010463483381956722,0
+storycloze_2016,acc,0.6226616782469268,0.011209099452196189,0
+winogrande,acc,0.4964483030781373,0.014052131146915864,0
diff --git a/evaluation/rankeval/checkpoints_2b855boscartasky_3.json b/evaluation/rankeval/checkpoints_2b855boscartasky_3.json
new file mode 100644
index 0000000000000000000000000000000000000000..5c66c83c3f51d8b06460c86c21fb35de825e0fbf
--- /dev/null
+++ b/evaluation/rankeval/checkpoints_2b855boscartasky_3.json
@@ -0,0 +1,87 @@
+{
+    "results": {
+        "anli_r1": {
+            "acc": 0.318,
+            "acc_stderr": 0.014734079309311901
+        },
+        "anli_r2": {
+            "acc": 0.365,
+            "acc_stderr": 0.015231776226264902
+        },
+        "anli_r3": {
+            "acc": 0.3491666666666667,
+            "acc_stderr": 0.013767075395077252
+        },
+        "cb": {
+            "acc": 0.44642857142857145,
+            "acc_stderr": 0.06703189227942398,
+            "f1": 0.2316017316017316
+        },
+        "copa": {
+            "acc": 0.69,
+            "acc_stderr": 0.04648231987117316
+        },
+        "hellaswag": {
+            "acc": 0.3505277833100976,
+            "acc_stderr": 0.00476160130325889,
+            "acc_norm": 0.4260107548297152,
+            "acc_norm_stderr": 0.004934846809827193
+        },
+        "rte": {
+            "acc": 0.4620938628158845,
+            "acc_stderr": 0.030009848912529113
+        },
+        "winogrande": {
+            "acc": 0.4964483030781373,
+            "acc_stderr": 0.014052131146915864
+        },
+        "storycloze_2016": {
+            "acc": 0.6226616782469268,
+            "acc_stderr": 0.011209099452196189
+        },
+        "boolq": {
+            "acc": 0.5801223241590214,
+            "acc_stderr": 0.00863204550478174
+        },
+        "arc_easy": {
+            "acc": 0.5618686868686869,
+            "acc_stderr": 0.010180937100600071,
+            "acc_norm": 0.5568181818181818,
+            "acc_norm_stderr": 0.010193324837773495
+        },
+        "arc_challenge": {
+            "acc": 0.25341296928327645,
+            "acc_stderr": 0.012710896778378606,
+            "acc_norm": 0.2815699658703072,
+            "acc_norm_stderr": 0.013143376735009026
+        },
+        "sciq": {
+            "acc": 0.882,
+            "acc_stderr": 0.010206869264381795,
+            "acc_norm": 0.875,
+            "acc_norm_stderr": 0.010463483381956722
+        },
+        "piqa": {
+            "acc": 0.6931447225244831,
+            "acc_stderr": 0.010760295070580368,
+            "acc_norm": 0.6877040261153428,
+            "acc_norm_stderr": 0.010812581599154424
+        }
+    },
+    "versions": {
+        "anli_r1": 0,
+        "anli_r2": 0,
+        "anli_r3": 0,
+        "cb": 1,
+        "copa": 0,
+        "hellaswag": 0,
+        "rte": 0,
+        "winogrande": 0,
+        "storycloze_2016": 0,
+        "boolq": 1,
+        "arc_easy": 0,
+        "arc_challenge": 0,
+        "sciq": 0,
+        "piqa": 0
+    }
+}
\ No newline at end of file
diff --git a/evaluation/rankeval/checkpoints_2b855boscartasky_4.csv b/evaluation/rankeval/checkpoints_2b855boscartasky_4.csv
new file mode 100644
index 0000000000000000000000000000000000000000..bee31270e66bbbf6f45acb6e0bed5e37cc5dec96
--- /dev/null
+++ b/evaluation/rankeval/checkpoints_2b855boscartasky_4.csv
@@ -0,0 +1,21 @@
+task,metric,value,err,version
+anli_r1,acc,0.341,0.014998131348402709,0
+anli_r2,acc,0.348,0.01507060460376841,0
+anli_r3,acc,0.3541666666666667,0.013811933499570958,0
+arc_challenge,acc,0.24232081911262798,0.012521593295800116,0
+arc_challenge,acc_norm,0.27559726962457337,0.013057169655761838,0
+arc_easy,acc,0.5648148148148148,0.01017321643037092,0
+arc_easy,acc_norm,0.5648148148148148,0.010173216430370913,0
+boolq,acc,0.5899082568807339,0.008602512053254421,1
+cb,acc,0.42857142857142855,0.06672848092813058,1
+cb,f1,0.19999999999999998,,1
+copa,acc,0.67,0.04725815626252607,0
+hellaswag,acc,0.3505277833100976,0.004761601303258889,0
+hellaswag,acc_norm,0.4282015534754033,0.0049380686273494834,0
+piqa,acc,0.6855277475516867,0.010833009065106572,0
+piqa,acc_norm,0.6784548422198041,0.010897500107575652,0
+rte,acc,0.4584837545126354,0.029992535385373314,0
+sciq,acc,0.891,0.00985982840703719,0
+sciq,acc_norm,0.884,0.010131468138756995,0
+storycloze_2016,acc,0.6231961517904864,0.011205964516279667,0
+winogrande,acc,0.5090765588003157,0.014050170094497704,0
diff --git a/evaluation/rankeval/checkpoints_2b855boscartasky_4.json b/evaluation/rankeval/checkpoints_2b855boscartasky_4.json
new file mode 100644
index 0000000000000000000000000000000000000000..c7109ff09ed8334f11900f87f6b6a65a3102f3a7
--- /dev/null
+++ b/evaluation/rankeval/checkpoints_2b855boscartasky_4.json
@@ -0,0 +1,87 @@
+{
+    "results": {
+        "anli_r1": {
+            "acc": 0.341,
+            "acc_stderr": 0.014998131348402709
+        },
+        "anli_r2": {
+            "acc": 0.348,
+            "acc_stderr": 0.01507060460376841
+        },
+        "anli_r3": {
+            "acc": 0.3541666666666667,
+            "acc_stderr": 0.013811933499570958
+        },
+        "cb": {
+            "acc": 0.42857142857142855,
+            "acc_stderr": 0.06672848092813058,
+            "f1": 0.19999999999999998
+        },
+        "copa": {
+            "acc": 0.67,
+            "acc_stderr": 0.04725815626252607
+        },
+        "hellaswag": {
+            "acc": 0.3505277833100976,
+            "acc_stderr": 0.004761601303258889,
+            "acc_norm": 0.4282015534754033,
+            "acc_norm_stderr": 0.0049380686273494834
+        },
+        "rte": {
+            "acc": 0.4584837545126354,
+            "acc_stderr": 0.029992535385373314
+        },
+        "winogrande": {
+            "acc": 0.5090765588003157,
+            "acc_stderr": 0.014050170094497704
+        },
+        "storycloze_2016": {
+            "acc": 0.6231961517904864,
+            "acc_stderr": 0.011205964516279667
+        },
+        "boolq": {
+            "acc": 0.5899082568807339,
+            "acc_stderr": 0.008602512053254421
+        },
+        "arc_easy": {
+            "acc": 0.5648148148148148,
+            "acc_stderr": 0.01017321643037092,
+            "acc_norm": 0.5648148148148148,
+            "acc_norm_stderr": 0.010173216430370913
+        },
+        "arc_challenge": {
+            "acc": 0.24232081911262798,
+            "acc_stderr": 0.012521593295800116,
+            "acc_norm": 0.27559726962457337,
+            "acc_norm_stderr": 0.013057169655761838
+        },
+        "sciq": {
+            "acc": 0.891,
+            "acc_stderr": 0.00985982840703719,
+            "acc_norm": 0.884,
+            "acc_norm_stderr": 0.010131468138756995
+        },
+        "piqa": {
+            "acc": 0.6855277475516867,
+            "acc_stderr": 0.010833009065106572,
+            "acc_norm": 0.6784548422198041,
+            "acc_norm_stderr": 0.010897500107575652
+        }
+    },
+    "versions": {
+        "anli_r1": 0,
+        "anli_r2": 0,
+        "anli_r3": 0,
+        "cb": 1,
+        "copa": 0,
+        "hellaswag": 0,
+        "rte": 0,
+        "winogrande": 0,
+        "storycloze_2016": 0,
+        "boolq": 1,
+        "arc_easy": 0,
+        "arc_challenge": 0,
+        "sciq": 0,
+        "piqa": 0
+    }
+}
\ No newline at end of file
diff --git a/evaluation/rankeval/checkpoints_2b855boscartasky_5.csv b/evaluation/rankeval/checkpoints_2b855boscartasky_5.csv
new file mode 100644
index 0000000000000000000000000000000000000000..8f7b1f190f1ce29512c9ffbd8b9e4a25b2a8a8bc
--- /dev/null
+++ b/evaluation/rankeval/checkpoints_2b855boscartasky_5.csv
@@ -0,0 +1,21 @@
+task,metric,value,err,version
+anli_r1,acc,0.356,0.015149042659306623,0
+anli_r2,acc,0.355,0.015139491543780532,0
+anli_r3,acc,0.3433333333333333,0.01371263383046586,0
+arc_challenge,acc,0.257679180887372,0.012780770562768395,0
+arc_challenge,acc_norm,0.2781569965870307,0.013094469919538812,0
+arc_easy,acc,0.5618686868686869,0.010180937100600073,0
+arc_easy,acc_norm,0.5589225589225589,0.01018829322104055,0
+boolq,acc,0.5770642201834862,0.008640558744656428,1
+cb,acc,0.5535714285714286,0.06703189227942394,1
+cb,f1,0.3074074074074074,,1
+copa,acc,0.67,0.04725815626252607,0
+hellaswag,acc,0.3536148177653854,0.004771143074426132,0
+hellaswag,acc_norm,0.4255128460466043,0.00493410077448122,0
+piqa,acc,0.6838955386289445,0.010848148455700455,0
+piqa,acc_norm,0.6789989118607181,0.010892641574707904,0
+rte,acc,0.48014440433212996,0.0300727231673172,0
+sciq,acc,0.897,0.0096168333396958,0
+sciq,acc_norm,0.893,0.009779910359847169,0
+storycloze_2016,acc,0.6264029930518439,0.011186849693644694,0
+winogrande,acc,0.500394632991318,0.014052481306049516,0
diff --git a/evaluation/rankeval/checkpoints_2b855boscartasky_5.json b/evaluation/rankeval/checkpoints_2b855boscartasky_5.json
new file mode 100644
index 0000000000000000000000000000000000000000..82f5ed5fc0bcca786f95cdec00c4f924e2bea967
--- /dev/null
+++ b/evaluation/rankeval/checkpoints_2b855boscartasky_5.json
@@ -0,0 +1,87 @@
+{
+    "results": {
+        "anli_r1": {
+            "acc": 0.356,
+            "acc_stderr": 0.015149042659306623
+        },
+        "anli_r2": {
+            "acc": 0.355,
+            "acc_stderr": 0.015139491543780532
+        },
+        "anli_r3": {
+            "acc": 0.3433333333333333,
+            "acc_stderr": 0.01371263383046586
+        },
+        "cb": {
+            "acc": 0.5535714285714286,
+            "acc_stderr": 0.06703189227942394,
+            "f1": 0.3074074074074074
+        },
+        "copa": {
+            "acc": 0.67,
+            "acc_stderr": 0.04725815626252607
+        },
+        "hellaswag": {
+            "acc": 0.3536148177653854,
+            "acc_stderr": 0.004771143074426132,
+            "acc_norm": 0.4255128460466043,
+            "acc_norm_stderr": 0.00493410077448122
+        },
+        "rte": {
+            "acc": 0.48014440433212996,
+            "acc_stderr": 0.0300727231673172
+        },
+        "winogrande": {
+            "acc": 0.500394632991318,
+            "acc_stderr": 0.014052481306049516
+        },
+        "storycloze_2016": {
+            "acc": 0.6264029930518439,
+            "acc_stderr": 0.011186849693644694
+        },
+        "boolq": {
+            "acc": 0.5770642201834862,
+            "acc_stderr": 0.008640558744656428
+        },
+        "arc_easy": {
+            "acc": 0.5618686868686869,
+            "acc_stderr": 0.010180937100600073,
+            "acc_norm": 0.5589225589225589,
+            "acc_norm_stderr": 0.01018829322104055
+        },
+        "arc_challenge": {
+            "acc": 0.257679180887372,
+            "acc_stderr": 0.012780770562768395,
+            "acc_norm": 0.2781569965870307,
+            "acc_norm_stderr": 0.013094469919538812
+        },
+        "sciq": {
+            "acc": 0.897,
+            "acc_stderr": 0.0096168333396958,
+            "acc_norm": 0.893,
+            "acc_norm_stderr": 0.009779910359847169
+        },
+        "piqa": {
+            "acc": 0.6838955386289445,
+            "acc_stderr": 0.010848148455700455,
+            "acc_norm": 0.6789989118607181,
+            "acc_norm_stderr": 0.010892641574707904
+        }
+    },
+    "versions": {
+        "anli_r1": 0,
+        "anli_r2": 0,
+        "anli_r3": 0,
+        "cb": 1,
+        "copa": 0,
+        "hellaswag": 0,
+        "rte": 0,
+        "winogrande": 0,
+        "storycloze_2016": 0,
+        "boolq": 1,
+        "arc_easy": 0,
+        "arc_challenge": 0,
+        "sciq": 0,
+        "piqa": 0
+    }
+}
\ No newline at end of file