diff --git a/.gitattributes b/.gitattributes
index a6344aac8c09253b3b630fb776ae94478aa0275b..c3e1b6e7a83c5205b789014871d63622599c38ad 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -33,3 +33,27 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+evaluation/generation/examples.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_1.jsonl filter=lfs diff=lfs merge=lfs -text
+evaluation/generation/examples.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_3.jsonl filter=lfs diff=lfs merge=lfs -text
+evaluation/generation/examples.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_2.jsonl filter=lfs diff=lfs merge=lfs -text
+evaluation/generation/examples.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl filter=lfs diff=lfs merge=lfs -text
+evaluation/generation/examples.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_3.jsonl filter=lfs diff=lfs merge=lfs -text
+evaluation/generation/examples.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_4.jsonl filter=lfs diff=lfs merge=lfs -text
+evaluation/generation/examples.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_5.jsonl filter=lfs diff=lfs merge=lfs -text
+evaluation/generation/examples.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_0.jsonl filter=lfs diff=lfs merge=lfs -text
+evaluation/generation/examples.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_4.jsonl filter=lfs diff=lfs merge=lfs -text
+evaluation/generation/examples.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_5.jsonl filter=lfs diff=lfs merge=lfs -text
+evaluation/generation/examples.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_3.jsonl filter=lfs diff=lfs merge=lfs -text
+evaluation/generation/examples.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_5.jsonl filter=lfs diff=lfs merge=lfs -text
+evaluation/generation/examples.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_2.jsonl filter=lfs diff=lfs merge=lfs -text
+evaluation/generation/examples.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_3.jsonl filter=lfs diff=lfs merge=lfs -text
+evaluation/generation/examples.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_1.jsonl filter=lfs diff=lfs merge=lfs -text
+evaluation/generation/examples.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_0.jsonl filter=lfs diff=lfs merge=lfs -text
+evaluation/generation/examples.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_2.jsonl filter=lfs diff=lfs merge=lfs -text
+evaluation/generation/examples.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_5.jsonl filter=lfs diff=lfs merge=lfs -text
+evaluation/generation/examples.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_0.jsonl filter=lfs diff=lfs merge=lfs -text
+evaluation/generation/examples.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_1.jsonl filter=lfs diff=lfs merge=lfs -text
+evaluation/generation/examples.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_2.jsonl filter=lfs diff=lfs merge=lfs -text
+evaluation/generation/examples.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_0.jsonl filter=lfs diff=lfs merge=lfs -text
+evaluation/generation/examples.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_1.jsonl filter=lfs diff=lfs merge=lfs -text
+evaluation/generation/examples.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_4.jsonl filter=lfs diff=lfs merge=lfs -text diff --git a/evaluation/generation/agg.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_0.json b/evaluation/generation/agg.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_0.json new file mode 100644 index 0000000000000000000000000000000000000000..d3b49c58efc3db325047363ab02eb36b39a4df4d --- /dev/null +++ b/evaluation/generation/agg.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.25518818499117035, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.020198929979072126}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.06424858701254539, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002540662604602708}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.24194520608398293, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.005061888078063388}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.08713006240052369, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0021163575152811052}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.029250897601530847, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0016159150314360084}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.11213489324648325, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0031431788617556264}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.03968682582435309, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0012864152483850072}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.061593275111588554, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0024488839830169046}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.2334118422350547, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0048787714459462665}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.08351980735691193, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001978263321627184}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.06070440310915625, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0024423712896334553}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.22684155273254836, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0046468714919551185}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.08198324173226955, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0019772453103279655}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep2/scratch/project_462000241/muennighoff/checkpoints_2b855boscartasky/transformers,use_accelerate=True,tokenizer=/pfs/lustrep2/scratch/project_462000241/muennighoff/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/evaluation/generation/agg.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_1.json b/evaluation/generation/agg.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_1.json new file mode 100644 index 0000000000000000000000000000000000000000..279aa5562f6049ba0c21cb8bfe9a1efc83ac034c --- /dev/null +++ b/evaluation/generation/agg.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.37770333810014706, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.036043662376688344}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.08688444048786848, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0030316834829975468}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.25231836055300055, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004707834740663268}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.11029589045724046, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. 
To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0026302378640336893}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.039888024685296904, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0019445292240548378}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.12320682798737367, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0031690473734229127}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.050696350493553845, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001601172935799986}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.08067935525490526, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0028191568459744356}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.2400568623297162, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004443711432592927}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.10289898300711169, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002349648102435476}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.081852061098128, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0028519921028834602}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.24140934346009968, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0044596632225301245}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.10416303503161863, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0024045282990398016}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep2/scratch/project_462000241/muennighoff/checkpoints_2b855boscartasky/transformers,use_accelerate=True,tokenizer=/pfs/lustrep2/scratch/project_462000241/muennighoff/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/evaluation/generation/agg.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_2.json b/evaluation/generation/agg.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_2.json new file mode 100644 index 0000000000000000000000000000000000000000..050fb7686150a79b1f9d69f5eee7478a69a4fdca --- /dev/null +++ b/evaluation/generation/agg.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.3829018871876502, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.022682302500048476}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.09533308838612936, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0031918959249739934}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.275555629679834, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004727965781775394}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.12236842718754035, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. 
To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0030225942233901805}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.04438815526628443, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0018839731929603098}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.13438684673727228, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0032179061201762227}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.05746066096268111, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0018708723435892534}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.08478065899787014, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0026474348060842394}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.25951028904964774, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004388247219710807}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.11131192366136156, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0025600729892227623}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.0870012622715152, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002779405775352401}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.2612564683466686, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004413760869764198}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.11327636810594607, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00265570207245623}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep2/scratch/project_462000241/muennighoff/checkpoints_2b855boscartasky/transformers,use_accelerate=True,tokenizer=/pfs/lustrep2/scratch/project_462000241/muennighoff/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/evaluation/generation/agg.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_3.json b/evaluation/generation/agg.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_3.json new file mode 100644 index 0000000000000000000000000000000000000000..8cda3958caa93b6915a9aeb1a33f6f61ac9424d0 --- /dev/null +++ b/evaluation/generation/agg.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.49438795846003614, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.03220480694116766}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.11102978394749591, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0037258981840235512}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.29324047704391387, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004929158066657188}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.13661508516896614, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. 
To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0033865188061428593}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.05489349981744968, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002411444050666127}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.1460318403788494, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0034080826963901003}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.06675560716724324, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.002152759177258534}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.0983510025951489, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0031311578361653356}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.27396911503915883, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004527803475867941}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.12331965784710698, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0028547373168783547}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.10118232307247621, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0032732569349238874}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.2768278339741749, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004591647175413129}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.1258905544050262, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0029667713047718076}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep2/scratch/project_462000241/muennighoff/checkpoints_2b855boscartasky/transformers,use_accelerate=True,tokenizer=/pfs/lustrep2/scratch/project_462000241/muennighoff/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/evaluation/generation/agg.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_4.json b/evaluation/generation/agg.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_4.json new file mode 100644 index 0000000000000000000000000000000000000000..395de540bdc94cb4417405c9db77c1d3990e0b6a --- /dev/null +++ b/evaluation/generation/agg.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.5097531919668136, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.037846557427194026}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.12660445313737514, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004069129304355684}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.30584134713679634, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004938665654397721}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.15051290928420963, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. 
To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0035962245168251086}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.06380697763757867, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0025616190740023713}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.1531395402953802, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0033953087678648087}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.07435510796633933, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.002254816031736983}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.11139297723344309, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003440135146254942}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.28386785189457364, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.00447862149321582}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.13483336166859466, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0030388433299652576}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.11506907251192842, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0036096573854085593}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.287989283624223, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004567845010036214}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.13843888381409075, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0031866033556944046}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep2/scratch/project_462000241/muennighoff/checkpoints_2b855boscartasky/transformers,use_accelerate=True,tokenizer=/pfs/lustrep2/scratch/project_462000241/muennighoff/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/evaluation/generation/agg.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_5.json b/evaluation/generation/agg.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_5.json new file mode 100644 index 0000000000000000000000000000000000000000..93733e2f1fb3b926ecf26a0a90e3f0597d678067 --- /dev/null +++ b/evaluation/generation/agg.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.5493345438208933, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.03156507075706306}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.13145167799279456, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004208017486529403}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.31431356632933516, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004912309978055554}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.15523792842972664, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. 
To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.003705043227083823}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.06682039356587426, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0026734322118555824}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.1593483911606193, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.003488245506035694}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.07729991696510335, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.002362198278661277}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.11521584250075316, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0035483893462582828}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.2911824630349328, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.00449934920077821}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.13867709585369095, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0031268375342873036}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.11975844765120187, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0037510579951047463}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.29608094482109, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00457219328093606}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.1428199679819932, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0032823766236227112}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep2/scratch/project_462000241/muennighoff/checkpoints_2b855boscartasky/transformers,use_accelerate=True,tokenizer=/pfs/lustrep2/scratch/project_462000241/muennighoff/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/evaluation/generation/agg.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_0.json b/evaluation/generation/agg.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..1a130351ffbc82982aeb4d4e54b7093f7236b2a4 --- /dev/null +++ b/evaluation/generation/agg.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.15983607856511065, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0025552324547594915}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.22751168503589744, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0030756037547284122}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.17176517291666785, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002329017824094656}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.03630672989355793, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0010196024456578525}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.052475014894037275, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001428315830366114}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.03932636998031157, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| 
{{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0009987340137274396}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.12519971826411405, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0019474345957730163}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.1828331784278453, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002460695997235263}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.13522236510382263, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001715199895595129}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.1479787200014008, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0024084994322680275}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.21009537033174597, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0028681840700951855}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.15863624899200707, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0021698433809270567}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 2.2908341082810773, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.08591697554573632}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep2/scratch/project_462000241/muennighoff/checkpoints_2b855boscartasky/transformers,use_accelerate=True,tokenizer=/pfs/lustrep2/scratch/project_462000241/muennighoff/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git 
a/evaluation/generation/agg.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_1.json b/evaluation/generation/agg.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..a6cbf0b356075d211058d195b5d28aaffe350aa4 --- /dev/null +++ b/evaluation/generation/agg.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.2074831448217252, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003524413251335538}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.1603567541864711, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002358874310291858}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.1500600406332044, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0019241989868486221}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.048193206661683564, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0020379873823770994}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.030329812370434826, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0011532237790344163}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.02928962923956642, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.000993776980137765}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.16593034599476272, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.002952779433549126}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.1263395938894177, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", 
"dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0018438888158069336}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.11776814806946785, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0014763519012599205}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.19732631307428722, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0034023899501457837}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.15136030413475798, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0021926321158690864}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.14183665338988743, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001800224756397767}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 1.8024864455011091, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.09003925624860501}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep2/scratch/project_462000241/muennighoff/checkpoints_2b855boscartasky/transformers,use_accelerate=True,tokenizer=/pfs/lustrep2/scratch/project_462000241/muennighoff/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/evaluation/generation/agg.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_2.json b/evaluation/generation/agg.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..9830ec634a489eba63f9c7596121a1386ce1a738 --- /dev/null +++ b/evaluation/generation/agg.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.28596147697474456, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", 
"prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0040250344618331995}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.1993639467154353, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002664319477833243}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.1947292213306491, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0021918313174583916}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.08090209451722337, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0024314880977671276}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.05022891340202324, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0013490243270254702}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.049895782057071895, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0012048654152258728}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.22915557741483644, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0033926337363906784}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.15804196664911024, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002108622273540943}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.15366314427734978, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0016931718188903863}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 
0.2703085422322766, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003864247203591127}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.18781780308286916, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0025166845226557905}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.18325876600451932, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0020600409806209663}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 2.9479127015875357, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07796616677370145}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep2/scratch/project_462000241/muennighoff/checkpoints_2b855boscartasky/transformers,use_accelerate=True,tokenizer=/pfs/lustrep2/scratch/project_462000241/muennighoff/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/evaluation/generation/agg.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_3.json b/evaluation/generation/agg.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..9fe1bce464c037bee4997bf17fed431a98d8bd1b --- /dev/null +++ b/evaluation/generation/agg.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.26271995584299296, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004350571081047754}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.17195910512668672, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0028930003693202434}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.17047407122919256, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": 
"d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0024381191437714608}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.07716157594080365, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0025215008865493967}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.04592292328627891, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0014011960521198414}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.04580407896668463, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001240136840680315}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.2129040576929948, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0036832280907527527}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.13748507108736507, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0023122364311355848}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.1358429496475672, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0019120675392480628}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.24858343528470517, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0041894189677485535}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.1614787885865035, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", 
"rougeLsum_recall_stderr": 0.0027151405967413447}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.16017361729618249, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0022880671169300927}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 2.368296526557304, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06715896563436964}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep2/scratch/project_462000241/muennighoff/checkpoints_2b855boscartasky/transformers,use_accelerate=True,tokenizer=/pfs/lustrep2/scratch/project_462000241/muennighoff/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/evaluation/generation/agg.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_4.json b/evaluation/generation/agg.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..4cec243a53f0c20bce96dfb4352f9118005f1d21 --- /dev/null +++ b/evaluation/generation/agg.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.09226808587029187, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0035951645035662144}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.05378214914394185, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0021922619377159835}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.05527677687040673, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002075334915597304}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.028532578970617953, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0018587749480454441}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.01548143136364689, 
"fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0010135388288224974}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.015865483046525946, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0009374894411648104}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.07725545552877301, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0031119081702266804}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.04393223408477368, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0018056545763819988}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.04511311086771814, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0016959499204821453}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.08723167275286887, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003438050373646056}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.05010740359653005, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0020446842446827317}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.05162745766582887, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0019361443382161162}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 0.03702293319391888, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", 
"prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.008280750551799155}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep2/scratch/project_462000241/muennighoff/checkpoints_2b855boscartasky/transformers,use_accelerate=True,tokenizer=/pfs/lustrep2/scratch/project_462000241/muennighoff/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/evaluation/generation/agg.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_5.json b/evaluation/generation/agg.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..e9f1b692fd1a555bf68a2decfbc41020d1f254fa --- /dev/null +++ b/evaluation/generation/agg.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.01624748312843043, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0017343102214544073}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.008973352017838115, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0009874748509305067}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.008715725437140403, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0008716993431018379}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.005494197830878659, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.000991214742335091}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.002558561959956041, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0003855003265324329}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.002496646429114283, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 
0.00036357062963770885}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.014045751930703699, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0015619179482028879}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.0075965284642917335, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.000846021103064854}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.0072939865630402565, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0007275988588633965}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.015553958884576207, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001678423693533492}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.008515152310636652, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0009413374007224053}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.008294111648957665, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0008329997418165786}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 6.329549235648946e-15, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 6.98643056969485e-14}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep2/scratch/project_462000241/muennighoff/checkpoints_2b855boscartasky/transformers,use_accelerate=True,tokenizer=/pfs/lustrep2/scratch/project_462000241/muennighoff/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/evaluation/generation/agg.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_0.json 
b/evaluation/generation/agg.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_0.json new file mode 100644 index 0000000000000000000000000000000000000000..5dfd6795a4abb8b2bf99649305fe6a5e34de5e58 --- /dev/null +++ b/evaluation/generation/agg.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 3.4385508172058055, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.04264456511068813}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.3745168009518315, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0054770341977520625}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.29958589610084396, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0033801849814915222}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.2767986999806955, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0032477671776355997}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.20297808777795995, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.005522822982953256}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.10216853633525805, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0019971383737075166}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.097088577058388, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0017064511604233982}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.29931908976323446, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.005159547193434527}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.2284780514712017, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0023634097297777477}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.20603167711573117, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002072804178131474}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.34463645738569776, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.005423162557411763}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.2657387698192473, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003165191193253911}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.2467760500253612, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00298962192866607}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep2/scratch/project_462000241/muennighoff/checkpoints_2b855boscartasky/transformers,use_accelerate=True,tokenizer=/pfs/lustrep2/scratch/project_462000241/muennighoff/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/evaluation/generation/agg.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_1.json b/evaluation/generation/agg.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_1.json new file mode 100644 index 0000000000000000000000000000000000000000..1eca0681e4146a3c61cb871e464589fd0f8f7074 --- /dev/null +++ b/evaluation/generation/agg.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 11.578509877350227, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.10981566633879294}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.5290497807915124, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0034031031148912877}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.42449982582372703, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0029796948529652516}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.4430699483900025, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0024212750351764543}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.24509073269483966, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0026669698385618284}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.19440129139428716, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0021684401687693156}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.2028107162678877, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0020310061478079093}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.38881065801490833, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0029605217311991657}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.31090092383731366, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002452100105085375}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.324119071682265, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0020761088086072486}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.43465610390683307, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0032381620072339475}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.348028905687337, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00275713194366831}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.36322533452022043, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002355969259137101}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep2/scratch/project_462000241/muennighoff/checkpoints_2b855boscartasky/transformers,use_accelerate=True,tokenizer=/pfs/lustrep2/scratch/project_462000241/muennighoff/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/evaluation/generation/agg.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_2.json b/evaluation/generation/agg.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_2.json new file mode 100644 index 0000000000000000000000000000000000000000..b2c79423124483e69790ccefb8cbc92b99eae611 --- /dev/null +++ b/evaluation/generation/agg.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 13.973742807059493, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.21933451006645593}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.5463581275257614, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0033109746053900106}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.4599780026221357, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002866451132503217}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.4734319432946387, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002277730687363045}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.26984908227570326, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0026581113600308783}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.22503111854291266, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0022246934402712576}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.23112076704320592, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.002029026468778622}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.40847493701179866, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0029328594326759366}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.3432828614570734, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0024768383332246274}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.35285117962076507, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002056984438278394}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.4597128692889795, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003192730444475765}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.38674562932968487, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002735034124705561}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.3979842693641427, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002298871731116783}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep2/scratch/project_462000241/muennighoff/checkpoints_2b855boscartasky/transformers,use_accelerate=True,tokenizer=/pfs/lustrep2/scratch/project_462000241/muennighoff/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/evaluation/generation/agg.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_3.json b/evaluation/generation/agg.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_3.json new file mode 100644 index 0000000000000000000000000000000000000000..7c3a6dc82fcb764283aa7e2f2c3b44f051f35945 --- /dev/null +++ b/evaluation/generation/agg.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 14.533287312181702, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.18656720619703301}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.5451265264934796, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003147288201601649}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.4635452060391057, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002820441604942112}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.478142190779417, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002235263328285793}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.2714328734244194, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002583914662030242}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.23027470724704344, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0022949497645592134}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.23634052735419925, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0020601993739155743}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.40976835157500135, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.002821032707227765}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.3479477017249916, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002470400553707357}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.3585015271693414, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002048646341232902}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.46121561084482127, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003066679903899239}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.3924920651465138, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0027314711241234945}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.404637798062588, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002300343226701871}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep2/scratch/project_462000241/muennighoff/checkpoints_2b855boscartasky/transformers,use_accelerate=True,tokenizer=/pfs/lustrep2/scratch/project_462000241/muennighoff/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/evaluation/generation/agg.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_4.json b/evaluation/generation/agg.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_4.json new file mode 100644 index 0000000000000000000000000000000000000000..158e5f09e47ceb2e20027a9f8a308256c03a7b52 --- /dev/null +++ b/evaluation/generation/agg.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 14.788126595166384, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.18690040009176595}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.5482226243244983, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0032124524377117303}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.46345359030928057, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0028329342309959572}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.48045922670165075, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002311569925954736}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.27446812632372075, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0026299321970613995}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.2307927377768704, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002303932783753952}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.23850438072833094, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.002104880750018495}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.41268229797839345, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.002885772570892295}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.34835409226615954, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0024733395312169725}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.36091724958116495, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0021212744636126225}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.46563464741217186, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0031424166236039795}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.39431106312311454, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0027657116569750795}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.4083787027633123, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002378318389395946}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep2/scratch/project_462000241/muennighoff/checkpoints_2b855boscartasky/transformers,use_accelerate=True,tokenizer=/pfs/lustrep2/scratch/project_462000241/muennighoff/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/evaluation/generation/agg.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_5.json b/evaluation/generation/agg.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_5.json new file mode 100644 index 0000000000000000000000000000000000000000..a95513b68fffa51821ce7f700771689bbecd1d37 --- /dev/null +++ b/evaluation/generation/agg.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 14.816898114308659, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.2578943709075645}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.5463965076188966, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0031073173050291266}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.464786934938444, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0027602961923400638}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.4813258409710542, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0022155647522670184}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.2741684381980198, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002611976915331882}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.23130694014233213, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002261839435649896}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.23912604624839506, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.002085017928004898}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.4127791623423059, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0027864194329252517}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.35037713824014954, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002413674972879658}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.3627614127879427, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002049692110741263}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.4665920267581706, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0030279579394666637}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.3970310588904633, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0026745772949146518}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.41096147149083767, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0022723564955560572}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep2/scratch/project_462000241/muennighoff/checkpoints_2b855boscartasky/transformers,use_accelerate=True,tokenizer=/pfs/lustrep2/scratch/project_462000241/muennighoff/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/evaluation/generation/agg.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_0.json b/evaluation/generation/agg.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_0.json new file mode 100644 index 0000000000000000000000000000000000000000..17df8be1d9fcc555d15adefaf7c66d12201668ef --- /dev/null +++ b/evaluation/generation/agg.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.13515325881990387, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001988006386809129}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.3082267363959881, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004580722609646003}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.18486846991878378, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0026472370081744873}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.0272859934174315, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.000998594442193996}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.06551213025624243, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0024545590859311856}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.03791043043165935, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0013771627135441805}, {"task_name": "gem_xsum", "prompt_name": 
"article_DOC_summary", "rougeL_precision": 0.10194380290336473, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0014645033080314515}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.23262788209228538, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.003415563039987054}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.13935422276606838, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0019388171493750941}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.10728605862144046, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0016390380124941084}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.24560801323409917, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0038465518145899083}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.14683463366193603, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0021884358384104898}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 1.42436283894258, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0809652794130641}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep2/scratch/project_462000241/muennighoff/checkpoints_2b855boscartasky/transformers,use_accelerate=True,tokenizer=/pfs/lustrep2/scratch/project_462000241/muennighoff/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/evaluation/generation/agg.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_1.json b/evaluation/generation/agg.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_1.json new file mode 100644 index 
0000000000000000000000000000000000000000..550ba355d38f3d32e4b219d94dc978ccbec22099
--- /dev/null
+++ b/evaluation/generation/agg.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_1.json
@@ -0,0 +1 @@
+{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.18034557721867206, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003378966024270665}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.25165661054563454, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004219363793342944}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.19377928742866415, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0029318979888479386}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.03174129360340327, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0016980099731496725}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.04525380270312792, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0021325289082326157}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.03391705069355955, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001600827686299807}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.1348915392123015, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.002633227095382269}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.18737022577489543, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0031416529379445714}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.14433798493238226, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0022145818855320775}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.1387175358944472, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0026549430237383285}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.19658579721494118, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003531925441008354}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.14964383560266273, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002331396812470961}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 1.5072643038464826, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07581189432866778}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep2/scratch/project_462000241/muennighoff/checkpoints_2b855boscartasky/transformers,use_accelerate=True,tokenizer=/pfs/lustrep2/scratch/project_462000241/muennighoff/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
\ No newline at end of file
diff --git a/evaluation/generation/agg.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_2.json b/evaluation/generation/agg.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_2.json
new file mode 100644
index 0000000000000000000000000000000000000000..a5a7a5a3e791126c9bd19c707d69e0a21bf0b082
--- /dev/null
+++ b/evaluation/generation/agg.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_2.json
@@ -0,0 +1 @@
+{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.22068687589610897, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003789951247255284}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.25078858709670043, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name":
null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0039415090299140245}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.21899687156092093, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0031785578491904183}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.04612059160046461, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00216246272019255}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.05136702902319479, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002315418839001681}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.04511500976844758, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001986723782931247}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.1666654493818604, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.002931741970841584}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.19015440175110393, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0030722371403492073}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.16528451206309597, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002448179539908828}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.1684803863889924, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 
0.0029515578981164007}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.19342460055959376, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0032586706636470127}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.16744440121325288, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0024991880630578747}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 2.135517530489009, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.09395977154683074}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep2/scratch/project_462000241/muennighoff/checkpoints_2b855boscartasky/transformers,use_accelerate=True,tokenizer=/pfs/lustrep2/scratch/project_462000241/muennighoff/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
\ No newline at end of file
diff --git a/evaluation/generation/agg.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_3.json b/evaluation/generation/agg.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_3.json
new file mode 100644
index 0000000000000000000000000000000000000000..0e1d5c261369348c364bad41220a6187a9a37f84
--- /dev/null
+++ b/evaluation/generation/agg.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_3.json
@@ -0,0 +1 @@
+{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.21755397454942801, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0040197643665198}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.24395196225361002, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0041819563633223285}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.21441285271309635, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0034688162829816867}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.04608502015312911, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum",
"dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0021286532637906437}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.05204692726094356, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002308596850566173}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.0455324464868029, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0019857349620838816}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.16247030281934602, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0031264236020145395}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.18178458588619653, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0031938440097234347}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.159773564475991, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002683043459624172}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.16495789870371907, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0031401120246905712}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.18683194228664585, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0034187194500331993}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.16297000957614646, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", 
"rougeLsum_fmeasure_stderr": 0.002742910817454915}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 2.306953746485347, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.08640815655966855}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep2/scratch/project_462000241/muennighoff/checkpoints_2b855boscartasky/transformers,use_accelerate=True,tokenizer=/pfs/lustrep2/scratch/project_462000241/muennighoff/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/evaluation/generation/agg.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_4.json b/evaluation/generation/agg.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_4.json new file mode 100644 index 0000000000000000000000000000000000000000..5752947d731521da9d8a61e042e07a9f0e3246ff --- /dev/null +++ b/evaluation/generation/agg.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.05735338761210453, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0035071456355189675}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.059021496533637884, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0036712685384006774}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.05269608047169612, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.003098859012482462}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.010781643643676288, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0011947055672951296}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.01223156205613527, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0013826375879066437}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.010458434372266581, "fixed_answer_choice_list": null, 
"dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0011178203055454239}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.04368080562194709, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.002762386413099355}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.04437717860143795, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0028397295075813945}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.03966278292494188, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0023832058820488743}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.04449065152016272, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002797505308089145}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.04583958323736499, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0029805207884232624}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.04062836833112145, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0024442501537786706}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 0.21661324901347828, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07403289458610106}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep2/scratch/project_462000241/muennighoff/checkpoints_2b855boscartasky/transformers,use_accelerate=True,tokenizer=/pfs/lustrep2/scratch/project_462000241/muennighoff/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 
3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/evaluation/generation/agg.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_5.json b/evaluation/generation/agg.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_5.json new file mode 100644 index 0000000000000000000000000000000000000000..a63823a46177f89394c44da0c9e7b0c9bdd5203a --- /dev/null +++ b/evaluation/generation/agg.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.003544629362742575, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0009485977995739016}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.0037550498645595996, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0011352337360716696}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.0028901136641309804, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0007628453825537149}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.0006426177000561377, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00026640442181651204}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.0013262549819153596, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0006559432301667889}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.0007826161154573944, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0003313480806525455}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.002882853322968734, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0007611952222133634}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 
0.003139944055164728, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0009749927515076235}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.002389689764554573, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0006340237380334276}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.002902333778437242, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0007672460026879215}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.0031769694879492582, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0010086213776490023}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.00241629675514098, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0006494890213851666}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 1.5974310709826073e-21, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 1.3100992922519676e-15}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep2/scratch/project_462000241/muennighoff/checkpoints_2b855boscartasky/transformers,use_accelerate=True,tokenizer=/pfs/lustrep2/scratch/project_462000241/muennighoff/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/evaluation/generation/examples.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_0.jsonl b/evaluation/generation/examples.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a41a8a4c1acb8022efbfb60b43b30c7270084159 --- /dev/null +++ b/evaluation/generation/examples.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2451355dbc6ecf61ecf6f46b5f1533a0f35e15062fefc4af6d5ace09dd81e825 +size 3982993 diff --git a/evaluation/generation/examples.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_1.jsonl 
b/evaluation/generation/examples.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5a85dc2467ef025e9862187fb0e6049ac01592c1 --- /dev/null +++ b/evaluation/generation/examples.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23178510788e0243ab26492b29e3671ce8ca27db2d17e238a6c4bedf0ee22047 +size 4847367 diff --git a/evaluation/generation/examples.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_2.jsonl b/evaluation/generation/examples.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4295d648f8ceaa61ea810839eeedf49a0170c24c --- /dev/null +++ b/evaluation/generation/examples.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9aef74c3ffb6b3f4d2f16584e518b29506b7412fceef949dbac8fdd712638d8 +size 5811137 diff --git a/evaluation/generation/examples.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_3.jsonl b/evaluation/generation/examples.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9b911ffbe9be177c86b054a4af8ef079203657ea --- /dev/null +++ b/evaluation/generation/examples.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c20731be143fe20dbbe77d164b4db9ca631e3da5e9f5ddcad4b12a512bcf5015 +size 6700805 diff --git a/evaluation/generation/examples.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_4.jsonl b/evaluation/generation/examples.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..27289802d6deb6ee2116d2cfe3aacfba24eceeaa --- /dev/null +++ b/evaluation/generation/examples.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ca4818a55ccfccc013eadd26b08d5f7f44ceabb8f07e7435e50a1d715db8d84 +size 7580127 diff --git a/evaluation/generation/examples.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_5.jsonl b/evaluation/generation/examples.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ce5b0d423f7b54af3573bd001d09db1c0a91b323 --- /dev/null +++ b/evaluation/generation/examples.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f629a9199998eda2cf2bee34e15475fab7ea92a6727ef1cba82d8810878fc053 +size 8462227 diff --git a/evaluation/generation/examples.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_0.jsonl b/evaluation/generation/examples.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a8d245f6fa6ba003a7c65db56d822592b20fa992 --- /dev/null +++ b/evaluation/generation/examples.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8f25b37c336043fa5c942f06070dabae3eda2634075036d8bb7c76b4da22bc35 +size 7589559 diff --git a/evaluation/generation/examples.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_1.jsonl 
b/evaluation/generation/examples.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..214e4bc207bc16f43c710b3f03535642e11d9182 --- /dev/null +++ b/evaluation/generation/examples.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9899f03ed89bbc592e12294b60c071b64190245407a3df24298ef4cb761bd01a +size 12969553 diff --git a/evaluation/generation/examples.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_2.jsonl b/evaluation/generation/examples.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..389012d27858b97239a08fb2d528fa9862a43a83 --- /dev/null +++ b/evaluation/generation/examples.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0564614129e53accb30341237f422a050c5f39b64c4ca37a0eeb3843f97b44ad +size 18545638 diff --git a/evaluation/generation/examples.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_3.jsonl b/evaluation/generation/examples.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b09a5b4026a0890bb513c2fe60f7c71aade15bce --- /dev/null +++ b/evaluation/generation/examples.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:22737aec22a24fe6567f7e22b9ab268bc577dbab3b2524725fbd71ee2a11ec0d +size 24016258 diff --git a/evaluation/generation/examples.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_4.jsonl b/evaluation/generation/examples.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4018ce40eadb01ff48fb3ba8792a2c904b00333d --- /dev/null +++ b/evaluation/generation/examples.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7bd56c4b2e1ff5acd018879af81d0ec33f603367a67c7f50f2ec8e4e50736060 +size 29355453 diff --git a/evaluation/generation/examples.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_5.jsonl b/evaluation/generation/examples.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1aa5dc3aa9ee3f41dfd4fc347e810be6c82f0529 --- /dev/null +++ b/evaluation/generation/examples.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8004d231e833bafa9d38c63bfbf62d1e9c6a0b982e16d3f716255f018b7dff53 +size 34782772 diff --git a/evaluation/generation/examples.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_0.jsonl b/evaluation/generation/examples.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..aa6222a010f18f1d9b42e22cd856b8986e60c119 --- /dev/null +++ b/evaluation/generation/examples.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3bbac395e2abf05b8b7c689c78354e489494bf791daf5256784a57947cf7972e +size 4217659 diff --git a/evaluation/generation/examples.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_1.jsonl 
b/evaluation/generation/examples.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..abe3637b815e9481d0343327113523960d894db1 --- /dev/null +++ b/evaluation/generation/examples.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b8ac3cc5ae134bf0187190eb779207a918003fb8f379af03a88a22534796fff1 +size 5028135 diff --git a/evaluation/generation/examples.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_2.jsonl b/evaluation/generation/examples.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..44646e5534b19453444cabdd7215095b12a4c6fa --- /dev/null +++ b/evaluation/generation/examples.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8f092e42a367c56fbe377d777c6c18ef8a1928efdd88651077c33ed7927c3b43 +size 6122993 diff --git a/evaluation/generation/examples.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_3.jsonl b/evaluation/generation/examples.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f21291b86c1efcbf055aba82ee3fd7f344339ee7 --- /dev/null +++ b/evaluation/generation/examples.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9dd34d8eea7e2b960334cca235ea55528556eb83dbb2cd7744d9daba6fb751e4 +size 7200825 diff --git a/evaluation/generation/examples.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl b/evaluation/generation/examples.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8289c1874190eff2191578ccc62fe3f6f178c5e9 --- /dev/null +++ b/evaluation/generation/examples.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2296383a810f92aa74a80a07a7e5d67d4adac62484c7feecbc93531f70df0b1c +size 8275191 diff --git a/evaluation/generation/examples.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_5.jsonl b/evaluation/generation/examples.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..dd0bd3503c6085ade28e02a9c4a45c7aa7fb6d85 --- /dev/null +++ b/evaluation/generation/examples.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:04584777b039d65cac9d9d98855a403e6fe8a9f260a2650ce50bbfec5a7912e7 +size 9356862 diff --git a/evaluation/generation/examples.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_0.jsonl b/evaluation/generation/examples.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ca34961958e6094f9edf4b736cd94f736714613b --- /dev/null +++ b/evaluation/generation/examples.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:f9cfcf8eef08371fd49572807910296aa903cf721830dbc4df77e605be85ee2c +size 2819469 diff --git a/evaluation/generation/examples.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_1.jsonl b/evaluation/generation/examples.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bc476ad151cf5bcf8b0c0c8bf960b435efda3e8f --- /dev/null +++ b/evaluation/generation/examples.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ea84d7e818dc8c98f494d44ac84ce557a5d34a2b6d9e6a8741071ba146694bc +size 4982442 diff --git a/evaluation/generation/examples.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_2.jsonl b/evaluation/generation/examples.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a03fbe35db94e1620e797de640c9d9412de4a9ee --- /dev/null +++ b/evaluation/generation/examples.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a2692fb1e98d5a9704d5ee926d8424d33b27a10b79ef4f8fad33afb476cc34e9 +size 7217364 diff --git a/evaluation/generation/examples.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_3.jsonl b/evaluation/generation/examples.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d0b3efd82baaf56ab0fe6029fe16ce6de3bd6999 --- /dev/null +++ b/evaluation/generation/examples.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f504bfb2a2b0015a174b6f17d7e792302307943f5b470cafba4d9630bff7a33b +size 9495458 diff --git a/evaluation/generation/examples.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_4.jsonl b/evaluation/generation/examples.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..be8a7a030559dffe658e30b5269214fb713e78ea --- /dev/null +++ b/evaluation/generation/examples.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1e5d9474f2f265f545a5671d21013a7a7e0fbfe1aed37fb8e753f8475a10a649 +size 11637450 diff --git a/evaluation/generation/examples.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_5.jsonl b/evaluation/generation/examples.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e895bccdffeec724f4f4c7930e4799f9be936d09 --- /dev/null +++ b/evaluation/generation/examples.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eed5c618b2c15d3ed7c38c0651658c75f055dee97f605df453c271d9e3769a41 +size 13865437 diff --git a/evaluation/generation/merged.csv b/evaluation/generation/merged.csv new file mode 100644 index 0000000000000000000000000000000000000000..3331c0e795eab19509b38bb7723dec470c7a3e6d --- /dev/null +++ b/evaluation/generation/merged.csv @@ -0,0 +1,53 @@ +dataset,fewshots,prompt,metric,value +e2e_nlg_cleaned,0,generate_text_restaurant,rouge2_fmeasure,0.097088577058388 +e2e_nlg_cleaned,0,median,rouge2_fmeasure,0.097088577058388 +e2e_nlg_cleaned,1,generate_text_restaurant,rouge2_fmeasure,0.2028107162678877 
+e2e_nlg_cleaned,1,median,rouge2_fmeasure,0.2028107162678877 +e2e_nlg_cleaned,2,generate_text_restaurant,rouge2_fmeasure,0.23112076704320592 +e2e_nlg_cleaned,2,median,rouge2_fmeasure,0.23112076704320592 +e2e_nlg_cleaned,3,generate_text_restaurant,rouge2_fmeasure,0.23634052735419925 +e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.23634052735419925 +e2e_nlg_cleaned,4,generate_text_restaurant,rouge2_fmeasure,0.23850438072833094 +e2e_nlg_cleaned,4,median,rouge2_fmeasure,0.23850438072833094 +e2e_nlg_cleaned,5,generate_text_restaurant,rouge2_fmeasure,0.23912604624839506 +e2e_nlg_cleaned,5,median,rouge2_fmeasure,0.23912604624839506 +e2e_nlg_cleaned,5,average,multiple,0.20749850245006782 +gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.03791043043165935 +gem_xsum,0,median,rouge2_fmeasure,0.03791043043165935 +gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.03391705069355955 +gem_xsum,1,median,rouge2_fmeasure,0.03391705069355955 +gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.04511500976844758 +gem_xsum,2,median,rouge2_fmeasure,0.04511500976844758 +gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.0455324464868029 +gem_xsum,3,median,rouge2_fmeasure,0.0455324464868029 +gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.010458434372266581 +gem_xsum,4,median,rouge2_fmeasure,0.010458434372266581 +gem_xsum,5,article_DOC_summary,rouge2_fmeasure,0.0007826161154573944 +gem_xsum,5,median,rouge2_fmeasure,0.0007826161154573944 +gem_xsum,5,average,multiple,0.02895266464469889 +web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.03968682582435309 +web_nlg_en,0,median,rouge2_fmeasure,0.03968682582435309 +web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.050696350493553845 +web_nlg_en,1,median,rouge2_fmeasure,0.050696350493553845 +web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.05746066096268111 +web_nlg_en,2,median,rouge2_fmeasure,0.05746066096268111 +web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.06675560716724324 +web_nlg_en,3,median,rouge2_fmeasure,0.06675560716724324 +web_nlg_en,4,PALM_prompt,rouge2_fmeasure,0.07435510796633933 +web_nlg_en,4,median,rouge2_fmeasure,0.07435510796633933 +web_nlg_en,5,PALM_prompt,rouge2_fmeasure,0.07729991696510335 +web_nlg_en,5,median,rouge2_fmeasure,0.07729991696510335 +web_nlg_en,5,average,multiple,0.06104241156321233 +wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.03932636998031157 +wiki_lingua_en,0,median,rouge2_fmeasure,0.03932636998031157 +wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.02928962923956642 +wiki_lingua_en,1,median,rouge2_fmeasure,0.02928962923956642 +wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.049895782057071895 +wiki_lingua_en,2,median,rouge2_fmeasure,0.049895782057071895 +wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.04580407896668463 +wiki_lingua_en,3,median,rouge2_fmeasure,0.04580407896668463 +wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.015865483046525946 +wiki_lingua_en,4,median,rouge2_fmeasure,0.015865483046525946 +wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.002496646429114283 +wiki_lingua_en,5,median,rouge2_fmeasure,0.002496646429114283 +wiki_lingua_en,5,average,multiple,0.030446331619879124 diff --git a/evaluation/generation/merged.json b/evaluation/generation/merged.json new file mode 100644 index 0000000000000000000000000000000000000000..5385baaada544a386d2284a7aa9fab2b6fb4c218 --- /dev/null +++ b/evaluation/generation/merged.json @@ -0,0 +1 @@ +{"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.25518818499117035, "bleu_stderr": 0.020198929979072126, "rouge1_fmeasure": 0.08713006240052369, "rouge1_fmeasure_stderr": 0.0021163575152811052, "rouge1_precision": 0.06424858701254539, 
"rouge1_precision_stderr": 0.002540662604602708, "rouge1_recall": 0.24194520608398293, "rouge1_recall_stderr": 0.005061888078063388, "rouge2_fmeasure": 0.03968682582435309, "rouge2_fmeasure_stderr": 0.0012864152483850072, "rouge2_precision": 0.029250897601530847, "rouge2_precision_stderr": 0.0016159150314360084, "rouge2_recall": 0.11213489324648325, "rouge2_recall_stderr": 0.0031431788617556264, "rougeL_fmeasure": 0.08351980735691193, "rougeL_fmeasure_stderr": 0.001978263321627184, "rougeL_precision": 0.061593275111588554, "rougeL_precision_stderr": 0.0024488839830169046, "rougeL_recall": 0.2334118422350547, "rougeL_recall_stderr": 0.0048787714459462665, "rougeLsum_fmeasure": 0.08198324173226955, "rougeLsum_fmeasure_stderr": 0.0019772453103279655, "rougeLsum_precision": 0.06070440310915625, "rougeLsum_precision_stderr": 0.0024423712896334553, "rougeLsum_recall": 0.22684155273254836, "rougeLsum_recall_stderr": 0.0046468714919551185}}, "1": {"PALM_prompt": {"bleu": 0.37770333810014706, "bleu_stderr": 0.036043662376688344, "rouge1_fmeasure": 0.11029589045724046, "rouge1_fmeasure_stderr": 0.0026302378640336893, "rouge1_precision": 0.08688444048786848, "rouge1_precision_stderr": 0.0030316834829975468, "rouge1_recall": 0.25231836055300055, "rouge1_recall_stderr": 0.004707834740663268, "rouge2_fmeasure": 0.050696350493553845, "rouge2_fmeasure_stderr": 0.001601172935799986, "rouge2_precision": 0.039888024685296904, "rouge2_precision_stderr": 0.0019445292240548378, "rouge2_recall": 0.12320682798737367, "rouge2_recall_stderr": 0.0031690473734229127, "rougeL_fmeasure": 0.10289898300711169, "rougeL_fmeasure_stderr": 0.002349648102435476, "rougeL_precision": 0.08067935525490526, "rougeL_precision_stderr": 0.0028191568459744356, "rougeL_recall": 0.2400568623297162, "rougeL_recall_stderr": 0.004443711432592927, "rougeLsum_fmeasure": 0.10416303503161863, "rougeLsum_fmeasure_stderr": 0.0024045282990398016, "rougeLsum_precision": 0.081852061098128, "rougeLsum_precision_stderr": 0.0028519921028834602, "rougeLsum_recall": 0.24140934346009968, "rougeLsum_recall_stderr": 0.0044596632225301245}}, "2": {"PALM_prompt": {"bleu": 0.3829018871876502, "bleu_stderr": 0.022682302500048476, "rouge1_fmeasure": 0.12236842718754035, "rouge1_fmeasure_stderr": 0.0030225942233901805, "rouge1_precision": 0.09533308838612936, "rouge1_precision_stderr": 0.0031918959249739934, "rouge1_recall": 0.275555629679834, "rouge1_recall_stderr": 0.004727965781775394, "rouge2_fmeasure": 0.05746066096268111, "rouge2_fmeasure_stderr": 0.0018708723435892534, "rouge2_precision": 0.04438815526628443, "rouge2_precision_stderr": 0.0018839731929603098, "rouge2_recall": 0.13438684673727228, "rouge2_recall_stderr": 0.0032179061201762227, "rougeL_fmeasure": 0.11131192366136156, "rougeL_fmeasure_stderr": 0.0025600729892227623, "rougeL_precision": 0.08478065899787014, "rougeL_precision_stderr": 0.0026474348060842394, "rougeL_recall": 0.25951028904964774, "rougeL_recall_stderr": 0.004388247219710807, "rougeLsum_fmeasure": 0.11327636810594607, "rougeLsum_fmeasure_stderr": 0.00265570207245623, "rougeLsum_precision": 0.0870012622715152, "rougeLsum_precision_stderr": 0.002779405775352401, "rougeLsum_recall": 0.2612564683466686, "rougeLsum_recall_stderr": 0.004413760869764198}}, "3": {"PALM_prompt": {"bleu": 0.49438795846003614, "bleu_stderr": 0.03220480694116766, "rouge1_fmeasure": 0.13661508516896614, "rouge1_fmeasure_stderr": 0.0033865188061428593, "rouge1_precision": 0.11102978394749591, "rouge1_precision_stderr": 0.0037258981840235512, "rouge1_recall": 
0.29324047704391387, "rouge1_recall_stderr": 0.004929158066657188, "rouge2_fmeasure": 0.06675560716724324, "rouge2_fmeasure_stderr": 0.002152759177258534, "rouge2_precision": 0.05489349981744968, "rouge2_precision_stderr": 0.002411444050666127, "rouge2_recall": 0.1460318403788494, "rouge2_recall_stderr": 0.0034080826963901003, "rougeL_fmeasure": 0.12331965784710698, "rougeL_fmeasure_stderr": 0.0028547373168783547, "rougeL_precision": 0.0983510025951489, "rougeL_precision_stderr": 0.0031311578361653356, "rougeL_recall": 0.27396911503915883, "rougeL_recall_stderr": 0.004527803475867941, "rougeLsum_fmeasure": 0.1258905544050262, "rougeLsum_fmeasure_stderr": 0.0029667713047718076, "rougeLsum_precision": 0.10118232307247621, "rougeLsum_precision_stderr": 0.0032732569349238874, "rougeLsum_recall": 0.2768278339741749, "rougeLsum_recall_stderr": 0.004591647175413129}}, "4": {"PALM_prompt": {"bleu": 0.5097531919668136, "bleu_stderr": 0.037846557427194026, "rouge1_fmeasure": 0.15051290928420963, "rouge1_fmeasure_stderr": 0.0035962245168251086, "rouge1_precision": 0.12660445313737514, "rouge1_precision_stderr": 0.004069129304355684, "rouge1_recall": 0.30584134713679634, "rouge1_recall_stderr": 0.004938665654397721, "rouge2_fmeasure": 0.07435510796633933, "rouge2_fmeasure_stderr": 0.002254816031736983, "rouge2_precision": 0.06380697763757867, "rouge2_precision_stderr": 0.0025616190740023713, "rouge2_recall": 0.1531395402953802, "rouge2_recall_stderr": 0.0033953087678648087, "rougeL_fmeasure": 0.13483336166859466, "rougeL_fmeasure_stderr": 0.0030388433299652576, "rougeL_precision": 0.11139297723344309, "rougeL_precision_stderr": 0.003440135146254942, "rougeL_recall": 0.28386785189457364, "rougeL_recall_stderr": 0.00447862149321582, "rougeLsum_fmeasure": 0.13843888381409075, "rougeLsum_fmeasure_stderr": 0.0031866033556944046, "rougeLsum_precision": 0.11506907251192842, "rougeLsum_precision_stderr": 0.0036096573854085593, "rougeLsum_recall": 0.287989283624223, "rougeLsum_recall_stderr": 0.004567845010036214}}, "5": {"PALM_prompt": {"bleu": 0.5493345438208933, "bleu_stderr": 0.03156507075706306, "rouge1_fmeasure": 0.15523792842972664, "rouge1_fmeasure_stderr": 0.003705043227083823, "rouge1_precision": 0.13145167799279456, "rouge1_precision_stderr": 0.004208017486529403, "rouge1_recall": 0.31431356632933516, "rouge1_recall_stderr": 0.004912309978055554, "rouge2_fmeasure": 0.07729991696510335, "rouge2_fmeasure_stderr": 0.002362198278661277, "rouge2_precision": 0.06682039356587426, "rouge2_precision_stderr": 0.0026734322118555824, "rouge2_recall": 0.1593483911606193, "rouge2_recall_stderr": 0.003488245506035694, "rougeL_fmeasure": 0.13867709585369095, "rougeL_fmeasure_stderr": 0.0031268375342873036, "rougeL_precision": 0.11521584250075316, "rougeL_precision_stderr": 0.0035483893462582828, "rougeL_recall": 0.2911824630349328, "rougeL_recall_stderr": 0.00449934920077821, "rougeLsum_fmeasure": 0.1428199679819932, "rougeLsum_fmeasure_stderr": 0.0032823766236227112, "rougeLsum_precision": 0.11975844765120187, "rougeLsum_precision_stderr": 0.0037510579951047463, "rougeLsum_recall": 0.29608094482109, "rougeLsum_recall_stderr": 0.00457219328093606}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 2.2908341082810773, "bleu_stderr": 0.08591697554573632, "rouge1_fmeasure": 0.17176517291666785, "rouge1_fmeasure_stderr": 0.002329017824094656, "rouge1_precision": 0.15983607856511065, "rouge1_precision_stderr": 0.0025552324547594915, "rouge1_recall": 0.22751168503589744, "rouge1_recall_stderr": 0.0030756037547284122, 
"rouge2_fmeasure": 0.03932636998031157, "rouge2_fmeasure_stderr": 0.0009987340137274396, "rouge2_precision": 0.03630672989355793, "rouge2_precision_stderr": 0.0010196024456578525, "rouge2_recall": 0.052475014894037275, "rouge2_recall_stderr": 0.001428315830366114, "rougeL_fmeasure": 0.13522236510382263, "rougeL_fmeasure_stderr": 0.001715199895595129, "rougeL_precision": 0.12519971826411405, "rougeL_precision_stderr": 0.0019474345957730163, "rougeL_recall": 0.1828331784278453, "rougeL_recall_stderr": 0.002460695997235263, "rougeLsum_fmeasure": 0.15863624899200707, "rougeLsum_fmeasure_stderr": 0.0021698433809270567, "rougeLsum_precision": 0.1479787200014008, "rougeLsum_precision_stderr": 0.0024084994322680275, "rougeLsum_recall": 0.21009537033174597, "rougeLsum_recall_stderr": 0.0028681840700951855}}, "1": {"tldr_en": {"bleu": 1.8024864455011091, "bleu_stderr": 0.09003925624860501, "rouge1_fmeasure": 0.1500600406332044, "rouge1_fmeasure_stderr": 0.0019241989868486221, "rouge1_precision": 0.2074831448217252, "rouge1_precision_stderr": 0.003524413251335538, "rouge1_recall": 0.1603567541864711, "rouge1_recall_stderr": 0.002358874310291858, "rouge2_fmeasure": 0.02928962923956642, "rouge2_fmeasure_stderr": 0.000993776980137765, "rouge2_precision": 0.048193206661683564, "rouge2_precision_stderr": 0.0020379873823770994, "rouge2_recall": 0.030329812370434826, "rouge2_recall_stderr": 0.0011532237790344163, "rougeL_fmeasure": 0.11776814806946785, "rougeL_fmeasure_stderr": 0.0014763519012599205, "rougeL_precision": 0.16593034599476272, "rougeL_precision_stderr": 0.002952779433549126, "rougeL_recall": 0.1263395938894177, "rougeL_recall_stderr": 0.0018438888158069336, "rougeLsum_fmeasure": 0.14183665338988743, "rougeLsum_fmeasure_stderr": 0.001800224756397767, "rougeLsum_precision": 0.19732631307428722, "rougeLsum_precision_stderr": 0.0034023899501457837, "rougeLsum_recall": 0.15136030413475798, "rougeLsum_recall_stderr": 0.0021926321158690864}}, "2": {"tldr_en": {"bleu": 2.9479127015875357, "bleu_stderr": 0.07796616677370145, "rouge1_fmeasure": 0.1947292213306491, "rouge1_fmeasure_stderr": 0.0021918313174583916, "rouge1_precision": 0.28596147697474456, "rouge1_precision_stderr": 0.0040250344618331995, "rouge1_recall": 0.1993639467154353, "rouge1_recall_stderr": 0.002664319477833243, "rouge2_fmeasure": 0.049895782057071895, "rouge2_fmeasure_stderr": 0.0012048654152258728, "rouge2_precision": 0.08090209451722337, "rouge2_precision_stderr": 0.0024314880977671276, "rouge2_recall": 0.05022891340202324, "rouge2_recall_stderr": 0.0013490243270254702, "rougeL_fmeasure": 0.15366314427734978, "rougeL_fmeasure_stderr": 0.0016931718188903863, "rougeL_precision": 0.22915557741483644, "rougeL_precision_stderr": 0.0033926337363906784, "rougeL_recall": 0.15804196664911024, "rougeL_recall_stderr": 0.002108622273540943, "rougeLsum_fmeasure": 0.18325876600451932, "rougeLsum_fmeasure_stderr": 0.0020600409806209663, "rougeLsum_precision": 0.2703085422322766, "rougeLsum_precision_stderr": 0.003864247203591127, "rougeLsum_recall": 0.18781780308286916, "rougeLsum_recall_stderr": 0.0025166845226557905}}, "3": {"tldr_en": {"bleu": 2.368296526557304, "bleu_stderr": 0.06715896563436964, "rouge1_fmeasure": 0.17047407122919256, "rouge1_fmeasure_stderr": 0.0024381191437714608, "rouge1_precision": 0.26271995584299296, "rouge1_precision_stderr": 0.004350571081047754, "rouge1_recall": 0.17195910512668672, "rouge1_recall_stderr": 0.0028930003693202434, "rouge2_fmeasure": 0.04580407896668463, "rouge2_fmeasure_stderr": 
0.001240136840680315, "rouge2_precision": 0.07716157594080365, "rouge2_precision_stderr": 0.0025215008865493967, "rouge2_recall": 0.04592292328627891, "rouge2_recall_stderr": 0.0014011960521198414, "rougeL_fmeasure": 0.1358429496475672, "rougeL_fmeasure_stderr": 0.0019120675392480628, "rougeL_precision": 0.2129040576929948, "rougeL_precision_stderr": 0.0036832280907527527, "rougeL_recall": 0.13748507108736507, "rougeL_recall_stderr": 0.0023122364311355848, "rougeLsum_fmeasure": 0.16017361729618249, "rougeLsum_fmeasure_stderr": 0.0022880671169300927, "rougeLsum_precision": 0.24858343528470517, "rougeLsum_precision_stderr": 0.0041894189677485535, "rougeLsum_recall": 0.1614787885865035, "rougeLsum_recall_stderr": 0.0027151405967413447}}, "4": {"tldr_en": {"bleu": 0.03702293319391888, "bleu_stderr": 0.008280750551799155, "rouge1_fmeasure": 0.05527677687040673, "rouge1_fmeasure_stderr": 0.002075334915597304, "rouge1_precision": 0.09226808587029187, "rouge1_precision_stderr": 0.0035951645035662144, "rouge1_recall": 0.05378214914394185, "rouge1_recall_stderr": 0.0021922619377159835, "rouge2_fmeasure": 0.015865483046525946, "rouge2_fmeasure_stderr": 0.0009374894411648104, "rouge2_precision": 0.028532578970617953, "rouge2_precision_stderr": 0.0018587749480454441, "rouge2_recall": 0.01548143136364689, "rouge2_recall_stderr": 0.0010135388288224974, "rougeL_fmeasure": 0.04511311086771814, "rougeL_fmeasure_stderr": 0.0016959499204821453, "rougeL_precision": 0.07725545552877301, "rougeL_precision_stderr": 0.0031119081702266804, "rougeL_recall": 0.04393223408477368, "rougeL_recall_stderr": 0.0018056545763819988, "rougeLsum_fmeasure": 0.05162745766582887, "rougeLsum_fmeasure_stderr": 0.0019361443382161162, "rougeLsum_precision": 0.08723167275286887, "rougeLsum_precision_stderr": 0.003438050373646056, "rougeLsum_recall": 0.05010740359653005, "rougeLsum_recall_stderr": 0.0020446842446827317}}, "5": {"tldr_en": {"bleu": 6.329549235648946e-15, "bleu_stderr": 6.98643056969485e-14, "rouge1_fmeasure": 0.008715725437140403, "rouge1_fmeasure_stderr": 0.0008716993431018379, "rouge1_precision": 0.01624748312843043, "rouge1_precision_stderr": 0.0017343102214544073, "rouge1_recall": 0.008973352017838115, "rouge1_recall_stderr": 0.0009874748509305067, "rouge2_fmeasure": 0.002496646429114283, "rouge2_fmeasure_stderr": 0.00036357062963770885, "rouge2_precision": 0.005494197830878659, "rouge2_precision_stderr": 0.000991214742335091, "rouge2_recall": 0.002558561959956041, "rouge2_recall_stderr": 0.0003855003265324329, "rougeL_fmeasure": 0.0072939865630402565, "rougeL_fmeasure_stderr": 0.0007275988588633965, "rougeL_precision": 0.014045751930703699, "rougeL_precision_stderr": 0.0015619179482028879, "rougeL_recall": 0.0075965284642917335, "rougeL_recall_stderr": 0.000846021103064854, "rougeLsum_fmeasure": 0.008294111648957665, "rougeLsum_fmeasure_stderr": 0.0008329997418165786, "rougeLsum_precision": 0.015553958884576207, "rougeLsum_precision_stderr": 0.001678423693533492, "rougeLsum_recall": 0.008515152310636652, "rougeLsum_recall_stderr": 0.0009413374007224053}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 3.4385508172058055, "bleu_stderr": 0.04264456511068813, "rouge1_fmeasure": 0.2767986999806955, "rouge1_fmeasure_stderr": 0.0032477671776355997, "rouge1_precision": 0.3745168009518315, "rouge1_precision_stderr": 0.0054770341977520625, "rouge1_recall": 0.29958589610084396, "rouge1_recall_stderr": 0.0033801849814915222, "rouge2_fmeasure": 0.097088577058388, "rouge2_fmeasure_stderr": 
0.0017064511604233982, "rouge2_precision": 0.20297808777795995, "rouge2_precision_stderr": 0.005522822982953256, "rouge2_recall": 0.10216853633525805, "rouge2_recall_stderr": 0.0019971383737075166, "rougeL_fmeasure": 0.20603167711573117, "rougeL_fmeasure_stderr": 0.002072804178131474, "rougeL_precision": 0.29931908976323446, "rougeL_precision_stderr": 0.005159547193434527, "rougeL_recall": 0.2284780514712017, "rougeL_recall_stderr": 0.0023634097297777477, "rougeLsum_fmeasure": 0.2467760500253612, "rougeLsum_fmeasure_stderr": 0.00298962192866607, "rougeLsum_precision": 0.34463645738569776, "rougeLsum_precision_stderr": 0.005423162557411763, "rougeLsum_recall": 0.2657387698192473, "rougeLsum_recall_stderr": 0.003165191193253911}}, "1": {"generate_text_restaurant": {"bleu": 11.578509877350227, "bleu_stderr": 0.10981566633879294, "rouge1_fmeasure": 0.4430699483900025, "rouge1_fmeasure_stderr": 0.0024212750351764543, "rouge1_precision": 0.5290497807915124, "rouge1_precision_stderr": 0.0034031031148912877, "rouge1_recall": 0.42449982582372703, "rouge1_recall_stderr": 0.0029796948529652516, "rouge2_fmeasure": 0.2028107162678877, "rouge2_fmeasure_stderr": 0.0020310061478079093, "rouge2_precision": 0.24509073269483966, "rouge2_precision_stderr": 0.0026669698385618284, "rouge2_recall": 0.19440129139428716, "rouge2_recall_stderr": 0.0021684401687693156, "rougeL_fmeasure": 0.324119071682265, "rougeL_fmeasure_stderr": 0.0020761088086072486, "rougeL_precision": 0.38881065801490833, "rougeL_precision_stderr": 0.0029605217311991657, "rougeL_recall": 0.31090092383731366, "rougeL_recall_stderr": 0.002452100105085375, "rougeLsum_fmeasure": 0.36322533452022043, "rougeLsum_fmeasure_stderr": 0.002355969259137101, "rougeLsum_precision": 0.43465610390683307, "rougeLsum_precision_stderr": 0.0032381620072339475, "rougeLsum_recall": 0.348028905687337, "rougeLsum_recall_stderr": 0.00275713194366831}}, "2": {"generate_text_restaurant": {"bleu": 13.973742807059493, "bleu_stderr": 0.21933451006645593, "rouge1_fmeasure": 0.4734319432946387, "rouge1_fmeasure_stderr": 0.002277730687363045, "rouge1_precision": 0.5463581275257614, "rouge1_precision_stderr": 0.0033109746053900106, "rouge1_recall": 0.4599780026221357, "rouge1_recall_stderr": 0.002866451132503217, "rouge2_fmeasure": 0.23112076704320592, "rouge2_fmeasure_stderr": 0.002029026468778622, "rouge2_precision": 0.26984908227570326, "rouge2_precision_stderr": 0.0026581113600308783, "rouge2_recall": 0.22503111854291266, "rouge2_recall_stderr": 0.0022246934402712576, "rougeL_fmeasure": 0.35285117962076507, "rougeL_fmeasure_stderr": 0.002056984438278394, "rougeL_precision": 0.40847493701179866, "rougeL_precision_stderr": 0.0029328594326759366, "rougeL_recall": 0.3432828614570734, "rougeL_recall_stderr": 0.0024768383332246274, "rougeLsum_fmeasure": 0.3979842693641427, "rougeLsum_fmeasure_stderr": 0.002298871731116783, "rougeLsum_precision": 0.4597128692889795, "rougeLsum_precision_stderr": 0.003192730444475765, "rougeLsum_recall": 0.38674562932968487, "rougeLsum_recall_stderr": 0.002735034124705561}}, "3": {"generate_text_restaurant": {"bleu": 14.533287312181702, "bleu_stderr": 0.18656720619703301, "rouge1_fmeasure": 0.478142190779417, "rouge1_fmeasure_stderr": 0.002235263328285793, "rouge1_precision": 0.5451265264934796, "rouge1_precision_stderr": 0.003147288201601649, "rouge1_recall": 0.4635452060391057, "rouge1_recall_stderr": 0.002820441604942112, "rouge2_fmeasure": 0.23634052735419925, "rouge2_fmeasure_stderr": 0.0020601993739155743, "rouge2_precision": 
0.2714328734244194, "rouge2_precision_stderr": 0.002583914662030242, "rouge2_recall": 0.23027470724704344, "rouge2_recall_stderr": 0.0022949497645592134, "rougeL_fmeasure": 0.3585015271693414, "rougeL_fmeasure_stderr": 0.002048646341232902, "rougeL_precision": 0.40976835157500135, "rougeL_precision_stderr": 0.002821032707227765, "rougeL_recall": 0.3479477017249916, "rougeL_recall_stderr": 0.002470400553707357, "rougeLsum_fmeasure": 0.404637798062588, "rougeLsum_fmeasure_stderr": 0.002300343226701871, "rougeLsum_precision": 0.46121561084482127, "rougeLsum_precision_stderr": 0.003066679903899239, "rougeLsum_recall": 0.3924920651465138, "rougeLsum_recall_stderr": 0.0027314711241234945}}, "4": {"generate_text_restaurant": {"bleu": 14.788126595166384, "bleu_stderr": 0.18690040009176595, "rouge1_fmeasure": 0.48045922670165075, "rouge1_fmeasure_stderr": 0.002311569925954736, "rouge1_precision": 0.5482226243244983, "rouge1_precision_stderr": 0.0032124524377117303, "rouge1_recall": 0.46345359030928057, "rouge1_recall_stderr": 0.0028329342309959572, "rouge2_fmeasure": 0.23850438072833094, "rouge2_fmeasure_stderr": 0.002104880750018495, "rouge2_precision": 0.27446812632372075, "rouge2_precision_stderr": 0.0026299321970613995, "rouge2_recall": 0.2307927377768704, "rouge2_recall_stderr": 0.002303932783753952, "rougeL_fmeasure": 0.36091724958116495, "rougeL_fmeasure_stderr": 0.0021212744636126225, "rougeL_precision": 0.41268229797839345, "rougeL_precision_stderr": 0.002885772570892295, "rougeL_recall": 0.34835409226615954, "rougeL_recall_stderr": 0.0024733395312169725, "rougeLsum_fmeasure": 0.4083787027633123, "rougeLsum_fmeasure_stderr": 0.002378318389395946, "rougeLsum_precision": 0.46563464741217186, "rougeLsum_precision_stderr": 0.0031424166236039795, "rougeLsum_recall": 0.39431106312311454, "rougeLsum_recall_stderr": 0.0027657116569750795}}, "5": {"generate_text_restaurant": {"bleu": 14.816898114308659, "bleu_stderr": 0.2578943709075645, "rouge1_fmeasure": 0.4813258409710542, "rouge1_fmeasure_stderr": 0.0022155647522670184, "rouge1_precision": 0.5463965076188966, "rouge1_precision_stderr": 0.0031073173050291266, "rouge1_recall": 0.464786934938444, "rouge1_recall_stderr": 0.0027602961923400638, "rouge2_fmeasure": 0.23912604624839506, "rouge2_fmeasure_stderr": 0.002085017928004898, "rouge2_precision": 0.2741684381980198, "rouge2_precision_stderr": 0.002611976915331882, "rouge2_recall": 0.23130694014233213, "rouge2_recall_stderr": 0.002261839435649896, "rougeL_fmeasure": 0.3627614127879427, "rougeL_fmeasure_stderr": 0.002049692110741263, "rougeL_precision": 0.4127791623423059, "rougeL_precision_stderr": 0.0027864194329252517, "rougeL_recall": 0.35037713824014954, "rougeL_recall_stderr": 0.002413674972879658, "rougeLsum_fmeasure": 0.41096147149083767, "rougeLsum_fmeasure_stderr": 0.0022723564955560572, "rougeLsum_precision": 0.4665920267581706, "rougeLsum_precision_stderr": 0.0030279579394666637, "rougeLsum_recall": 0.3970310588904633, "rougeLsum_recall_stderr": 0.0026745772949146518}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 1.42436283894258, "bleu_stderr": 0.0809652794130641, "rouge1_fmeasure": 0.18486846991878378, "rouge1_fmeasure_stderr": 0.0026472370081744873, "rouge1_precision": 0.13515325881990387, "rouge1_precision_stderr": 0.001988006386809129, "rouge1_recall": 0.3082267363959881, "rouge1_recall_stderr": 0.004580722609646003, "rouge2_fmeasure": 0.03791043043165935, "rouge2_fmeasure_stderr": 0.0013771627135441805, "rouge2_precision": 0.0272859934174315, "rouge2_precision_stderr": 
0.000998594442193996, "rouge2_recall": 0.06551213025624243, "rouge2_recall_stderr": 0.0024545590859311856, "rougeL_fmeasure": 0.13935422276606838, "rougeL_fmeasure_stderr": 0.0019388171493750941, "rougeL_precision": 0.10194380290336473, "rougeL_precision_stderr": 0.0014645033080314515, "rougeL_recall": 0.23262788209228538, "rougeL_recall_stderr": 0.003415563039987054, "rougeLsum_fmeasure": 0.14683463366193603, "rougeLsum_fmeasure_stderr": 0.0021884358384104898, "rougeLsum_precision": 0.10728605862144046, "rougeLsum_precision_stderr": 0.0016390380124941084, "rougeLsum_recall": 0.24560801323409917, "rougeLsum_recall_stderr": 0.0038465518145899083}}, "1": {"article_DOC_summary": {"bleu": 1.5072643038464826, "bleu_stderr": 0.07581189432866778, "rouge1_fmeasure": 0.19377928742866415, "rouge1_fmeasure_stderr": 0.0029318979888479386, "rouge1_precision": 0.18034557721867206, "rouge1_precision_stderr": 0.003378966024270665, "rouge1_recall": 0.25165661054563454, "rouge1_recall_stderr": 0.004219363793342944, "rouge2_fmeasure": 0.03391705069355955, "rouge2_fmeasure_stderr": 0.001600827686299807, "rouge2_precision": 0.03174129360340327, "rouge2_precision_stderr": 0.0016980099731496725, "rouge2_recall": 0.04525380270312792, "rouge2_recall_stderr": 0.0021325289082326157, "rougeL_fmeasure": 0.14433798493238226, "rougeL_fmeasure_stderr": 0.0022145818855320775, "rougeL_precision": 0.1348915392123015, "rougeL_precision_stderr": 0.002633227095382269, "rougeL_recall": 0.18737022577489543, "rougeL_recall_stderr": 0.0031416529379445714, "rougeLsum_fmeasure": 0.14964383560266273, "rougeLsum_fmeasure_stderr": 0.002331396812470961, "rougeLsum_precision": 0.1387175358944472, "rougeLsum_precision_stderr": 0.0026549430237383285, "rougeLsum_recall": 0.19658579721494118, "rougeLsum_recall_stderr": 0.003531925441008354}}, "2": {"article_DOC_summary": {"bleu": 2.135517530489009, "bleu_stderr": 0.09395977154683074, "rouge1_fmeasure": 0.21899687156092093, "rouge1_fmeasure_stderr": 0.0031785578491904183, "rouge1_precision": 0.22068687589610897, "rouge1_precision_stderr": 0.003789951247255284, "rouge1_recall": 0.25078858709670043, "rouge1_recall_stderr": 0.0039415090299140245, "rouge2_fmeasure": 0.04511500976844758, "rouge2_fmeasure_stderr": 0.001986723782931247, "rouge2_precision": 0.04612059160046461, "rouge2_precision_stderr": 0.00216246272019255, "rouge2_recall": 0.05136702902319479, "rouge2_recall_stderr": 0.002315418839001681, "rougeL_fmeasure": 0.16528451206309597, "rougeL_fmeasure_stderr": 0.002448179539908828, "rougeL_precision": 0.1666654493818604, "rougeL_precision_stderr": 0.002931741970841584, "rougeL_recall": 0.19015440175110393, "rougeL_recall_stderr": 0.0030722371403492073, "rougeLsum_fmeasure": 0.16744440121325288, "rougeLsum_fmeasure_stderr": 0.0024991880630578747, "rougeLsum_precision": 0.1684803863889924, "rougeLsum_precision_stderr": 0.0029515578981164007, "rougeLsum_recall": 0.19342460055959376, "rougeLsum_recall_stderr": 0.0032586706636470127}}, "3": {"article_DOC_summary": {"bleu": 2.306953746485347, "bleu_stderr": 0.08640815655966855, "rouge1_fmeasure": 0.21441285271309635, "rouge1_fmeasure_stderr": 0.0034688162829816867, "rouge1_precision": 0.21755397454942801, "rouge1_precision_stderr": 0.0040197643665198, "rouge1_recall": 0.24395196225361002, "rouge1_recall_stderr": 0.0041819563633223285, "rouge2_fmeasure": 0.0455324464868029, "rouge2_fmeasure_stderr": 0.0019857349620838816, "rouge2_precision": 0.04608502015312911, "rouge2_precision_stderr": 0.0021286532637906437, "rouge2_recall": 
0.05204692726094356, "rouge2_recall_stderr": 0.002308596850566173, "rougeL_fmeasure": 0.159773564475991, "rougeL_fmeasure_stderr": 0.002683043459624172, "rougeL_precision": 0.16247030281934602, "rougeL_precision_stderr": 0.0031264236020145395, "rougeL_recall": 0.18178458588619653, "rougeL_recall_stderr": 0.0031938440097234347, "rougeLsum_fmeasure": 0.16297000957614646, "rougeLsum_fmeasure_stderr": 0.002742910817454915, "rougeLsum_precision": 0.16495789870371907, "rougeLsum_precision_stderr": 0.0031401120246905712, "rougeLsum_recall": 0.18683194228664585, "rougeLsum_recall_stderr": 0.0034187194500331993}}, "4": {"article_DOC_summary": {"bleu": 0.21661324901347828, "bleu_stderr": 0.07403289458610106, "rouge1_fmeasure": 0.05269608047169612, "rouge1_fmeasure_stderr": 0.003098859012482462, "rouge1_precision": 0.05735338761210453, "rouge1_precision_stderr": 0.0035071456355189675, "rouge1_recall": 0.059021496533637884, "rouge1_recall_stderr": 0.0036712685384006774, "rouge2_fmeasure": 0.010458434372266581, "rouge2_fmeasure_stderr": 0.0011178203055454239, "rouge2_precision": 0.010781643643676288, "rouge2_precision_stderr": 0.0011947055672951296, "rouge2_recall": 0.01223156205613527, "rouge2_recall_stderr": 0.0013826375879066437, "rougeL_fmeasure": 0.03966278292494188, "rougeL_fmeasure_stderr": 0.0023832058820488743, "rougeL_precision": 0.04368080562194709, "rougeL_precision_stderr": 0.002762386413099355, "rougeL_recall": 0.04437717860143795, "rougeL_recall_stderr": 0.0028397295075813945, "rougeLsum_fmeasure": 0.04062836833112145, "rougeLsum_fmeasure_stderr": 0.0024442501537786706, "rougeLsum_precision": 0.04449065152016272, "rougeLsum_precision_stderr": 0.002797505308089145, "rougeLsum_recall": 0.04583958323736499, "rougeLsum_recall_stderr": 0.0029805207884232624}}, "5": {"article_DOC_summary": {"bleu": 1.5974310709826073e-21, "bleu_stderr": 1.3100992922519676e-15, "rouge1_fmeasure": 0.0028901136641309804, "rouge1_fmeasure_stderr": 0.0007628453825537149, "rouge1_precision": 0.003544629362742575, "rouge1_precision_stderr": 0.0009485977995739016, "rouge1_recall": 0.0037550498645595996, "rouge1_recall_stderr": 0.0011352337360716696, "rouge2_fmeasure": 0.0007826161154573944, "rouge2_fmeasure_stderr": 0.0003313480806525455, "rouge2_precision": 0.0006426177000561377, "rouge2_precision_stderr": 0.00026640442181651204, "rouge2_recall": 0.0013262549819153596, "rouge2_recall_stderr": 0.0006559432301667889, "rougeL_fmeasure": 0.002389689764554573, "rougeL_fmeasure_stderr": 0.0006340237380334276, "rougeL_precision": 0.002882853322968734, "rougeL_precision_stderr": 0.0007611952222133634, "rougeL_recall": 0.003139944055164728, "rougeL_recall_stderr": 0.0009749927515076235, "rougeLsum_fmeasure": 0.00241629675514098, "rougeLsum_fmeasure_stderr": 0.0006494890213851666, "rougeLsum_precision": 0.002902333778437242, "rougeLsum_precision_stderr": 0.0007672460026879215, "rougeLsum_recall": 0.0031769694879492582, "rougeLsum_recall_stderr": 0.0010086213776490023}}}} \ No newline at end of file diff --git a/evaluation/generation/slim.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_0.json b/evaluation/generation/slim.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_0.json new file mode 100644 index 0000000000000000000000000000000000000000..899a67f6e48e1566509caec6f138cbf3da616f0a --- /dev/null +++ b/evaluation/generation/slim.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 
0.25518818499117035, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.020198929979072126 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.06424858701254539, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.002540662604602708 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.24194520608398293, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.005061888078063388 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.08713006240052369, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0021163575152811052 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.029250897601530847, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0016159150314360084 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.11213489324648325, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0031431788617556264 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.03968682582435309, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0012864152483850072 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.061593275111588554, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0024488839830169046 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.2334118422350547, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0048787714459462665 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.08351980735691193, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.001978263321627184 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.06070440310915625, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0024423712896334553 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.22684155273254836, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0046468714919551185 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.08198324173226955, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0019772453103279655 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep2/scratch/project_462000241/muennighoff/checkpoints_2b855boscartasky/transformers,use_accelerate=True,tokenizer=/pfs/lustrep2/scratch/project_462000241/muennighoff/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 4, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/evaluation/generation/slim.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_1.json b/evaluation/generation/slim.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_1.json new file mode 100644 index 0000000000000000000000000000000000000000..8afd18884ccbf3cd9859773d9bf4677059b90559 --- /dev/null +++ b/evaluation/generation/slim.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.37770333810014706, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.036043662376688344 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.08688444048786848, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0030316834829975468 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.25231836055300055, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004707834740663268 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.11029589045724046, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0026302378640336893 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.039888024685296904, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0019445292240548378 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.12320682798737367, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0031690473734229127 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.050696350493553845, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.001601172935799986 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.08067935525490526, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0028191568459744356 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.2400568623297162, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004443711432592927 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.10289898300711169, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.002349648102435476 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.081852061098128, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0028519921028834602 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.24140934346009968, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0044596632225301245 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.10416303503161863, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0024045282990398016 + } 
+ ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep2/scratch/project_462000241/muennighoff/checkpoints_2b855boscartasky/transformers,use_accelerate=True,tokenizer=/pfs/lustrep2/scratch/project_462000241/muennighoff/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 4, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/evaluation/generation/slim.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_2.json b/evaluation/generation/slim.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_2.json new file mode 100644 index 0000000000000000000000000000000000000000..b0faab72aba03c845e002ef24656cf188f7acc13 --- /dev/null +++ b/evaluation/generation/slim.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.3829018871876502, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.022682302500048476 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.09533308838612936, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0031918959249739934 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.275555629679834, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004727965781775394 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.12236842718754035, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0030225942233901805 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.04438815526628443, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0018839731929603098 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.13438684673727228, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0032179061201762227 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.05746066096268111, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0018708723435892534 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.08478065899787014, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0026474348060842394 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.25951028904964774, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004388247219710807 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.11131192366136156, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0025600729892227623 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.0870012622715152, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.002779405775352401 + }, + { + "task_name": 
"GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.2612564683466686, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004413760869764198 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.11327636810594607, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.00265570207245623 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep2/scratch/project_462000241/muennighoff/checkpoints_2b855boscartasky/transformers,use_accelerate=True,tokenizer=/pfs/lustrep2/scratch/project_462000241/muennighoff/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 4, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/evaluation/generation/slim.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_3.json b/evaluation/generation/slim.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_3.json new file mode 100644 index 0000000000000000000000000000000000000000..733c35e748c07149884abf8b8509dcc1b10a7742 --- /dev/null +++ b/evaluation/generation/slim.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.49438795846003614, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.03220480694116766 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.11102978394749591, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0037258981840235512 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.29324047704391387, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004929158066657188 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.13661508516896614, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0033865188061428593 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.05489349981744968, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.002411444050666127 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.1460318403788494, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0034080826963901003 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.06675560716724324, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.002152759177258534 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.0983510025951489, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0031311578361653356 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.27396911503915883, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004527803475867941 + }, + { + "task_name": "GEM/web_nlg_en", + 
"prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.12331965784710698, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0028547373168783547 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.10118232307247621, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0032732569349238874 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.2768278339741749, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004591647175413129 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.1258905544050262, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0029667713047718076 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep2/scratch/project_462000241/muennighoff/checkpoints_2b855boscartasky/transformers,use_accelerate=True,tokenizer=/pfs/lustrep2/scratch/project_462000241/muennighoff/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 4, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/evaluation/generation/slim.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_4.json b/evaluation/generation/slim.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_4.json new file mode 100644 index 0000000000000000000000000000000000000000..8b1ee2893eacf1b51b9e83f9079eee79304017dd --- /dev/null +++ b/evaluation/generation/slim.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.5097531919668136, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.037846557427194026 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.12660445313737514, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.004069129304355684 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.30584134713679634, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004938665654397721 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.15051290928420963, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0035962245168251086 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.06380697763757867, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0025616190740023713 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.1531395402953802, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0033953087678648087 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.07435510796633933, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.002254816031736983 + }, + { + "task_name": "GEM/web_nlg_en", + 
"prompt_name": "PALM_prompt", + "rougeL_precision": 0.11139297723344309, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.003440135146254942 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.28386785189457364, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.00447862149321582 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.13483336166859466, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0030388433299652576 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.11506907251192842, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0036096573854085593 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.287989283624223, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004567845010036214 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.13843888381409075, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0031866033556944046 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep2/scratch/project_462000241/muennighoff/checkpoints_2b855boscartasky/transformers,use_accelerate=True,tokenizer=/pfs/lustrep2/scratch/project_462000241/muennighoff/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 4, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/evaluation/generation/slim.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_5.json b/evaluation/generation/slim.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_5.json new file mode 100644 index 0000000000000000000000000000000000000000..24fb9e869f68c2529fc19ba9c81d51fefff438a7 --- /dev/null +++ b/evaluation/generation/slim.checkpoints_2b855boscartasky_GEM-web_nlg_en_PALM_prompt_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.5493345438208933, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.03156507075706306 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.13145167799279456, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.004208017486529403 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.31431356632933516, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004912309978055554 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.15523792842972664, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.003705043227083823 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.06682039356587426, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0026734322118555824 + }, + { + "task_name": "GEM/web_nlg_en", + 
"prompt_name": "PALM_prompt", + "rouge2_recall": 0.1593483911606193, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.003488245506035694 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.07729991696510335, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.002362198278661277 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.11521584250075316, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0035483893462582828 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.2911824630349328, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.00449934920077821 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.13867709585369095, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0031268375342873036 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.11975844765120187, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0037510579951047463 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.29608094482109, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.00457219328093606 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.1428199679819932, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0032823766236227112 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep2/scratch/project_462000241/muennighoff/checkpoints_2b855boscartasky/transformers,use_accelerate=True,tokenizer=/pfs/lustrep2/scratch/project_462000241/muennighoff/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 4, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/evaluation/generation/slim.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_0.json b/evaluation/generation/slim.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..fead82bca5365812570f1a4223a4e01966afd9b8 --- /dev/null +++ b/evaluation/generation/slim.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.15983607856511065, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0025552324547594915 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.22751168503589744, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0030756037547284122 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.17176517291666785, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.002329017824094656 + }, + { + "task_name": 
"GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.03630672989355793, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0010196024456578525 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.052475014894037275, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.001428315830366114 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.03932636998031157, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0009987340137274396 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.12519971826411405, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0019474345957730163 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.1828331784278453, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002460695997235263 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.13522236510382263, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.001715199895595129 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.1479787200014008, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0024084994322680275 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.21009537033174597, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0028681840700951855 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.15863624899200707, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0021698433809270567 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 2.2908341082810773, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.08591697554573632 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep2/scratch/project_462000241/muennighoff/checkpoints_2b855boscartasky/transformers,use_accelerate=True,tokenizer=/pfs/lustrep2/scratch/project_462000241/muennighoff/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 4, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/evaluation/generation/slim.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_1.json b/evaluation/generation/slim.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..71a202cf9f6c2e23bb5470a14503427d59bc2754 --- /dev/null +++ b/evaluation/generation/slim.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.2074831448217252, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 
0.003524413251335538 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.1603567541864711, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002358874310291858 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.1500600406332044, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0019241989868486221 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.048193206661683564, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0020379873823770994 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.030329812370434826, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0011532237790344163 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.02928962923956642, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.000993776980137765 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.16593034599476272, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.002952779433549126 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.1263395938894177, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0018438888158069336 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.11776814806946785, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0014763519012599205 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.19732631307428722, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0034023899501457837 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.15136030413475798, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0021926321158690864 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.14183665338988743, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.001800224756397767 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 1.8024864455011091, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.09003925624860501 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep2/scratch/project_462000241/muennighoff/checkpoints_2b855boscartasky/transformers,use_accelerate=True,tokenizer=/pfs/lustrep2/scratch/project_462000241/muennighoff/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 4, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/evaluation/generation/slim.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_2.json 
b/evaluation/generation/slim.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..c5a422ddef163dd0bf03c24a9ef7b70f3fffbe94 --- /dev/null +++ b/evaluation/generation/slim.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.28596147697474456, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0040250344618331995 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.1993639467154353, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002664319477833243 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.1947292213306491, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0021918313174583916 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.08090209451722337, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0024314880977671276 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.05022891340202324, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0013490243270254702 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.049895782057071895, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0012048654152258728 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.22915557741483644, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0033926337363906784 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.15804196664911024, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002108622273540943 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.15366314427734978, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0016931718188903863 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.2703085422322766, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.003864247203591127 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.18781780308286916, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0025166845226557905 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.18325876600451932, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0020600409806209663 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 2.9479127015875357, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.07796616677370145 + } + ], + "config": { + "model": "hf-causal", 
+ "model_args": "pretrained=/pfs/lustrep2/scratch/project_462000241/muennighoff/checkpoints_2b855boscartasky/transformers,use_accelerate=True,tokenizer=/pfs/lustrep2/scratch/project_462000241/muennighoff/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 4, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/evaluation/generation/slim.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_3.json b/evaluation/generation/slim.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..dd15f737ad78402fc7ae994d0c319a85a3dfd4cd --- /dev/null +++ b/evaluation/generation/slim.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.26271995584299296, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.004350571081047754 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.17195910512668672, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0028930003693202434 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.17047407122919256, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0024381191437714608 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.07716157594080365, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0025215008865493967 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.04592292328627891, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0014011960521198414 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.04580407896668463, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.001240136840680315 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.2129040576929948, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0036832280907527527 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.13748507108736507, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0023122364311355848 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.1358429496475672, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0019120675392480628 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.24858343528470517, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0041894189677485535 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.1614787885865035, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 
0.0027151405967413447 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.16017361729618249, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0022880671169300927 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 2.368296526557304, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.06715896563436964 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep2/scratch/project_462000241/muennighoff/checkpoints_2b855boscartasky/transformers,use_accelerate=True,tokenizer=/pfs/lustrep2/scratch/project_462000241/muennighoff/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 4, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/evaluation/generation/slim.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_4.json b/evaluation/generation/slim.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..48cec1ae8894e63fec45fd9a03f72c502a5ac575 --- /dev/null +++ b/evaluation/generation/slim.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.09226808587029187, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0035951645035662144 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.05378214914394185, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0021922619377159835 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.05527677687040673, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.002075334915597304 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.028532578970617953, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0018587749480454441 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.01548143136364689, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0010135388288224974 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.015865483046525946, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0009374894411648104 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.07725545552877301, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0031119081702266804 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.04393223408477368, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0018056545763819988 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.04511311086771814, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + 
"rougeL_fmeasure_stderr": 0.0016959499204821453 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.08723167275286887, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.003438050373646056 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.05010740359653005, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0020446842446827317 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.05162745766582887, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0019361443382161162 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 0.03702293319391888, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.008280750551799155 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep2/scratch/project_462000241/muennighoff/checkpoints_2b855boscartasky/transformers,use_accelerate=True,tokenizer=/pfs/lustrep2/scratch/project_462000241/muennighoff/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 4, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/evaluation/generation/slim.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_5.json b/evaluation/generation/slim.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..25a16d859abdd87d41b0b77f99ec6495bb4327e7 --- /dev/null +++ b/evaluation/generation/slim.checkpoints_2b855boscartasky_GEM-wiki_lingua_en_tldr_en_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.01624748312843043, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0017343102214544073 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.008973352017838115, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0009874748509305067 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.008715725437140403, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0008716993431018379 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.005494197830878659, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.000991214742335091 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.002558561959956041, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0003855003265324329 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.002496646429114283, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.00036357062963770885 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.014045751930703699, + "dataset_path": 
"GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0015619179482028879 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.0075965284642917335, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.000846021103064854 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.0072939865630402565, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0007275988588633965 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.015553958884576207, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.001678423693533492 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.008515152310636652, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0009413374007224053 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.008294111648957665, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0008329997418165786 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 6.329549235648946e-15, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 6.98643056969485e-14 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep2/scratch/project_462000241/muennighoff/checkpoints_2b855boscartasky/transformers,use_accelerate=True,tokenizer=/pfs/lustrep2/scratch/project_462000241/muennighoff/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 4, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/evaluation/generation/slim.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_0.json b/evaluation/generation/slim.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_0.json new file mode 100644 index 0000000000000000000000000000000000000000..8fe31219456c71826bcb7efde3640348f94db2c4 --- /dev/null +++ b/evaluation/generation/slim.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 3.4385508172058055, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.04264456511068813 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.3745168009518315, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0054770341977520625 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.29958589610084396, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0033801849814915222 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.2767986999806955, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0032477671776355997 + }, + { + 
"task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.20297808777795995, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.005522822982953256 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.10216853633525805, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0019971383737075166 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.097088577058388, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0017064511604233982 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.29931908976323446, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.005159547193434527 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.2284780514712017, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0023634097297777477 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.20603167711573117, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.002072804178131474 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.34463645738569776, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.005423162557411763 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.2657387698192473, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.003165191193253911 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.2467760500253612, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.00298962192866607 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep2/scratch/project_462000241/muennighoff/checkpoints_2b855boscartasky/transformers,use_accelerate=True,tokenizer=/pfs/lustrep2/scratch/project_462000241/muennighoff/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 4, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/evaluation/generation/slim.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_1.json b/evaluation/generation/slim.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_1.json new file mode 100644 index 0000000000000000000000000000000000000000..71694612287b1ba8742d8900b6a2c13d140c75c1 --- /dev/null +++ b/evaluation/generation/slim.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 11.578509877350227, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.10981566633879294 + }, + { + "task_name": "e2e_nlg_cleaned", + 
"prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.5290497807915124, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0034031031148912877 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.42449982582372703, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0029796948529652516 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.4430699483900025, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0024212750351764543 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.24509073269483966, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0026669698385618284 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.19440129139428716, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0021684401687693156 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.2028107162678877, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0020310061478079093 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.38881065801490833, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0029605217311991657 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.31090092383731366, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002452100105085375 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.324119071682265, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0020761088086072486 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.43465610390683307, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0032381620072339475 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.348028905687337, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.00275713194366831 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.36322533452022043, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002355969259137101 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep2/scratch/project_462000241/muennighoff/checkpoints_2b855boscartasky/transformers,use_accelerate=True,tokenizer=/pfs/lustrep2/scratch/project_462000241/muennighoff/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 4, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/evaluation/generation/slim.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_2.json b/evaluation/generation/slim.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_2.json new file mode 100644 index 0000000000000000000000000000000000000000..0aa21797d0bdce7a245a7acfb74eefcb0c0f7994 --- /dev/null +++ b/evaluation/generation/slim.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 13.973742807059493, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.21933451006645593 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.5463581275257614, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0033109746053900106 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.4599780026221357, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002866451132503217 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.4734319432946387, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.002277730687363045 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.26984908227570326, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0026581113600308783 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.22503111854291266, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0022246934402712576 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.23112076704320592, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.002029026468778622 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.40847493701179866, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0029328594326759366 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.3432828614570734, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0024768383332246274 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.35285117962076507, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.002056984438278394 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.4597128692889795, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.003192730444475765 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.38674562932968487, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002735034124705561 + 
}, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.3979842693641427, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002298871731116783 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep2/scratch/project_462000241/muennighoff/checkpoints_2b855boscartasky/transformers,use_accelerate=True,tokenizer=/pfs/lustrep2/scratch/project_462000241/muennighoff/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 4, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/evaluation/generation/slim.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_3.json b/evaluation/generation/slim.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_3.json new file mode 100644 index 0000000000000000000000000000000000000000..f329a888a64b20389cdc9a64a333233f30f9a5db --- /dev/null +++ b/evaluation/generation/slim.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 14.533287312181702, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.18656720619703301 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.5451265264934796, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.003147288201601649 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.4635452060391057, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002820441604942112 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.478142190779417, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.002235263328285793 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.2714328734244194, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.002583914662030242 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.23027470724704344, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0022949497645592134 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.23634052735419925, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0020601993739155743 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.40976835157500135, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.002821032707227765 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.3479477017249916, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002470400553707357 + }, + { + "task_name": "e2e_nlg_cleaned", + 
"prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.3585015271693414, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.002048646341232902 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.46121561084482127, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.003066679903899239 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.3924920651465138, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0027314711241234945 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.404637798062588, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002300343226701871 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep2/scratch/project_462000241/muennighoff/checkpoints_2b855boscartasky/transformers,use_accelerate=True,tokenizer=/pfs/lustrep2/scratch/project_462000241/muennighoff/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 4, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/evaluation/generation/slim.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_4.json b/evaluation/generation/slim.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_4.json new file mode 100644 index 0000000000000000000000000000000000000000..e974a06875bff953f656d2b43f738cb831ea77f5 --- /dev/null +++ b/evaluation/generation/slim.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 14.788126595166384, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.18690040009176595 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.5482226243244983, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0032124524377117303 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.46345359030928057, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0028329342309959572 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.48045922670165075, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.002311569925954736 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.27446812632372075, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0026299321970613995 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.2307927377768704, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002303932783753952 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": 
"generate_text_restaurant", + "rouge2_fmeasure": 0.23850438072833094, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.002104880750018495 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.41268229797839345, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.002885772570892295 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.34835409226615954, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0024733395312169725 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.36091724958116495, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0021212744636126225 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.46563464741217186, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0031424166236039795 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.39431106312311454, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0027657116569750795 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.4083787027633123, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002378318389395946 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep2/scratch/project_462000241/muennighoff/checkpoints_2b855boscartasky/transformers,use_accelerate=True,tokenizer=/pfs/lustrep2/scratch/project_462000241/muennighoff/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 4, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/evaluation/generation/slim.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_5.json b/evaluation/generation/slim.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_5.json new file mode 100644 index 0000000000000000000000000000000000000000..35669a61e5dcbda114beee81bf391da1cab44e6e --- /dev/null +++ b/evaluation/generation/slim.checkpoints_2b855boscartasky_e2e_nlg_cleaned_generate_text_restaurant_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 14.816898114308659, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.2578943709075645 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.5463965076188966, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0031073173050291266 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.464786934938444, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0027602961923400638 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + 
"rouge1_fmeasure": 0.4813258409710542, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0022155647522670184 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.2741684381980198, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.002611976915331882 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.23130694014233213, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002261839435649896 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.23912604624839506, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.002085017928004898 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.4127791623423059, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0027864194329252517 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.35037713824014954, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002413674972879658 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.3627614127879427, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.002049692110741263 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.4665920267581706, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0030279579394666637 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.3970310588904633, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0026745772949146518 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.41096147149083767, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0022723564955560572 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep2/scratch/project_462000241/muennighoff/checkpoints_2b855boscartasky/transformers,use_accelerate=True,tokenizer=/pfs/lustrep2/scratch/project_462000241/muennighoff/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 4, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/evaluation/generation/slim.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_0.json b/evaluation/generation/slim.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_0.json new file mode 100644 index 0000000000000000000000000000000000000000..5bce7e6da54e0e349d63ba5520e026765dc5f3c0 --- /dev/null +++ b/evaluation/generation/slim.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.13515325881990387, + "dataset_path": 
"GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.001988006386809129 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.3082267363959881, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004580722609646003 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.18486846991878378, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0026472370081744873 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.0272859934174315, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.000998594442193996 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.06551213025624243, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0024545590859311856 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.03791043043165935, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0013771627135441805 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.10194380290336473, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0014645033080314515 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.23262788209228538, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.003415563039987054 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.13935422276606838, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0019388171493750941 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.10728605862144046, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0016390380124941084 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.24560801323409917, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0038465518145899083 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.14683463366193603, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0021884358384104898 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 1.42436283894258, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.0809652794130641 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep2/scratch/project_462000241/muennighoff/checkpoints_2b855boscartasky/transformers,use_accelerate=True,tokenizer=/pfs/lustrep2/scratch/project_462000241/muennighoff/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 4, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/evaluation/generation/slim.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_1.json 
b/evaluation/generation/slim.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_1.json new file mode 100644 index 0000000000000000000000000000000000000000..99f08ebb856784932f6b2731b5b50690578d72d5 --- /dev/null +++ b/evaluation/generation/slim.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.18034557721867206, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.003378966024270665 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.25165661054563454, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004219363793342944 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.19377928742866415, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0029318979888479386 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.03174129360340327, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0016980099731496725 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.04525380270312792, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0021325289082326157 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.03391705069355955, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.001600827686299807 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.1348915392123015, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.002633227095382269 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.18737022577489543, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0031416529379445714 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.14433798493238226, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0022145818855320775 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.1387175358944472, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0026549430237383285 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.19658579721494118, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003531925441008354 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.14964383560266273, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002331396812470961 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 1.5072643038464826, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.07581189432866778 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep2/scratch/project_462000241/muennighoff/checkpoints_2b855boscartasky/transformers,use_accelerate=True,tokenizer=/pfs/lustrep2/scratch/project_462000241/muennighoff/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 4, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/evaluation/generation/slim.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_2.json b/evaluation/generation/slim.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_2.json new file mode 100644 index 0000000000000000000000000000000000000000..c60b9194a317b4913205810ab78c9e902fbf116d --- /dev/null +++ b/evaluation/generation/slim.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.22068687589610897, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.003789951247255284 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.25078858709670043, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0039415090299140245 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.21899687156092093, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0031785578491904183 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.04612059160046461, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.00216246272019255 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.05136702902319479, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.002315418839001681 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.04511500976844758, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.001986723782931247 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.1666654493818604, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.002931741970841584 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.19015440175110393, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0030722371403492073 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.16528451206309597, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.002448179539908828 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.1684803863889924, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0029515578981164007 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.19342460055959376, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0032586706636470127 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + 
"rougeLsum_fmeasure": 0.16744440121325288, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0024991880630578747 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 2.135517530489009, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.09395977154683074 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep2/scratch/project_462000241/muennighoff/checkpoints_2b855boscartasky/transformers,use_accelerate=True,tokenizer=/pfs/lustrep2/scratch/project_462000241/muennighoff/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 4, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/evaluation/generation/slim.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_3.json b/evaluation/generation/slim.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_3.json new file mode 100644 index 0000000000000000000000000000000000000000..f82ebb590125edc0cc7a1ab75f12a91f5b7bcb8e --- /dev/null +++ b/evaluation/generation/slim.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.21755397454942801, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0040197643665198 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.24395196225361002, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0041819563633223285 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.21441285271309635, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0034688162829816867 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.04608502015312911, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0021286532637906437 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.05204692726094356, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.002308596850566173 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.0455324464868029, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0019857349620838816 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.16247030281934602, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0031264236020145395 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.18178458588619653, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0031938440097234347 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.159773564475991, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.002683043459624172 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.16495789870371907, + 
"dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0031401120246905712 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.18683194228664585, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0034187194500331993 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.16297000957614646, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002742910817454915 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 2.306953746485347, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.08640815655966855 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep2/scratch/project_462000241/muennighoff/checkpoints_2b855boscartasky/transformers,use_accelerate=True,tokenizer=/pfs/lustrep2/scratch/project_462000241/muennighoff/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 4, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/evaluation/generation/slim.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_4.json b/evaluation/generation/slim.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_4.json new file mode 100644 index 0000000000000000000000000000000000000000..6a0d18a5608752f9037eeebf2b2492affbad2c6a --- /dev/null +++ b/evaluation/generation/slim.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.05735338761210453, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0035071456355189675 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.059021496533637884, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0036712685384006774 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.05269608047169612, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.003098859012482462 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.010781643643676288, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0011947055672951296 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.01223156205613527, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0013826375879066437 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.010458434372266581, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0011178203055454239 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.04368080562194709, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.002762386413099355 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.04437717860143795, + "dataset_path": "GEM/xsum", + 
"dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0028397295075813945 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.03966278292494188, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0023832058820488743 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.04449065152016272, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.002797505308089145 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.04583958323736499, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0029805207884232624 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.04062836833112145, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0024442501537786706 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 0.21661324901347828, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.07403289458610106 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep2/scratch/project_462000241/muennighoff/checkpoints_2b855boscartasky/transformers,use_accelerate=True,tokenizer=/pfs/lustrep2/scratch/project_462000241/muennighoff/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 4, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/evaluation/generation/slim.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_5.json b/evaluation/generation/slim.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_5.json new file mode 100644 index 0000000000000000000000000000000000000000..c9539330998c2c3b9435add83fb2063c78aff9d5 --- /dev/null +++ b/evaluation/generation/slim.checkpoints_2b855boscartasky_gem_xsum_article_DOC_summary_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.003544629362742575, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0009485977995739016 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.0037550498645595996, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0011352337360716696 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.0028901136641309804, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0007628453825537149 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.0006426177000561377, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.00026640442181651204 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.0013262549819153596, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0006559432301667889 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.0007826161154573944, + "dataset_path": "GEM/xsum", + "dataset_name": null, 
+ "subset": "", + "rouge2_fmeasure_stderr": 0.0003313480806525455 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.002882853322968734, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0007611952222133634 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.003139944055164728, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0009749927515076235 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.002389689764554573, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0006340237380334276 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.002902333778437242, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0007672460026879215 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.0031769694879492582, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0010086213776490023 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.00241629675514098, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0006494890213851666 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 1.5974310709826073e-21, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 1.3100992922519676e-15 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep2/scratch/project_462000241/muennighoff/checkpoints_2b855boscartasky/transformers,use_accelerate=True,tokenizer=/pfs/lustrep2/scratch/project_462000241/muennighoff/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 4, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/evaluation/rankeval/checkpoints_2b855boscartasky_0.csv b/evaluation/rankeval/checkpoints_2b855boscartasky_0.csv new file mode 100644 index 0000000000000000000000000000000000000000..40eeffb69152a5662d6fa8201f6078c63efd9fa9 --- /dev/null +++ b/evaluation/rankeval/checkpoints_2b855boscartasky_0.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.335,0.01493311749093257,0 +anli_r2,acc,0.336,0.014944140233795023,0 +anli_r3,acc,0.3575,0.013840921245257796,0 +arc_challenge,acc,0.23378839590443687,0.012368225378507142,0 +arc_challenge,acc_norm,0.2525597269624573,0.012696728980207704,0 +arc_easy,acc,0.5353535353535354,0.010234104543411433,0 +arc_easy,acc_norm,0.48484848484848486,0.010255071794531502,0 +boolq,acc,0.6039755351681957,0.008553881336813417,1 +cb,acc,0.39285714285714285,0.0658538889806635,1 +cb,f1,0.2829428067523306,,1 +copa,acc,0.67,0.04725815626252607,0 +hellaswag,acc,0.35012945628360886,0.004760354191370866,0 +hellaswag,acc_norm,0.41983668591913964,0.004925233680511587,0 +piqa,acc,0.6871599564744287,0.010817714425701107,0 +piqa,acc_norm,0.6920565832426551,0.010770892367463671,0 +rte,acc,0.44765342960288806,0.02993107036293953,0 +sciq,acc,0.805,0.012535235623319327,0 +sciq,acc_norm,0.722,0.014174516461485254,0 +storycloze_2016,acc,0.6312132549438803,0.011157191913955236,0 
+winogrande,acc,0.5019731649565904,0.014052376259225632,0 diff --git a/evaluation/rankeval/checkpoints_2b855boscartasky_0.json b/evaluation/rankeval/checkpoints_2b855boscartasky_0.json new file mode 100644 index 0000000000000000000000000000000000000000..b47b7895b76906580cad7f0fd71b4f44191b3b04 --- /dev/null +++ b/evaluation/rankeval/checkpoints_2b855boscartasky_0.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.335, + "acc_stderr": 0.01493311749093257 + }, + "anli_r2": { + "acc": 0.336, + "acc_stderr": 0.014944140233795023 + }, + "anli_r3": { + "acc": 0.3575, + "acc_stderr": 0.013840921245257796 + }, + "cb": { + "acc": 0.39285714285714285, + "acc_stderr": 0.0658538889806635, + "f1": 0.2829428067523306 + }, + "copa": { + "acc": 0.67, + "acc_stderr": 0.04725815626252607 + }, + "hellaswag": { + "acc": 0.35012945628360886, + "acc_stderr": 0.004760354191370866, + "acc_norm": 0.41983668591913964, + "acc_norm_stderr": 0.004925233680511587 + }, + "rte": { + "acc": 0.44765342960288806, + "acc_stderr": 0.02993107036293953 + }, + "winogrande": { + "acc": 0.5019731649565904, + "acc_stderr": 0.014052376259225632 + }, + "storycloze_2016": { + "acc": 0.6312132549438803, + "acc_stderr": 0.011157191913955236 + }, + "boolq": { + "acc": 0.6039755351681957, + "acc_stderr": 0.008553881336813417 + }, + "arc_easy": { + "acc": 0.5353535353535354, + "acc_stderr": 0.010234104543411433, + "acc_norm": 0.48484848484848486, + "acc_norm_stderr": 0.010255071794531502 + }, + "arc_challenge": { + "acc": 0.23378839590443687, + "acc_stderr": 0.012368225378507142, + "acc_norm": 0.2525597269624573, + "acc_norm_stderr": 0.012696728980207704 + }, + "sciq": { + "acc": 0.805, + "acc_stderr": 0.012535235623319327, + "acc_norm": 0.722, + "acc_norm_stderr": 0.014174516461485254 + }, + "piqa": { + "acc": 0.6871599564744287, + "acc_stderr": 0.010817714425701107, + "acc_norm": 0.6920565832426551, + "acc_norm_stderr": 0.010770892367463671 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/evaluation/rankeval/checkpoints_2b855boscartasky_1.csv b/evaluation/rankeval/checkpoints_2b855boscartasky_1.csv new file mode 100644 index 0000000000000000000000000000000000000000..3ed25f23cf18368759ce53a0fc8891c7b1d098cf --- /dev/null +++ b/evaluation/rankeval/checkpoints_2b855boscartasky_1.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.347,0.015060472031706624,0 +anli_r2,acc,0.325,0.014818724459095527,0 +anli_r3,acc,0.36083333333333334,0.013869180252444864,0 +arc_challenge,acc,0.257679180887372,0.012780770562768405,0 +arc_challenge,acc_norm,0.28071672354948807,0.013131238126975574,0 +arc_easy,acc,0.5542929292929293,0.01019911818332299,0 +arc_easy,acc_norm,0.5223063973063973,0.010249568404555652,0 +boolq,acc,0.5929663608562691,0.008592562887068871,1 +cb,acc,0.4642857142857143,0.06724777654937658,1 +cb,f1,0.2842465753424657,,1 +copa,acc,0.62,0.04878317312145632,0 +hellaswag,acc,0.3514240191196973,0.0047643939851110305,0 +hellaswag,acc_norm,0.42859988050189207,0.004938643787869535,0 +piqa,acc,0.6871599564744287,0.010817714425701104,0 +piqa,acc_norm,0.6828073993471164,0.01085815545438087,0 +rte,acc,0.4548736462093863,0.029973636495415252,0 +sciq,acc,0.873,0.010534798620855743,0 +sciq,acc_norm,0.854,0.011171786285496497,0 
+storycloze_2016,acc,0.6264029930518439,0.011186849693644694,0 +winogrande,acc,0.5122336227308603,0.01404827882040562,0 diff --git a/evaluation/rankeval/checkpoints_2b855boscartasky_1.json b/evaluation/rankeval/checkpoints_2b855boscartasky_1.json new file mode 100644 index 0000000000000000000000000000000000000000..c1a210921c98e3d62bd6c75bfed0c47aec42e6de --- /dev/null +++ b/evaluation/rankeval/checkpoints_2b855boscartasky_1.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.347, + "acc_stderr": 0.015060472031706624 + }, + "anli_r2": { + "acc": 0.325, + "acc_stderr": 0.014818724459095527 + }, + "anli_r3": { + "acc": 0.36083333333333334, + "acc_stderr": 0.013869180252444864 + }, + "cb": { + "acc": 0.4642857142857143, + "acc_stderr": 0.06724777654937658, + "f1": 0.2842465753424657 + }, + "copa": { + "acc": 0.62, + "acc_stderr": 0.04878317312145632 + }, + "hellaswag": { + "acc": 0.3514240191196973, + "acc_stderr": 0.0047643939851110305, + "acc_norm": 0.42859988050189207, + "acc_norm_stderr": 0.004938643787869535 + }, + "rte": { + "acc": 0.4548736462093863, + "acc_stderr": 0.029973636495415252 + }, + "winogrande": { + "acc": 0.5122336227308603, + "acc_stderr": 0.01404827882040562 + }, + "storycloze_2016": { + "acc": 0.6264029930518439, + "acc_stderr": 0.011186849693644694 + }, + "boolq": { + "acc": 0.5929663608562691, + "acc_stderr": 0.008592562887068871 + }, + "arc_easy": { + "acc": 0.5542929292929293, + "acc_stderr": 0.01019911818332299, + "acc_norm": 0.5223063973063973, + "acc_norm_stderr": 0.010249568404555652 + }, + "arc_challenge": { + "acc": 0.257679180887372, + "acc_stderr": 0.012780770562768405, + "acc_norm": 0.28071672354948807, + "acc_norm_stderr": 0.013131238126975574 + }, + "sciq": { + "acc": 0.873, + "acc_stderr": 0.010534798620855743, + "acc_norm": 0.854, + "acc_norm_stderr": 0.011171786285496497 + }, + "piqa": { + "acc": 0.6871599564744287, + "acc_stderr": 0.010817714425701104, + "acc_norm": 0.6828073993471164, + "acc_norm_stderr": 0.01085815545438087 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/evaluation/rankeval/checkpoints_2b855boscartasky_2.csv b/evaluation/rankeval/checkpoints_2b855boscartasky_2.csv new file mode 100644 index 0000000000000000000000000000000000000000..67be0eac6a99dfb2c7ba7b865a5e91df29765951 --- /dev/null +++ b/evaluation/rankeval/checkpoints_2b855boscartasky_2.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.327,0.014842213153411245,0 +anli_r2,acc,0.338,0.01496596071022449,0 +anli_r3,acc,0.3358333333333333,0.01363926119093289,0 +arc_challenge,acc,0.25,0.012653835621466646,0 +arc_challenge,acc_norm,0.28071672354948807,0.013131238126975576,0 +arc_easy,acc,0.5547138047138047,0.010198171137873871,0 +arc_easy,acc_norm,0.5568181818181818,0.010193324837773498,0 +boolq,acc,0.590519877675841,0.008600549751320923,1 +cb,acc,0.48214285714285715,0.0673769750864465,1 +cb,f1,0.28584474885844746,,1 +copa,acc,0.66,0.04760952285695237,0 +hellaswag,acc,0.35152360087631945,0.004764703145680279,0 +hellaswag,acc_norm,0.42620991834295957,0.004935143791573811,0 +piqa,acc,0.690424374319913,0.010786656752183345,0 +piqa,acc_norm,0.6849836779107725,0.010838072746240653,0 +rte,acc,0.47653429602888087,0.030063300411902652,0 +sciq,acc,0.88,0.010281328012747398,0 +sciq,acc_norm,0.873,0.010534798620855759,0 
+storycloze_2016,acc,0.6264029930518439,0.011186849693644694,0 +winogrande,acc,0.5138121546961326,0.014047122916440419,0 diff --git a/evaluation/rankeval/checkpoints_2b855boscartasky_2.json b/evaluation/rankeval/checkpoints_2b855boscartasky_2.json new file mode 100644 index 0000000000000000000000000000000000000000..467a181cfa26bc618cde2a7242a55f0e7b9a2331 --- /dev/null +++ b/evaluation/rankeval/checkpoints_2b855boscartasky_2.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.327, + "acc_stderr": 0.014842213153411245 + }, + "anli_r2": { + "acc": 0.338, + "acc_stderr": 0.01496596071022449 + }, + "anli_r3": { + "acc": 0.3358333333333333, + "acc_stderr": 0.01363926119093289 + }, + "cb": { + "acc": 0.48214285714285715, + "acc_stderr": 0.0673769750864465, + "f1": 0.28584474885844746 + }, + "copa": { + "acc": 0.66, + "acc_stderr": 0.04760952285695237 + }, + "hellaswag": { + "acc": 0.35152360087631945, + "acc_stderr": 0.004764703145680279, + "acc_norm": 0.42620991834295957, + "acc_norm_stderr": 0.004935143791573811 + }, + "rte": { + "acc": 0.47653429602888087, + "acc_stderr": 0.030063300411902652 + }, + "winogrande": { + "acc": 0.5138121546961326, + "acc_stderr": 0.014047122916440419 + }, + "storycloze_2016": { + "acc": 0.6264029930518439, + "acc_stderr": 0.011186849693644694 + }, + "boolq": { + "acc": 0.590519877675841, + "acc_stderr": 0.008600549751320923 + }, + "arc_easy": { + "acc": 0.5547138047138047, + "acc_stderr": 0.010198171137873871, + "acc_norm": 0.5568181818181818, + "acc_norm_stderr": 0.010193324837773498 + }, + "arc_challenge": { + "acc": 0.25, + "acc_stderr": 0.012653835621466646, + "acc_norm": 0.28071672354948807, + "acc_norm_stderr": 0.013131238126975576 + }, + "sciq": { + "acc": 0.88, + "acc_stderr": 0.010281328012747398, + "acc_norm": 0.873, + "acc_norm_stderr": 0.010534798620855759 + }, + "piqa": { + "acc": 0.690424374319913, + "acc_stderr": 0.010786656752183345, + "acc_norm": 0.6849836779107725, + "acc_norm_stderr": 0.010838072746240653 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/evaluation/rankeval/checkpoints_2b855boscartasky_3.csv b/evaluation/rankeval/checkpoints_2b855boscartasky_3.csv new file mode 100644 index 0000000000000000000000000000000000000000..090da2ebd2424447fb4d12ae6338c960d2d2343e --- /dev/null +++ b/evaluation/rankeval/checkpoints_2b855boscartasky_3.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.318,0.014734079309311901,0 +anli_r2,acc,0.365,0.015231776226264902,0 +anli_r3,acc,0.3491666666666667,0.013767075395077252,0 +arc_challenge,acc,0.25341296928327645,0.012710896778378606,0 +arc_challenge,acc_norm,0.2815699658703072,0.013143376735009026,0 +arc_easy,acc,0.5618686868686869,0.010180937100600071,0 +arc_easy,acc_norm,0.5568181818181818,0.010193324837773495,0 +boolq,acc,0.5801223241590214,0.00863204550478174,1 +cb,acc,0.44642857142857145,0.06703189227942398,1 +cb,f1,0.2316017316017316,,1 +copa,acc,0.69,0.04648231987117316,0 +hellaswag,acc,0.3505277833100976,0.00476160130325889,0 +hellaswag,acc_norm,0.4260107548297152,0.004934846809827193,0 +piqa,acc,0.6931447225244831,0.010760295070580368,0 +piqa,acc_norm,0.6877040261153428,0.010812581599154424,0 +rte,acc,0.4620938628158845,0.030009848912529113,0 +sciq,acc,0.882,0.010206869264381795,0 +sciq,acc_norm,0.875,0.010463483381956722,0 
+storycloze_2016,acc,0.6226616782469268,0.011209099452196189,0 +winogrande,acc,0.4964483030781373,0.014052131146915864,0 diff --git a/evaluation/rankeval/checkpoints_2b855boscartasky_3.json b/evaluation/rankeval/checkpoints_2b855boscartasky_3.json new file mode 100644 index 0000000000000000000000000000000000000000..5c66c83c3f51d8b06460c86c21fb35de825e0fbf --- /dev/null +++ b/evaluation/rankeval/checkpoints_2b855boscartasky_3.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.318, + "acc_stderr": 0.014734079309311901 + }, + "anli_r2": { + "acc": 0.365, + "acc_stderr": 0.015231776226264902 + }, + "anli_r3": { + "acc": 0.3491666666666667, + "acc_stderr": 0.013767075395077252 + }, + "cb": { + "acc": 0.44642857142857145, + "acc_stderr": 0.06703189227942398, + "f1": 0.2316017316017316 + }, + "copa": { + "acc": 0.69, + "acc_stderr": 0.04648231987117316 + }, + "hellaswag": { + "acc": 0.3505277833100976, + "acc_stderr": 0.00476160130325889, + "acc_norm": 0.4260107548297152, + "acc_norm_stderr": 0.004934846809827193 + }, + "rte": { + "acc": 0.4620938628158845, + "acc_stderr": 0.030009848912529113 + }, + "winogrande": { + "acc": 0.4964483030781373, + "acc_stderr": 0.014052131146915864 + }, + "storycloze_2016": { + "acc": 0.6226616782469268, + "acc_stderr": 0.011209099452196189 + }, + "boolq": { + "acc": 0.5801223241590214, + "acc_stderr": 0.00863204550478174 + }, + "arc_easy": { + "acc": 0.5618686868686869, + "acc_stderr": 0.010180937100600071, + "acc_norm": 0.5568181818181818, + "acc_norm_stderr": 0.010193324837773495 + }, + "arc_challenge": { + "acc": 0.25341296928327645, + "acc_stderr": 0.012710896778378606, + "acc_norm": 0.2815699658703072, + "acc_norm_stderr": 0.013143376735009026 + }, + "sciq": { + "acc": 0.882, + "acc_stderr": 0.010206869264381795, + "acc_norm": 0.875, + "acc_norm_stderr": 0.010463483381956722 + }, + "piqa": { + "acc": 0.6931447225244831, + "acc_stderr": 0.010760295070580368, + "acc_norm": 0.6877040261153428, + "acc_norm_stderr": 0.010812581599154424 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/evaluation/rankeval/checkpoints_2b855boscartasky_4.csv b/evaluation/rankeval/checkpoints_2b855boscartasky_4.csv new file mode 100644 index 0000000000000000000000000000000000000000..bee31270e66bbbf6f45acb6e0bed5e37cc5dec96 --- /dev/null +++ b/evaluation/rankeval/checkpoints_2b855boscartasky_4.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.341,0.014998131348402709,0 +anli_r2,acc,0.348,0.01507060460376841,0 +anli_r3,acc,0.3541666666666667,0.013811933499570958,0 +arc_challenge,acc,0.24232081911262798,0.012521593295800116,0 +arc_challenge,acc_norm,0.27559726962457337,0.013057169655761838,0 +arc_easy,acc,0.5648148148148148,0.01017321643037092,0 +arc_easy,acc_norm,0.5648148148148148,0.010173216430370913,0 +boolq,acc,0.5899082568807339,0.008602512053254421,1 +cb,acc,0.42857142857142855,0.06672848092813058,1 +cb,f1,0.19999999999999998,,1 +copa,acc,0.67,0.04725815626252607,0 +hellaswag,acc,0.3505277833100976,0.004761601303258889,0 +hellaswag,acc_norm,0.4282015534754033,0.0049380686273494834,0 +piqa,acc,0.6855277475516867,0.010833009065106572,0 +piqa,acc_norm,0.6784548422198041,0.010897500107575652,0 +rte,acc,0.4584837545126354,0.029992535385373314,0 +sciq,acc,0.891,0.00985982840703719,0 
+sciq,acc_norm,0.884,0.010131468138756995,0 +storycloze_2016,acc,0.6231961517904864,0.011205964516279667,0 +winogrande,acc,0.5090765588003157,0.014050170094497704,0 diff --git a/evaluation/rankeval/checkpoints_2b855boscartasky_4.json b/evaluation/rankeval/checkpoints_2b855boscartasky_4.json new file mode 100644 index 0000000000000000000000000000000000000000..c7109ff09ed8334f11900f87f6b6a65a3102f3a7 --- /dev/null +++ b/evaluation/rankeval/checkpoints_2b855boscartasky_4.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.341, + "acc_stderr": 0.014998131348402709 + }, + "anli_r2": { + "acc": 0.348, + "acc_stderr": 0.01507060460376841 + }, + "anli_r3": { + "acc": 0.3541666666666667, + "acc_stderr": 0.013811933499570958 + }, + "cb": { + "acc": 0.42857142857142855, + "acc_stderr": 0.06672848092813058, + "f1": 0.19999999999999998 + }, + "copa": { + "acc": 0.67, + "acc_stderr": 0.04725815626252607 + }, + "hellaswag": { + "acc": 0.3505277833100976, + "acc_stderr": 0.004761601303258889, + "acc_norm": 0.4282015534754033, + "acc_norm_stderr": 0.0049380686273494834 + }, + "rte": { + "acc": 0.4584837545126354, + "acc_stderr": 0.029992535385373314 + }, + "winogrande": { + "acc": 0.5090765588003157, + "acc_stderr": 0.014050170094497704 + }, + "storycloze_2016": { + "acc": 0.6231961517904864, + "acc_stderr": 0.011205964516279667 + }, + "boolq": { + "acc": 0.5899082568807339, + "acc_stderr": 0.008602512053254421 + }, + "arc_easy": { + "acc": 0.5648148148148148, + "acc_stderr": 0.01017321643037092, + "acc_norm": 0.5648148148148148, + "acc_norm_stderr": 0.010173216430370913 + }, + "arc_challenge": { + "acc": 0.24232081911262798, + "acc_stderr": 0.012521593295800116, + "acc_norm": 0.27559726962457337, + "acc_norm_stderr": 0.013057169655761838 + }, + "sciq": { + "acc": 0.891, + "acc_stderr": 0.00985982840703719, + "acc_norm": 0.884, + "acc_norm_stderr": 0.010131468138756995 + }, + "piqa": { + "acc": 0.6855277475516867, + "acc_stderr": 0.010833009065106572, + "acc_norm": 0.6784548422198041, + "acc_norm_stderr": 0.010897500107575652 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/evaluation/rankeval/checkpoints_2b855boscartasky_5.csv b/evaluation/rankeval/checkpoints_2b855boscartasky_5.csv new file mode 100644 index 0000000000000000000000000000000000000000..8f7b1f190f1ce29512c9ffbd8b9e4a25b2a8a8bc --- /dev/null +++ b/evaluation/rankeval/checkpoints_2b855boscartasky_5.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.356,0.015149042659306623,0 +anli_r2,acc,0.355,0.015139491543780532,0 +anli_r3,acc,0.3433333333333333,0.01371263383046586,0 +arc_challenge,acc,0.257679180887372,0.012780770562768395,0 +arc_challenge,acc_norm,0.2781569965870307,0.013094469919538812,0 +arc_easy,acc,0.5618686868686869,0.010180937100600073,0 +arc_easy,acc_norm,0.5589225589225589,0.01018829322104055,0 +boolq,acc,0.5770642201834862,0.008640558744656428,1 +cb,acc,0.5535714285714286,0.06703189227942394,1 +cb,f1,0.3074074074074074,,1 +copa,acc,0.67,0.04725815626252607,0 +hellaswag,acc,0.3536148177653854,0.004771143074426132,0 +hellaswag,acc_norm,0.4255128460466043,0.00493410077448122,0 +piqa,acc,0.6838955386289445,0.010848148455700455,0 +piqa,acc_norm,0.6789989118607181,0.010892641574707904,0 +rte,acc,0.48014440433212996,0.0300727231673172,0 
+sciq,acc,0.897,0.0096168333396958,0 +sciq,acc_norm,0.893,0.009779910359847169,0 +storycloze_2016,acc,0.6264029930518439,0.011186849693644694,0 +winogrande,acc,0.500394632991318,0.014052481306049516,0 diff --git a/evaluation/rankeval/checkpoints_2b855boscartasky_5.json b/evaluation/rankeval/checkpoints_2b855boscartasky_5.json new file mode 100644 index 0000000000000000000000000000000000000000..82f5ed5fc0bcca786f95cdec00c4f924e2bea967 --- /dev/null +++ b/evaluation/rankeval/checkpoints_2b855boscartasky_5.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.356, + "acc_stderr": 0.015149042659306623 + }, + "anli_r2": { + "acc": 0.355, + "acc_stderr": 0.015139491543780532 + }, + "anli_r3": { + "acc": 0.3433333333333333, + "acc_stderr": 0.01371263383046586 + }, + "cb": { + "acc": 0.5535714285714286, + "acc_stderr": 0.06703189227942394, + "f1": 0.3074074074074074 + }, + "copa": { + "acc": 0.67, + "acc_stderr": 0.04725815626252607 + }, + "hellaswag": { + "acc": 0.3536148177653854, + "acc_stderr": 0.004771143074426132, + "acc_norm": 0.4255128460466043, + "acc_norm_stderr": 0.00493410077448122 + }, + "rte": { + "acc": 0.48014440433212996, + "acc_stderr": 0.0300727231673172 + }, + "winogrande": { + "acc": 0.500394632991318, + "acc_stderr": 0.014052481306049516 + }, + "storycloze_2016": { + "acc": 0.6264029930518439, + "acc_stderr": 0.011186849693644694 + }, + "boolq": { + "acc": 0.5770642201834862, + "acc_stderr": 0.008640558744656428 + }, + "arc_easy": { + "acc": 0.5618686868686869, + "acc_stderr": 0.010180937100600073, + "acc_norm": 0.5589225589225589, + "acc_norm_stderr": 0.01018829322104055 + }, + "arc_challenge": { + "acc": 0.257679180887372, + "acc_stderr": 0.012780770562768395, + "acc_norm": 0.2781569965870307, + "acc_norm_stderr": 0.013094469919538812 + }, + "sciq": { + "acc": 0.897, + "acc_stderr": 0.0096168333396958, + "acc_norm": 0.893, + "acc_norm_stderr": 0.009779910359847169 + }, + "piqa": { + "acc": 0.6838955386289445, + "acc_stderr": 0.010848148455700455, + "acc_norm": 0.6789989118607181, + "acc_norm_stderr": 0.010892641574707904 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file
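The rankeval results added above are stored twice per run: a flat CSV with the schema `task,metric,value,err,version`, and a nested JSON with `results` (per-task metric dicts, point estimates alongside `*_stderr` fields) and `versions` blocks. The two carry identical numbers. Below is a minimal sketch, not part of the commit, for collecting either format into a single `{(task, metric): value}` mapping; it assumes the `evaluation/rankeval/` layout and the `checkpoints_2b855boscartasky_{i}` naming shown in the diff, where the trailing index runs 0-5 (plausibly the few-shot setting, mirroring the `_0`..`_5` suffixes on the generation files). The helper names are illustrative, not from the repo.

```python
import csv
import json
from pathlib import Path

# Layout assumed from the diff above.
RANKEVAL_DIR = Path("evaluation/rankeval")
PREFIX = "checkpoints_2b855boscartasky"


def load_csv_results(path: Path) -> dict:
    """Read one flat rankeval CSV (task,metric,value,err,version) into {(task, metric): value}."""
    out = {}
    with path.open(newline="") as fh:
        for row in csv.DictReader(fh):
            # 'err' can be empty (e.g. the cb f1 rows); only 'value' is needed here.
            out[(row["task"], row["metric"])] = float(row["value"])
    return out


def load_json_results(path: Path) -> dict:
    """Read one nested rankeval JSON into the same {(task, metric): value} shape."""
    blob = json.loads(path.read_text())
    return {
        (task, metric): value
        for task, metrics in blob["results"].items()
        for metric, value in metrics.items()
        if not metric.endswith("_stderr")  # keep point estimates, drop standard errors
    }


if __name__ == "__main__":
    # Compare one metric across the six result files in this commit.
    for i in range(6):
        results = load_csv_results(RANKEVAL_DIR / f"{PREFIX}_{i}.csv")
        print(f"index {i}: sciq acc = {results[('sciq', 'acc')]:.3f}")
```

With the files from this commit in place, the loop prints sciq accuracy climbing from 0.805 at index 0 to 0.897 at index 5, whereas winogrande stays near chance (~0.50) throughout; swapping in `load_json_results` on the matching `.json` files yields the same point estimates.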