---
# Evaluation documentation template.
#
# Every field below is an empty placeholder ("" for free text, [] for lists,
# {} for mappings) to be filled in; `title` and `summary` carry example
# placeholder text that should be replaced.
#
# NOTE(review): the nesting was reconstructed from the section grouping and
# key names; confirm it against the consuming schema. In particular, verify
# that `design_process` belongs under `type_and_structure`.

title: "[Evaluation Name]"

# Folded scalar: line breaks inside the text are folded into spaces.
summary: >
  Brief description of the evaluation approach, its purpose, and scope.

# Ownership, review dates, and external links.
metadata:
  authors: []
  maintainers: []
  creation_date: ""
  last_review_date: ""
  next_review_date: ""
  version_compatibility: []
  repository_link: ""
  paper_link: ""

# Motivation, overall structure, and the people/resources involved.
evaluation_design:
  motivation:
    scientific_needs: ""
    approach_justification: ""
    expected_benefits: ""
    tradeoffs: ""

  type_and_structure:
    type: ""
    structure: ""
    timeline: ""
    key_design_decisions: []
    design_process:
      stakeholder_consultation: ""
      pilot_studies: []
      validation_approaches: []

  stakeholders_and_resources:
    target_users: []
    required_expertise: []
    resource_requirements: []
    cost_considerations: ""

# The target construct and the declared scope and limitations.
estimand:
  target_construct:
    primary_capability: ""
    measurement_type: ""
    relationship_to_applications: ""
    theoretical_framework: ""

  scope_and_limitations:
    coverage: ""
    excluded_capabilities: []
    known_blind_spots: []
    theoretical_limitations: []

# One sub-section per assessment component.
assessment_components:
  test_set:
    data_sources: []
    sampling_methodology: ""
    known_biases: []
    approach_to_duplicates: ""
    data_quality: ""

  challenge:
    design_principles: []
    task_selection_criteria: []
    difficulty_progression: ""
    time_constraints: ""

  red_teaming:
    probing_methodology: ""
    coverage_strategy: ""
    adversarial_approach: ""
    safety_considerations: ""

  deployment_study:
    environment_characteristics: ""
    integration_points: []
    success_criteria: []
    monitoring_approach: ""

# Protocol, metrics, technical framework, and rules.
estimator:
  evaluation_protocol:
    methodology: ""
    control_measures: []
    handling_random_components: ""
    reproducibility_requirements: ""

  metrics:
    primary_metrics: []
    aggregation_methodology: ""
    task_weightings: {}
    performance_bounds: {}
    connection_to_outcomes: ""

  # List of metric entries; copy the entry below once per metric.
  metric_details:
    - name: ""
      definition: ""
      implementation: ""
      edge_cases: []
      statistical_properties: ""
      baseline_values: {}
      failure_modes: []

  technical_framework:
    implementation_requirements: []
    time_constraints: ""
    dependencies: []
    authentication_needs: ""

  constraints_and_rules:
    allowed_resources: []
    permitted_approaches: []
    optimization_constraints: []
    ethical_boundaries: []

# Reporting requirements and reproducibility information.
estimate:
  required_reporting:
    essential_metrics: []
    results_disaggregation: ""
    uncertainty_quantification: ""
    performance_variation: ""
    resource_usage_reporting: ""

  reproducibility_information:
    documentation_requirements: []
    environment_specifications: ""
    randomization_handling: ""
    output_standardization: ""

# Visualization and leaderboard guidance.
results_communication:
  visualization:
    recommended_plots: []
    standardized_formats: []
    key_comparisons: []

  leaderboard_guidelines:
    submission_process: ""
    required_metadata: []

# Validity concerns, practical limits, and bias/fairness notes.
known_issues_and_limitations:
  validity_concerns:
    construct_validity: ""
    gaming_possibilities: ""
    stability_considerations: ""
    temporal_validity: ""

  practical_limitations:
    resource_constraints: ""
    scalability_issues: ""
    cost_factors: ""
    time_boundaries: ""

  bias_and_fairness:
    known_biases: []
    representation_issues: ""
    potential_impacts: ""
    mitigation_approaches: []

# Version information and the maintenance protocol.
version_and_maintenance:
  version_information:
    version: ""
    release_date: ""
    change_history: []
    update_plans: ""

  maintenance_protocol:
    update_frequency: ""
    deprecation_policy: ""
    issue_reporting: ""
    community_involvement: ""
    criteria_for_updates: []
    breaking_change_policy: ""
    backwards_compatibility: ""
    migration_guides: ""

# Citation details and usage guidelines.
citation_and_usage:
  citation_information:
    recommended_citation: ""
    related_publications: []
    licensing_details: ""

  usage_guidelines:
    recommended_applications: []
    inappropriate_uses: []
    implementation_best_practices: ""
    ethical_considerations: ""

additional_notes:
  related_evaluations: []
  future_directions: ""