huanghuayu committed
Commit ef248b8 · 1 Parent(s): 6782572

add implementation

Files changed (1):
  1. multiclass_brier_score.py +59 -41
multiclass_brier_score.py CHANGED
@@ -11,50 +11,58 @@
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  # See the License for the specific language governing permissions and
  # limitations under the License.
- """TODO: Add a description here."""
+
+ """brier_score metric for multiclass problem."""
+
+ import numpy as np

  import evaluate
  import datasets


- # TODO: Add BibTeX citation
- _CITATION = """\
- @InProceedings{huggingface:module,
- title = {A great new module},
- authors={huggingface, Inc.},
- year={2020}
+ _CITATION = """
+ @article{brier1950verification,
+ title={Verification of forecasts expressed in terms of probability},
+ author={Brier, Glenn W},
+ journal={Monthly weather review},
+ volume={78},
+ number={1},
+ pages={1--3},
+ year={1950}
  }
  """

- # TODO: Add description of the module here
- _DESCRIPTION = """\
- This new module is designed to solve this great ML task and is crafted with a lot of care.
+ _DESCRIPTION = """
+ Measure to compare true observed labels with predicted probabilities in multiclass classification tasks.
  """


- # TODO: Add description of the arguments of the module here
  _KWARGS_DESCRIPTION = """
- Calculates how good are predictions given some references, using certain scores
+ Multiclass Brier Score: Measure to compare true observed labels with predicted probabilities in multiclass classification tasks.
  Args:
- predictions: list of predictions to score. Each predictions
- should be a string with tokens separated by spaces.
- references: list of reference for each prediction. Each
- reference should be a string with tokens separated by spaces.
+ pred_probs: array-like of shape (n_sample, m_classes).
+ references: array-like of shape (n_sample,).
  Returns:
- accuracy: description of the first score,
- another_score: description of the second score,
+ brier_score: float, average brier score over all samples.
  Examples:
  Examples should be written in doctest format, and should illustrate how
  to use the function.

- >>> my_new_module = evaluate.load("my_new_module")
- >>> results = my_new_module.compute(references=[0, 1], predictions=[0, 1])
- >>> print(results)
- {'accuracy': 1.0}
- """
+ >>> brier_metric = multiclass_brier_score()
+ >>> brier_score = brier_metric.compute(pred_probs=[[0.0, 1.0, 0.0]], references=[1])
+ >>> print(brier_score)
+ {'brier_score': 0.0}

- # TODO: Define external resources urls if needed
- BAD_WORDS_URL = "http://url/to/external/resource/bad_words.txt"
+ >>> brier_metric = multiclass_brier_score()
+ >>> brier_score = brier_metric.compute(pred_probs=[[0.1, 0.1, 0.8]], references=[2])
+ >>> print(round(brier_score['brier_score'], 2))
+ 0.06
+
+ >>> brier_metric = multiclass_brier_score()
+ >>> brier_score = brier_metric.compute(pred_probs=[[0.1, 0.1, 0.8], [0.0, 1.0, 0.0]], references=[2, 1])
+ >>> print(round(brier_score['brier_score'], 2))
+ 0.03
+ """


  @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
@@ -71,25 +79,35 @@ class multiclass_brier_score(evaluate.Metric):
  inputs_description=_KWARGS_DESCRIPTION,
  # This defines the format of each prediction and reference
  features=datasets.Features({
- 'predictions': datasets.Value('int64'),
- 'references': datasets.Value('int64'),
+ 'pred_probs': datasets.Sequence(datasets.Value("float")),
+ 'references': datasets.Value('int32'),
  }),
- # Homepage of the module for documentation
- homepage="http://module.homepage",
  # Additional links to the codebase or references
- codebase_urls=["http://github.com/path/to/codebase/of/new_module"],
- reference_urls=["http://path.to.reference.url/new_module"]
+ #codebase_urls=["http://github.com/path/to/codebase/of/new_module"],
+ reference_urls=["https://search.r-project.org/CRAN/refmans/mlr3measures/html/mbrier.html"]
  )

- def _download_and_prepare(self, dl_manager):
- """Optional: download external resources useful to compute the scores"""
- # TODO: Download external resources if needed
- pass

- def _compute(self, predictions, references):
- """Returns the scores"""
- # TODO: Compute the different scores of the module
- accuracy = sum(i == j for i, j in zip(predictions, references)) / len(predictions)
+ def _compute(self, pred_probs: np.ndarray, references: np.ndarray):
+ """
+ brier_score = 1/n * sum_{i=1}^n sum_{j=1}^m (y_{ij} - p_{ij})^2
+ Args:
+ pred_probs: numpy array of shape (n, m) where n is the number of samples and m is the number of classes
+ references: numpy array of shape (n,) where n is the number of samples
+ """
+ assert len(pred_probs) == len(references), "The length of the predictions and references should be the same"
+ pred_probs = np.array(pred_probs)
+ n = len(references)
+ m = pred_probs.shape[1]
+ # generate one-hot encoding for the references
+ references_onehot = np.zeros((n, m))
+ references_onehot[np.arange(n), references] = 1 # shape: (n, m)
+ brier_score = np.sum((references_onehot - pred_probs)**2) / float(n)
  return {
- "accuracy": accuracy,
+ "brier_score": brier_score,
  }
+
+
+ if __name__ == "__main__":
+ import doctest
+ doctest.testmod()
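
For quick sanity-checking outside the evaluate plumbing, a minimal standalone sketch of the same computation (plain NumPy; the helper name brier_score_np is illustrative only, not part of the committed module). It reproduces the rounded values from the docstring examples above:

import numpy as np

def brier_score_np(pred_probs, references):
    # pred_probs: (n, m) predicted class probabilities; references: (n,) integer class labels
    pred_probs = np.asarray(pred_probs, dtype=float)
    references = np.asarray(references, dtype=int)
    n, m = pred_probs.shape
    onehot = np.zeros((n, m))
    onehot[np.arange(n), references] = 1.0  # one-hot encode the true labels
    # sum of squared errors over classes, averaged over samples
    return float(np.sum((onehot - pred_probs) ** 2) / n)

print(round(brier_score_np([[0.1, 0.1, 0.8]], [2]), 2))                      # 0.06
print(round(brier_score_np([[0.1, 0.1, 0.8], [0.0, 1.0, 0.0]], [2, 1]), 2))  # 0.03

A perfect probabilistic prediction scores 0.0, and with this sum-over-classes formulation the worst possible per-sample score is 2.0 (all probability mass on a wrong class).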