|
import logging
|
|
from datasets import load_dataset
|
|
import torch
|
|
|
|
from sentence_transformers import SentenceTransformer
|
|
from sentence_transformers.cross_encoder import CrossEncoder
|
|
from sentence_transformers.cross_encoder.evaluation import CrossEncoderRerankingEvaluator
|
|
from sentence_transformers.util import mine_hard_negatives
|
|
|
|
|
|
# Configure root logging: timestamped messages at INFO level, so the
# logging.info(...) calls in main() are actually emitted.
logging.basicConfig(format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO)
|
|
|
|
|
|
def main():
    """Benchmark a list of reranker models on a TriviaQA reranking task.

    Steps:
      1. Load the ``sentence-transformers/trivia-qa`` train split and keep the
         first 1000 rows as evaluation queries.
      2. Use a CPU embedding model to mine 30 candidates per query from all
         answers of the full split (positives may be among the candidates).
      3. Convert every mined row into a ``query`` / ``positive`` / ``documents``
         sample and build two ``CrossEncoderRerankingEvaluator`` instances that
         differ only in ``always_rerank_positives``.
      4. Load each reranker in bfloat16, run both evaluators on it, and print
         the model name followed by each evaluator's result.

    Nothing is returned; results go to stdout and the log.
    """
    eval_batch_size = 16

    # 1. Load the dataset and carve out the evaluation subset.
    logging.info("Read the trivia-qa reranking dataset")
    full_dataset = load_dataset("sentence-transformers/trivia-qa", split="train")
    eval_dataset = full_dataset.select(range(1000))
    logging.info(eval_dataset)

    # 2. First-stage retrieval: mine candidate documents for every query.
    embedding_model = SentenceTransformer(
        "sentence-transformers/static-retrieval-mrl-en-v1",
        device="cpu",
    )
    mining_options = {
        "corpus": full_dataset["answer"],
        "num_negatives": 30,
        "batch_size": 4096,
        "include_positives": True,
        "output_format": "n-tuple",
        "use_faiss": True,
    }
    hard_eval_dataset = mine_hard_negatives(eval_dataset, embedding_model, **mining_options)
    logging.info(hard_eval_dataset)

    # 3. Reshape the mined rows into evaluator samples. Every column after the
    #    first two is treated as a candidate document.
    #    NOTE(review): presumably the first two columns are "query" and
    #    "answer" -- confirm against the mine_hard_negatives output schema.
    candidate_columns = hard_eval_dataset.column_names[2:]
    samples = []
    for row in hard_eval_dataset:
        samples.append(
            {
                "query": row["query"],
                "positive": [row["answer"]],
                "documents": [row[column] for column in candidate_columns],
            }
        )

    # Both evaluators share the samples and batch size; only the name and the
    # always_rerank_positives flag differ.
    shared_evaluator_options = {
        "samples": samples,
        "batch_size": eval_batch_size,
        "show_progress_bar": True,
    }
    realistic_reranking_evaluator = CrossEncoderRerankingEvaluator(
        name="trivia-qa-dev-realistic",
        always_rerank_positives=False,
        **shared_evaluator_options,
    )
    evaluation_reranking_evaluator = CrossEncoderRerankingEvaluator(
        name="trivia-qa-dev-evaluation",
        always_rerank_positives=True,
        **shared_evaluator_options,
    )
    evaluators = (realistic_reranking_evaluator, evaluation_reranking_evaluator)

    # 4. Evaluate every reranker in turn.
    model_names = (
        "tomaarsen/reranker-ModernBERT-base-trivia-qa-bce",
        "cross-encoder/ms-marco-MiniLM-L6-v2",
        "jinaai/jina-reranker-v1-tiny-en",
        "jinaai/jina-reranker-v1-turbo-en",
        "jinaai/jina-reranker-v2-base-multilingual",
        "BAAI/bge-reranker-base",
        "BAAI/bge-reranker-large",
        "BAAI/bge-reranker-v2-m3",
        "mixedbread-ai/mxbai-rerank-xsmall-v1",
        "mixedbread-ai/mxbai-rerank-base-v1",
        "mixedbread-ai/mxbai-rerank-large-v1",
        "Alibaba-NLP/gte-reranker-modernbert-base",
    )
    for model_name in model_names:
        logging.info(f"Loading {model_name} model")
        # NOTE(review): trust_remote_code=True is passed for every model;
        # presumably some of these repositories ship custom modelling code --
        # confirm each listed repository is trusted before running.
        cross_encoder = CrossEncoder(
            model_name,
            model_kwargs={"torch_dtype": torch.bfloat16},
            trust_remote_code=True,
        )

        logging.info(f"Evaluating {model_name}")
        print(model_name)
        for evaluator in evaluators:
            print(evaluator(cross_encoder))
|
|
|
|
if __name__ == "__main__":
    # Run the benchmark only when this file is executed as a script,
    # not when it is imported as a module.
    main()
|
|
|
|
"""
|
|
2025-03-28 10:17:20 - Evaluating tomaarsen/reranker-ModernBERT-base-trivia-qa-bce
|
|
2025-03-28 10:17:20 - CrossEncoderRerankingEvaluator: Evaluating the model on the trivia-qa-dev-realistic dataset:
|
|
2025-03-28 10:21:06 - Queries: 1000 Positives: Min 1.0, Mean 1.0, Max 1.0 Negatives: Min 29.0, Mean 29.2, Max 30.0
|
|
2025-03-28 10:21:06 - Base -> Reranked
|
|
2025-03-28 10:21:06 - MAP: 46.91 -> 65.07
|
|
2025-03-28 10:21:06 - MRR@10: 46.19 -> 65.33
|
|
2025-03-28 10:21:06 - NDCG@10: 52.31 -> 69.21
|
|
2025-03-28 10:21:06 - CrossEncoderRerankingEvaluator: Evaluating the model on the trivia-qa-dev-evaluation dataset:
|
|
2025-03-28 10:25:39 - Queries: 1000 Positives: Min 1.0, Mean 1.0, Max 1.0 Negatives: Min 29.0, Mean 29.2, Max 30.0
|
|
2025-03-28 10:25:39 - Base -> Reranked
|
|
2025-03-28 10:25:39 - MAP: 46.91 -> 76.18
|
|
2025-03-28 10:25:39 - MRR@10: 46.19 -> 76.46
|
|
2025-03-28 10:25:39 - NDCG@10: 52.31 -> 81.16
|
|
|
|
2025-03-28 10:25:39 - Loading cross-encoder/ms-marco-MiniLM-L6-v2 model
|
|
2025-03-28 10:25:40 - Use pytorch device: cuda
|
|
2025-03-28 10:25:40 - Evaluating cross-encoder/ms-marco-MiniLM-L6-v2
|
|
2025-03-28 10:25:40 - CrossEncoderRerankingEvaluator: Evaluating the model on the trivia-qa-dev-realistic dataset:
|
|
2025-03-28 10:26:08 - Queries: 1000 Positives: Min 1.0, Mean 1.0, Max 1.0 Negatives: Min 29.0, Mean 29.2, Max 30.0
|
|
2025-03-28 10:26:08 - Base -> Reranked
|
|
2025-03-28 10:26:08 - MAP: 46.91 -> 59.97
|
|
2025-03-28 10:26:08 - MRR@10: 46.19 -> 59.72
|
|
2025-03-28 10:26:08 - NDCG@10: 52.31 -> 64.26
|
|
2025-03-28 10:26:08 - CrossEncoderRerankingEvaluator: Evaluating the model on the trivia-qa-dev-evaluation dataset:
|
|
2025-03-28 10:26:41 - Queries: 1000 Positives: Min 1.0, Mean 1.0, Max 1.0 Negatives: Min 29.0, Mean 29.2, Max 30.0
|
|
2025-03-28 10:26:41 - Base -> Reranked
|
|
2025-03-28 10:26:41 - MAP: 46.91 -> 65.99
|
|
2025-03-28 10:26:41 - MRR@10: 46.19 -> 65.41
|
|
2025-03-28 10:26:41 - NDCG@10: 52.31 -> 70.82
|
|
|
|
2025-03-28 10:26:41 - Loading jinaai/jina-reranker-v1-tiny-en model
|
|
2025-03-28 10:26:43 - Use pytorch device: cuda
|
|
2025-03-28 10:26:44 - Evaluating jinaai/jina-reranker-v1-tiny-en
|
|
2025-03-28 10:26:44 - CrossEncoderRerankingEvaluator: Evaluating the model on the trivia-qa-dev-realistic dataset:
|
|
2025-03-28 10:28:49 - Queries: 1000 Positives: Min 1.0, Mean 1.0, Max 1.0 Negatives: Min 29.0, Mean 29.2, Max 30.0
|
|
2025-03-28 10:28:49 - Base -> Reranked
|
|
2025-03-28 10:28:49 - MAP: 46.91 -> 59.46
|
|
2025-03-28 10:28:49 - MRR@10: 46.19 -> 59.57
|
|
2025-03-28 10:28:49 - NDCG@10: 52.31 -> 64.11
|
|
2025-03-28 10:28:49 - CrossEncoderRerankingEvaluator: Evaluating the model on the trivia-qa-dev-evaluation dataset:
|
|
2025-03-28 10:30:53 - Queries: 1000 Positives: Min 1.0, Mean 1.0, Max 1.0 Negatives: Min 29.0, Mean 29.2, Max 30.0
|
|
2025-03-28 10:30:53 - Base -> Reranked
|
|
2025-03-28 10:30:53 - MAP: 46.91 -> 65.14
|
|
2025-03-28 10:30:53 - MRR@10: 46.19 -> 65.69
|
|
2025-03-28 10:30:53 - NDCG@10: 52.31 -> 70.47
|
|
|
|
2025-03-28 10:33:24 - Evaluating jinaai/jina-reranker-v1-turbo-en
|
|
2025-03-28 10:33:24 - CrossEncoderRerankingEvaluator: Evaluating the model on the trivia-qa-dev-realistic dataset:
|
|
2025-03-28 10:36:16 - Queries: 1000 Positives: Min 1.0, Mean 1.0, Max 1.0 Negatives: Min 29.0, Mean 29.2, Max 30.0
|
|
2025-03-28 10:36:16 - Base -> Reranked
|
|
2025-03-28 10:36:16 - MAP: 46.91 -> 59.21
|
|
2025-03-28 10:36:16 - MRR@10: 46.19 -> 59.03
|
|
2025-03-28 10:36:16 - NDCG@10: 52.31 -> 63.75
|
|
2025-03-28 10:36:16 - CrossEncoderRerankingEvaluator: Evaluating the model on the trivia-qa-dev-evaluation dataset:
|
|
2025-03-28 10:39:09 - Queries: 1000 Positives: Min 1.0, Mean 1.0, Max 1.0 Negatives: Min 29.0, Mean 29.2, Max 30.0
|
|
2025-03-28 10:39:09 - Base -> Reranked
|
|
2025-03-28 10:39:09 - MAP: 46.91 -> 65.39
|
|
2025-03-28 10:39:09 - MRR@10: 46.19 -> 65.04
|
|
2025-03-28 10:39:09 - NDCG@10: 52.31 -> 70.70
|
|
|
|
2025-03-28 10:54:00 - Evaluating jinaai/jina-reranker-v2-base-multilingual
|
|
2025-03-28 10:54:00 - CrossEncoderRerankingEvaluator: Evaluating the model on the trivia-qa-dev-realistic dataset:
|
|
2025-03-28 10:56:49 - Queries: 1000 Positives: Min 1.0, Mean 1.0, Max 1.0 Negatives: Min 29.0, Mean 29.2, Max 30.0
|
|
2025-03-28 10:56:49 - Base -> Reranked
|
|
2025-03-28 10:56:49 - MAP: 46.91 -> 61.59
|
|
2025-03-28 10:56:49 - MRR@10: 46.19 -> 61.63
|
|
2025-03-28 10:56:49 - NDCG@10: 52.31 -> 66.16
|
|
2025-03-28 10:56:49 - CrossEncoderRerankingEvaluator: Evaluating the model on the trivia-qa-dev-evaluation dataset:
|
|
2025-03-28 11:00:14 - Queries: 1000 Positives: Min 1.0, Mean 1.0, Max 1.0 Negatives: Min 29.0, Mean 29.2, Max 30.0
|
|
2025-03-28 11:00:14 - Base -> Reranked
|
|
2025-03-28 11:00:14 - MAP: 46.91 -> 69.46
|
|
2025-03-28 11:00:14 - MRR@10: 46.19 -> 69.47
|
|
2025-03-28 11:00:14 - NDCG@10: 52.31 -> 74.74
|
|
|
|
2025-03-28 11:14:39 - Evaluating BAAI/bge-reranker-base
|
|
BAAI/bge-reranker-base
|
|
2025-03-28 11:14:39 - CrossEncoderRerankingEvaluator: Evaluating the model on the trivia-qa-dev-realistic dataset:
|
|
2025-03-28 11:15:42 - Queries: 1000 Positives: Min 1.0, Mean 1.0, Max 1.0 Negatives: Min 29.0, Mean 29.2, Max 30.0
|
|
2025-03-28 11:15:42 - Base -> Reranked
|
|
2025-03-28 11:15:42 - MAP: 46.91 -> 52.59
|
|
2025-03-28 11:15:42 - MRR@10: 46.19 -> 54.37
|
|
2025-03-28 11:15:42 - NDCG@10: 52.31 -> 59.41
|
|
{'trivia-qa-dev-realistic_map': 0.5259323270998224, 'trivia-qa-dev-realistic_mrr@10': 0.5436916666666666, 'trivia-qa-dev-realistic_ndcg@10': 0.5940858318692244, 'trivia-qa-dev-realistic_base_map': 0.46913299504084743, 'trivia-qa-dev-realistic_base_mrr@10': 0.4618571428571429, 'trivia-qa-dev-realistic_base_ndcg@10': 0.5231399731095658}
|
|
2025-03-28 11:15:42 - CrossEncoderRerankingEvaluator: Evaluating the model on the trivia-qa-dev-evaluation dataset:
|
|
2025-03-28 11:16:58 - Queries: 1000 Positives: Min 1.0, Mean 1.0, Max 1.0 Negatives: Min 29.0, Mean 29.2, Max 30.0
|
|
2025-03-28 11:16:58 - Base -> Reranked
|
|
2025-03-28 11:16:58 - MAP: 46.91 -> 59.04
|
|
2025-03-28 11:16:58 - MRR@10: 46.19 -> 62.72
|
|
2025-03-28 11:16:58 - NDCG@10: 52.31 -> 66.61
|
|
{'trivia-qa-dev-evaluation_map': 0.590369907985238, 'trivia-qa-dev-evaluation_mrr@10': 0.6271603174603174, 'trivia-qa-dev-evaluation_ndcg@10': 0.6661314116591164, 'trivia-qa-dev-evaluation_base_map': 0.46913299504084743, 'trivia-qa-dev-evaluation_base_mrr@10': 0.4618571428571429, 'trivia-qa-dev-evaluation_base_ndcg@10': 0.5231399731095658}
|
|
|
|
2025-03-28 11:17:01 - Evaluating BAAI/bge-reranker-large
|
|
BAAI/bge-reranker-large
|
|
2025-03-28 11:17:01 - CrossEncoderRerankingEvaluator: Evaluating the model on the trivia-qa-dev-realistic dataset:
|
|
2025-03-28 11:19:48 - Queries: 1000 Positives: Min 1.0, Mean 1.0, Max 1.0 Negatives: Min 29.0, Mean 29.2, Max 30.0
|
|
2025-03-28 11:19:48 - Base -> Reranked
|
|
2025-03-28 11:19:48 - MAP: 46.91 -> 55.68
|
|
2025-03-28 11:19:48 - MRR@10: 46.19 -> 56.87
|
|
2025-03-28 11:19:48 - NDCG@10: 52.31 -> 61.76
|
|
{'trivia-qa-dev-realistic_map': 0.5567810015278374, 'trivia-qa-dev-realistic_mrr@10': 0.5687162698412699, 'trivia-qa-dev-realistic_ndcg@10': 0.6176060985342933, 'trivia-qa-dev-realistic_base_map': 0.46913299504084743, 'trivia-qa-dev-realistic_base_mrr@10': 0.4618571428571429, 'trivia-qa-dev-realistic_base_ndcg@10': 0.5231399731095658}
|
|
2025-03-28 11:19:48 - CrossEncoderRerankingEvaluator: Evaluating the model on the trivia-qa-dev-evaluation dataset:
|
|
2025-03-28 11:23:10 - Queries: 1000 Positives: Min 1.0, Mean 1.0, Max 1.0 Negatives: Min 29.0, Mean 29.2, Max 30.0
|
|
2025-03-28 11:23:10 - Base -> Reranked
|
|
2025-03-28 11:23:10 - MAP: 46.91 -> 62.93
|
|
2025-03-28 11:23:10 - MRR@10: 46.19 -> 65.24
|
|
2025-03-28 11:23:10 - NDCG@10: 52.31 -> 69.92
|
|
{'trivia-qa-dev-evaluation_map': 0.6292598276500135, 'trivia-qa-dev-evaluation_mrr@10': 0.6523801587301588, 'trivia-qa-dev-evaluation_ndcg@10': 0.6992497211715496, 'trivia-qa-dev-evaluation_base_map': 0.46913299504084743, 'trivia-qa-dev-evaluation_base_mrr@10': 0.4618571428571429, 'trivia-qa-dev-evaluation_base_ndcg@10': 0.5231399731095658}
|
|
|
|
bge-reranker-v2-m3
|
|
2025-03-28 11:33:42 - Queries: 1000 Positives: Min 1.0, Mean 1.0, Max 1.0 Negatives: Min 29.0, Mean 29.2, Max 30.0
|
|
2025-03-28 11:33:42 - Base -> Reranked
|
|
2025-03-28 11:33:42 - MAP: 46.91 -> 59.46
|
|
2025-03-28 11:33:42 - MRR@10: 46.19 -> 60.49
|
|
2025-03-28 11:33:42 - NDCG@10: 52.31 -> 64.85
|
|
{'trivia-qa-dev-realistic_map': 0.5945974714456489, 'trivia-qa-dev-realistic_mrr@10': 0.6049440476190477, 'trivia-qa-dev-realistic_ndcg@10': 0.6485089522801432, 'trivia-qa-dev-realistic_base_map': 0.46913299504084743, 'trivia-qa-dev-realistic_base_mrr@10': 0.4618571428571429, 'trivia-qa-dev-realistic_base_ndcg@10': 0.523139973109566}
|
|
2025-03-28 11:33:42 - CrossEncoderRerankingEvaluator: Evaluating the model on the gooaq-dev-evaluation dataset:
|
|
2025-03-28 11:41:37 - Queries: 1000 Positives: Min 1.0, Mean 1.0, Max 1.0 Negatives: Min 29.0, Mean 29.2, Max 30.0
|
|
2025-03-28 11:41:37 - Base -> Reranked
|
|
2025-03-28 11:41:37 - MAP: 46.91 -> 67.12
|
|
2025-03-28 11:41:37 - MRR@10: 46.19 -> 68.82
|
|
2025-03-28 11:41:37 - NDCG@10: 52.31 -> 73.29
|
|
{'gooaq-dev-evaluation_map': 0.6712088650084718, 'gooaq-dev-evaluation_mrr@10': 0.6881884920634921, 'gooaq-dev-evaluation_ndcg@10': 0.7329280539251892, 'gooaq-dev-evaluation_base_map': 0.46913299504084743, 'gooaq-dev-evaluation_base_mrr@10': 0.4618571428571429, 'gooaq-dev-evaluation_base_ndcg@10': 0.523139973109566}
|
|
|
|
2025-03-28 11:41:38 - Evaluating mixedbread-ai/mxbai-rerank-xsmall-v1
|
|
mixedbread-ai/mxbai-rerank-xsmall-v1
|
|
2025-03-28 11:41:38 - CrossEncoderRerankingEvaluator: Evaluating the model on the gooaq-dev-realistic dataset:
|
|
2025-03-28 11:42:59 - Queries: 1000 Positives: Min 1.0, Mean 1.0, Max 1.0 Negatives: Min 29.0, Mean 29.2, Max 30.0
|
|
2025-03-28 11:42:59 - Base -> Reranked
|
|
2025-03-28 11:42:59 - MAP: 46.91 -> 58.07
|
|
2025-03-28 11:42:59 - MRR@10: 46.19 -> 57.79
|
|
2025-03-28 11:42:59 - NDCG@10: 52.31 -> 62.53
|
|
{'gooaq-dev-realistic_map': 0.5806904579481978, 'gooaq-dev-realistic_mrr@10': 0.5778678571428572, 'gooaq-dev-realistic_ndcg@10': 0.6252806166705941, 'gooaq-dev-realistic_base_map': 0.46913299504084743, 'gooaq-dev-realistic_base_mrr@10': 0.4618571428571429, 'gooaq-dev-realistic_base_ndcg@10': 0.523139973109566}
|
|
2025-03-28 11:42:59 - CrossEncoderRerankingEvaluator: Evaluating the model on the gooaq-dev-evaluation dataset:
|
|
2025-03-28 11:44:37 - Queries: 1000 Positives: Min 1.0, Mean 1.0, Max 1.0 Negatives: Min 29.0, Mean 29.2, Max 30.0
|
|
2025-03-28 11:44:37 - Base -> Reranked
|
|
2025-03-28 11:44:37 - MAP: 46.91 -> 63.42
|
|
2025-03-28 11:44:37 - MRR@10: 46.19 -> 63.23
|
|
2025-03-28 11:44:37 - NDCG@10: 52.31 -> 68.38
|
|
{'gooaq-dev-evaluation_map': 0.6341866885097522, 'gooaq-dev-evaluation_mrr@10': 0.6323373015873017, 'gooaq-dev-evaluation_ndcg@10': 0.683834674484044, 'gooaq-dev-evaluation_base_map': 0.46913299504084743, 'gooaq-dev-evaluation_base_mrr@10': 0.4618571428571429, 'gooaq-dev-evaluation_base_ndcg@10': 0.523139973109566}
|
|
|
|
2025-03-28 11:44:38 - Evaluating mixedbread-ai/mxbai-rerank-base-v1
|
|
mixedbread-ai/mxbai-rerank-base-v1
|
|
2025-03-28 11:44:38 - CrossEncoderRerankingEvaluator: Evaluating the model on the gooaq-dev-realistic dataset:
|
|
2025-03-28 11:47:02 - Queries: 1000 Positives: Min 1.0, Mean 1.0, Max 1.0 Negatives: Min 29.0, Mean 29.2, Max 30.0
|
|
2025-03-28 11:47:02 - Base -> Reranked
|
|
2025-03-28 11:47:02 - MAP: 46.91 -> 55.81
|
|
2025-03-28 11:47:02 - MRR@10: 46.19 -> 55.68
|
|
2025-03-28 11:47:02 - NDCG@10: 52.31 -> 60.99
|
|
{'gooaq-dev-realistic_map': 0.5580880756399167, 'gooaq-dev-realistic_mrr@10': 0.5567904761904762, 'gooaq-dev-realistic_ndcg@10': 0.6099184869749001, 'gooaq-dev-realistic_base_map': 0.46913299504084743, 'gooaq-dev-realistic_base_mrr@10': 0.4618571428571429, 'gooaq-dev-realistic_base_ndcg@10': 0.523139973109566}
|
|
2025-03-28 11:47:02 - CrossEncoderRerankingEvaluator: Evaluating the model on the gooaq-dev-evaluation dataset:
|
|
2025-03-28 11:49:56 - Queries: 1000 Positives: Min 1.0, Mean 1.0, Max 1.0 Negatives: Min 29.0, Mean 29.2, Max 30.0
|
|
2025-03-28 11:49:56 - Base -> Reranked
|
|
2025-03-28 11:49:56 - MAP: 46.91 -> 62.12
|
|
2025-03-28 11:49:56 - MRR@10: 46.19 -> 61.81
|
|
2025-03-28 11:49:56 - NDCG@10: 52.31 -> 67.69
|
|
{'gooaq-dev-evaluation_map': 0.62120330763951, 'gooaq-dev-evaluation_mrr@10': 0.6180714285714286, 'gooaq-dev-evaluation_ndcg@10': 0.6769237801354084, 'gooaq-dev-evaluation_base_map': 0.46913299504084743, 'gooaq-dev-evaluation_base_mrr@10': 0.4618571428571429, 'gooaq-dev-evaluation_base_ndcg@10': 0.523139973109566}
|
|
|
|
2025-03-28 11:49:57 - Evaluating mixedbread-ai/mxbai-rerank-large-v1
|
|
mixedbread-ai/mxbai-rerank-large-v1
|
|
2025-03-28 11:49:57 - CrossEncoderRerankingEvaluator: Evaluating the model on the gooaq-dev-realistic dataset:
|
|
2025-03-28 11:56:18 - Queries: 1000 Positives: Min 1.0, Mean 1.0, Max 1.0 Negatives: Min 29.0, Mean 29.2, Max 30.0
|
|
2025-03-28 11:56:18 - Base -> Reranked
|
|
2025-03-28 11:56:18 - MAP: 46.91 -> 58.13
|
|
2025-03-28 11:56:18 - MRR@10: 46.19 -> 58.63
|
|
2025-03-28 11:56:18 - NDCG@10: 52.31 -> 63.38
|
|
{'gooaq-dev-realistic_map': 0.5813141616006278, 'gooaq-dev-realistic_mrr@10': 0.5862551587301587, 'gooaq-dev-realistic_ndcg@10': 0.6337779476332643, 'gooaq-dev-realistic_base_map': 0.46913299504084743, 'gooaq-dev-realistic_base_mrr@10': 0.4618571428571429, 'gooaq-dev-realistic_base_ndcg@10': 0.523139973109566}
|
|
2025-03-28 11:56:18 - CrossEncoderRerankingEvaluator: Evaluating the model on the gooaq-dev-evaluation dataset:
|
|
2025-03-28 12:03:56 - Queries: 1000 Positives: Min 1.0, Mean 1.0, Max 1.0 Negatives: Min 29.0, Mean 29.2, Max 30.0
|
|
2025-03-28 12:03:56 - Base -> Reranked
|
|
2025-03-28 12:03:56 - MAP: 46.91 -> 64.79
|
|
2025-03-28 12:03:56 - MRR@10: 46.19 -> 65.64
|
|
2025-03-28 12:03:56 - NDCG@10: 52.31 -> 70.65
|
|
{'gooaq-dev-evaluation_map': 0.64787303000053, 'gooaq-dev-evaluation_mrr@10': 0.6563722222222222, 'gooaq-dev-evaluation_ndcg@10': 0.7065480874647052, 'gooaq-dev-evaluation_base_map': 0.46913299504084743, 'gooaq-dev-evaluation_base_mrr@10': 0.4618571428571429, 'gooaq-dev-evaluation_base_ndcg@10': 0.523139973109566}
|
|
|
|
2025-03-28 12:03:58 - Evaluating Alibaba-NLP/gte-reranker-modernbert-base
|
|
Alibaba-NLP/gte-reranker-modernbert-base
|
|
2025-03-28 12:03:58 - CrossEncoderRerankingEvaluator: Evaluating the model on the gooaq-dev-realistic dataset:
|
|
2025-03-28 12:07:46 - Queries: 1000 Positives: Min 1.0, Mean 1.0, Max 1.0 Negatives: Min 29.0, Mean 29.2, Max 30.0
|
|
2025-03-28 12:07:46 - Base -> Reranked
|
|
2025-03-28 12:07:46 - MAP: 46.91 -> 61.20
|
|
2025-03-28 12:07:46 - MRR@10: 46.19 -> 61.96
|
|
2025-03-28 12:07:46 - NDCG@10: 52.31 -> 65.92
|
|
{'gooaq-dev-realistic_map': 0.6119823635603096, 'gooaq-dev-realistic_mrr@10': 0.6195595238095238, 'gooaq-dev-realistic_ndcg@10': 0.6591716946749766, 'gooaq-dev-realistic_base_map': 0.46913299504084743, 'gooaq-dev-realistic_base_mrr@10': 0.4618571428571429, 'gooaq-dev-realistic_base_ndcg@10': 0.523139973109566}
|
|
2025-03-28 12:12:23 - Queries: 1000 Positives: Min 1.0, Mean 1.0, Max 1.0 Negatives: Min 29.0, Mean 29.2, Max 30.0
|
|
2025-03-28 12:12:23 - Base -> Reranked
|
|
2025-03-28 12:12:23 - MAP: 46.91 -> 69.22
|
|
2025-03-28 12:12:23 - MRR@10: 46.19 -> 70.84
|
|
2025-03-28 12:12:23 - NDCG@10: 52.31 -> 74.88
|
|
{'gooaq-dev-evaluation_map': 0.6921869586576195, 'gooaq-dev-evaluation_mrr@10': 0.7084460317460317, 'gooaq-dev-evaluation_ndcg@10': 0.7487587539162411, 'gooaq-dev-evaluation_base_map': 0.46913299504084743, 'gooaq-dev-evaluation_base_mrr@10': 0.4618571428571429, 'gooaq-dev-evaluation_base_ndcg@10': 0.523139973109566}
|
|
""" |