colbert-muvera-micro / README.md

davidmezzetti

Upload model

5a4d956 about 1 month ago

preview code

raw

history blame contribute delete

31.6 kB

metadata

# Tags attached to this model card
tags:
  - ColBERT
  - PyLate
  - sentence-transformers
  - sentence-similarity
  - feature-extraction
  - generated_from_trainer
  - dataset_size:640000
  - loss:Distillation
# Checkpoint this model was fine-tuned from
base_model: google/bert_uncased_L-2_H-128_A-2
# Dataset used for fine-tuning
datasets:
  - lightonai/ms-marco-en-bge-gemma-unnormalized
pipeline_tag: sentence-similarity
library_name: PyLate
license: apache-2.0
# Metric identifiers; each one is reported per dataset under model-index
metrics:
  - MaxSim_accuracy@1
  - MaxSim_accuracy@3
  - MaxSim_accuracy@5
  - MaxSim_accuracy@10
  - MaxSim_precision@1
  - MaxSim_precision@3
  - MaxSim_precision@5
  - MaxSim_precision@10
  - MaxSim_recall@1
  - MaxSim_recall@3
  - MaxSim_recall@5
  - MaxSim_recall@10
  - MaxSim_ndcg@10
  - MaxSim_mrr@10
  - MaxSim_map@100
model-index:
  - name: ColBERT MUVERA Micro
    results:
      - task:
          type: py-late-information-retrieval
          name: Py Late Information Retrieval
        dataset:
          name: NanoClimateFEVER
          type: NanoClimateFEVER
        metrics:
          - type: MaxSim_accuracy@1
            value: 0.26
            name: Maxsim Accuracy@1
          - type: MaxSim_accuracy@3
            value: 0.36
            name: Maxsim Accuracy@3
          - type: MaxSim_accuracy@5
            value: 0.4
            name: Maxsim Accuracy@5
          - type: MaxSim_accuracy@10
            value: 0.58
            name: Maxsim Accuracy@10
          - type: MaxSim_precision@1
            value: 0.26
            name: Maxsim Precision@1
          - type: MaxSim_precision@3
            value: 0.12666666666666665
            name: Maxsim Precision@3
          - type: MaxSim_precision@5
            value: 0.092
            name: Maxsim Precision@5
          - type: MaxSim_precision@10
            value: 0.07800000000000001
            name: Maxsim Precision@10
          - type: MaxSim_recall@1
            value: 0.11233333333333333
            name: Maxsim Recall@1
          - type: MaxSim_recall@3
            value: 0.16066666666666665
            name: Maxsim Recall@3
          - type: MaxSim_recall@5
            value: 0.184
            name: Maxsim Recall@5
          - type: MaxSim_recall@10
            value: 0.3206666666666667
            name: Maxsim Recall@10
          - type: MaxSim_ndcg@10
            value: 0.24408616743142095
            name: Maxsim Ndcg@10
          - type: MaxSim_mrr@10
            value: 0.33196825396825397
            name: Maxsim Mrr@10
          - type: MaxSim_map@100
            value: 0.18128382432733356
            name: Maxsim Map@100
      - task:
          type: py-late-information-retrieval
          name: Py Late Information Retrieval
        dataset:
          name: NanoDBPedia
          type: NanoDBPedia
        metrics:
          - type: MaxSim_accuracy@1
            value: 0.68
            name: Maxsim Accuracy@1
          - type: MaxSim_accuracy@3
            value: 0.86
            name: Maxsim Accuracy@3
          - type: MaxSim_accuracy@5
            value: 0.92
            name: Maxsim Accuracy@5
          - type: MaxSim_accuracy@10
            value: 0.94
            name: Maxsim Accuracy@10
          - type: MaxSim_precision@1
            value: 0.68
            name: Maxsim Precision@1
          - type: MaxSim_precision@3
            value: 0.6066666666666667
            name: Maxsim Precision@3
          - type: MaxSim_precision@5
            value: 0.56
            name: Maxsim Precision@5
          - type: MaxSim_precision@10
            value: 0.502
            name: Maxsim Precision@10
          - type: MaxSim_recall@1
            value: 0.05322585293904511
            name: Maxsim Recall@1
          - type: MaxSim_recall@3
            value: 0.16789568954347403
            name: Maxsim Recall@3
          - type: MaxSim_recall@5
            value: 0.22988072374930787
            name: Maxsim Recall@5
          - type: MaxSim_recall@10
            value: 0.35043982767195947
            name: Maxsim Recall@10
          - type: MaxSim_ndcg@10
            value: 0.6003406576207015
            name: Maxsim Ndcg@10
          - type: MaxSim_mrr@10
            value: 0.7850000000000001
            name: Maxsim Mrr@10
          - type: MaxSim_map@100
            value: 0.4687280514608297
            name: Maxsim Map@100
      - task:
          type: py-late-information-retrieval
          name: Py Late Information Retrieval
        dataset:
          name: NanoFEVER
          type: NanoFEVER
        metrics:
          - type: MaxSim_accuracy@1
            value: 0.72
            name: Maxsim Accuracy@1
          - type: MaxSim_accuracy@3
            value: 0.78
            name: Maxsim Accuracy@3
          - type: MaxSim_accuracy@5
            value: 0.84
            name: Maxsim Accuracy@5
          - type: MaxSim_accuracy@10
            value: 0.9
            name: Maxsim Accuracy@10
          - type: MaxSim_precision@1
            value: 0.72
            name: Maxsim Precision@1
          - type: MaxSim_precision@3
            value: 0.2733333333333333
            name: Maxsim Precision@3
          - type: MaxSim_precision@5
            value: 0.18
            name: Maxsim Precision@5
          - type: MaxSim_precision@10
            value: 0.1
            name: Maxsim Precision@10
          - type: MaxSim_recall@1
            value: 0.6866666666666668
            name: Maxsim Recall@1
          - type: MaxSim_recall@3
            value: 0.7633333333333333
            name: Maxsim Recall@3
          - type: MaxSim_recall@5
            value: 0.82
            name: Maxsim Recall@5
          - type: MaxSim_recall@10
            value: 0.89
            name: Maxsim Recall@10
          - type: MaxSim_ndcg@10
            value: 0.7955242043086649
            name: Maxsim Ndcg@10
          - type: MaxSim_mrr@10
            value: 0.7731666666666667
            name: Maxsim Mrr@10
          - type: MaxSim_map@100
            value: 0.7676133768765347
            name: Maxsim Map@100
      - task:
          type: py-late-information-retrieval
          name: Py Late Information Retrieval
        dataset:
          name: NanoFiQA2018
          type: NanoFiQA2018
        metrics:
          - type: MaxSim_accuracy@1
            value: 0.3
            name: Maxsim Accuracy@1
          - type: MaxSim_accuracy@3
            value: 0.54
            name: Maxsim Accuracy@3
          - type: MaxSim_accuracy@5
            value: 0.58
            name: Maxsim Accuracy@5
          - type: MaxSim_accuracy@10
            value: 0.66
            name: Maxsim Accuracy@10
          - type: MaxSim_precision@1
            value: 0.3
            name: Maxsim Precision@1
          - type: MaxSim_precision@3
            value: 0.2333333333333333
            name: Maxsim Precision@3
          - type: MaxSim_precision@5
            value: 0.17200000000000004
            name: Maxsim Precision@5
          - type: MaxSim_precision@10
            value: 0.10800000000000001
            name: Maxsim Precision@10
          - type: MaxSim_recall@1
            value: 0.1770793650793651
            name: Maxsim Recall@1
          - type: MaxSim_recall@3
            value: 0.3453492063492064
            name: Maxsim Recall@3
          - type: MaxSim_recall@5
            value: 0.4009047619047619
            name: Maxsim Recall@5
          - type: MaxSim_recall@10
            value: 0.4740952380952381
            name: Maxsim Recall@10
          - type: MaxSim_ndcg@10
            value: 0.38709436118795515
            name: Maxsim Ndcg@10
          - type: MaxSim_mrr@10
            value: 0.4288015873015872
            name: Maxsim Mrr@10
          - type: MaxSim_map@100
            value: 0.3297000135708943
            name: Maxsim Map@100
      - task:
          type: py-late-information-retrieval
          name: Py Late Information Retrieval
        dataset:
          name: NanoHotpotQA
          type: NanoHotpotQA
        metrics:
          - type: MaxSim_accuracy@1
            value: 0.94
            name: Maxsim Accuracy@1
          - type: MaxSim_accuracy@3
            value: 0.94
            name: Maxsim Accuracy@3
          - type: MaxSim_accuracy@5
            value: 0.98
            name: Maxsim Accuracy@5
          - type: MaxSim_accuracy@10
            value: 1
            name: Maxsim Accuracy@10
          - type: MaxSim_precision@1
            value: 0.94
            name: Maxsim Precision@1
          - type: MaxSim_precision@3
            value: 0.5
            name: Maxsim Precision@3
          - type: MaxSim_precision@5
            value: 0.31200000000000006
            name: Maxsim Precision@5
          - type: MaxSim_precision@10
            value: 0.16599999999999995
            name: Maxsim Precision@10
          - type: MaxSim_recall@1
            value: 0.47
            name: Maxsim Recall@1
          - type: MaxSim_recall@3
            value: 0.75
            name: Maxsim Recall@3
          - type: MaxSim_recall@5
            value: 0.78
            name: Maxsim Recall@5
          - type: MaxSim_recall@10
            value: 0.83
            name: Maxsim Recall@10
          - type: MaxSim_ndcg@10
            value: 0.8179728241272247
            name: Maxsim Ndcg@10
          - type: MaxSim_mrr@10
            value: 0.9512222222222222
            name: Maxsim Mrr@10
          - type: MaxSim_map@100
            value: 0.7611883462001594
            name: Maxsim Map@100
      - task:
          type: py-late-information-retrieval
          name: Py Late Information Retrieval
        dataset:
          name: NanoMSMARCO
          type: NanoMSMARCO
        metrics:
          - type: MaxSim_accuracy@1
            value: 0.42
            name: Maxsim Accuracy@1
          - type: MaxSim_accuracy@3
            value: 0.66
            name: Maxsim Accuracy@3
          - type: MaxSim_accuracy@5
            value: 0.68
            name: Maxsim Accuracy@5
          - type: MaxSim_accuracy@10
            value: 0.78
            name: Maxsim Accuracy@10
          - type: MaxSim_precision@1
            value: 0.42
            name: Maxsim Precision@1
          - type: MaxSim_precision@3
            value: 0.22
            name: Maxsim Precision@3
          - type: MaxSim_precision@5
            value: 0.136
            name: Maxsim Precision@5
          - type: MaxSim_precision@10
            value: 0.07800000000000001
            name: Maxsim Precision@10
          - type: MaxSim_recall@1
            value: 0.42
            name: Maxsim Recall@1
          - type: MaxSim_recall@3
            value: 0.66
            name: Maxsim Recall@3
          - type: MaxSim_recall@5
            value: 0.68
            name: Maxsim Recall@5
          - type: MaxSim_recall@10
            value: 0.78
            name: Maxsim Recall@10
          - type: MaxSim_ndcg@10
            value: 0.5976880189340548
            name: Maxsim Ndcg@10
          - type: MaxSim_mrr@10
            value: 0.5393809523809523
            name: Maxsim Mrr@10
          - type: MaxSim_map@100
            value: 0.5531015913611822
            name: Maxsim Map@100
      - task:
          type: py-late-information-retrieval
          name: Py Late Information Retrieval
        dataset:
          name: NanoNFCorpus
          type: NanoNFCorpus
        metrics:
          - type: MaxSim_accuracy@1
            value: 0.46
            name: Maxsim Accuracy@1
          - type: MaxSim_accuracy@3
            value: 0.58
            name: Maxsim Accuracy@3
          - type: MaxSim_accuracy@5
            value: 0.62
            name: Maxsim Accuracy@5
          - type: MaxSim_accuracy@10
            value: 0.68
            name: Maxsim Accuracy@10
          - type: MaxSim_precision@1
            value: 0.46
            name: Maxsim Precision@1
          - type: MaxSim_precision@3
            value: 0.38
            name: Maxsim Precision@3
          - type: MaxSim_precision@5
            value: 0.324
            name: Maxsim Precision@5
          - type: MaxSim_precision@10
            value: 0.272
            name: Maxsim Precision@10
          - type: MaxSim_recall@1
            value: 0.04276439372638386
            name: Maxsim Recall@1
          - type: MaxSim_recall@3
            value: 0.07977851865112022
            name: Maxsim Recall@3
          - type: MaxSim_recall@5
            value: 0.11439841040272719
            name: Maxsim Recall@5
          - type: MaxSim_recall@10
            value: 0.1391695106171535
            name: Maxsim Recall@10
          - type: MaxSim_ndcg@10
            value: 0.34241148621124995
            name: Maxsim Ndcg@10
          - type: MaxSim_mrr@10
            value: 0.5320000000000001
            name: Maxsim Mrr@10
          - type: MaxSim_map@100
            value: 0.14897381866568696
            name: Maxsim Map@100
      - task:
          type: py-late-information-retrieval
          name: Py Late Information Retrieval
        dataset:
          name: NanoNQ
          type: NanoNQ
        metrics:
          - type: MaxSim_accuracy@1
            value: 0.42
            name: Maxsim Accuracy@1
          - type: MaxSim_accuracy@3
            value: 0.68
            name: Maxsim Accuracy@3
          - type: MaxSim_accuracy@5
            value: 0.74
            name: Maxsim Accuracy@5
          - type: MaxSim_accuracy@10
            value: 0.84
            name: Maxsim Accuracy@10
          - type: MaxSim_precision@1
            value: 0.42
            name: Maxsim Precision@1
          - type: MaxSim_precision@3
            value: 0.23333333333333328
            name: Maxsim Precision@3
          - type: MaxSim_precision@5
            value: 0.15200000000000002
            name: Maxsim Precision@5
          - type: MaxSim_precision@10
            value: 0.086
            name: Maxsim Precision@10
          - type: MaxSim_recall@1
            value: 0.4
            name: Maxsim Recall@1
          - type: MaxSim_recall@3
            value: 0.66
            name: Maxsim Recall@3
          - type: MaxSim_recall@5
            value: 0.72
            name: Maxsim Recall@5
          - type: MaxSim_recall@10
            value: 0.79
            name: Maxsim Recall@10
          - type: MaxSim_ndcg@10
            value: 0.6184738987111722
            name: Maxsim Ndcg@10
          - type: MaxSim_mrr@10
            value: 0.5763888888888888
            name: Maxsim Mrr@10
          - type: MaxSim_map@100
            value: 0.5642312927870203
            name: Maxsim Map@100
      - task:
          type: py-late-information-retrieval
          name: Py Late Information Retrieval
        dataset:
          name: NanoQuoraRetrieval
          type: NanoQuoraRetrieval
        metrics:
          - type: MaxSim_accuracy@1
            value: 0.8
            name: Maxsim Accuracy@1
          - type: MaxSim_accuracy@3
            value: 0.92
            name: Maxsim Accuracy@3
          - type: MaxSim_accuracy@5
            value: 0.94
            name: Maxsim Accuracy@5
          - type: MaxSim_accuracy@10
            value: 0.96
            name: Maxsim Accuracy@10
          - type: MaxSim_precision@1
            value: 0.8
            name: Maxsim Precision@1
          - type: MaxSim_precision@3
            value: 0.3399999999999999
            name: Maxsim Precision@3
          - type: MaxSim_precision@5
            value: 0.22399999999999998
            name: Maxsim Precision@5
          - type: MaxSim_precision@10
            value: 0.11999999999999998
            name: Maxsim Precision@10
          - type: MaxSim_recall@1
            value: 0.7239999999999999
            name: Maxsim Recall@1
          - type: MaxSim_recall@3
            value: 0.8473333333333334
            name: Maxsim Recall@3
          - type: MaxSim_recall@5
            value: 0.9006666666666666
            name: Maxsim Recall@5
          - type: MaxSim_recall@10
            value: 0.9373333333333334
            name: Maxsim Recall@10
          - type: MaxSim_ndcg@10
            value: 0.863105292852843
            name: Maxsim Ndcg@10
          - type: MaxSim_mrr@10
            value: 0.8611904761904764
            name: Maxsim Mrr@10
          - type: MaxSim_map@100
            value: 0.8312823701317842
            name: Maxsim Map@100
      - task:
          type: py-late-information-retrieval
          name: Py Late Information Retrieval
        dataset:
          name: NanoSCIDOCS
          type: NanoSCIDOCS
        metrics:
          - type: MaxSim_accuracy@1
            value: 0.42
            name: Maxsim Accuracy@1
          - type: MaxSim_accuracy@3
            value: 0.58
            name: Maxsim Accuracy@3
          - type: MaxSim_accuracy@5
            value: 0.64
            name: Maxsim Accuracy@5
          - type: MaxSim_accuracy@10
            value: 0.7
            name: Maxsim Accuracy@10
          - type: MaxSim_precision@1
            value: 0.42
            name: Maxsim Precision@1
          - type: MaxSim_precision@3
            value: 0.2866666666666667
            name: Maxsim Precision@3
          - type: MaxSim_precision@5
            value: 0.20799999999999996
            name: Maxsim Precision@5
          - type: MaxSim_precision@10
            value: 0.138
            name: Maxsim Precision@10
          - type: MaxSim_recall@1
            value: 0.085
            name: Maxsim Recall@1
          - type: MaxSim_recall@3
            value: 0.17666666666666664
            name: Maxsim Recall@3
          - type: MaxSim_recall@5
            value: 0.21366666666666667
            name: Maxsim Recall@5
          - type: MaxSim_recall@10
            value: 0.2826666666666667
            name: Maxsim Recall@10
          - type: MaxSim_ndcg@10
            value: 0.2889801789850345
            name: Maxsim Ndcg@10
          - type: MaxSim_mrr@10
            value: 0.5005
            name: Maxsim Mrr@10
          - type: MaxSim_map@100
            value: 0.21685607444339383
            name: Maxsim Map@100
      - task:
          type: py-late-information-retrieval
          name: Py Late Information Retrieval
        dataset:
          name: NanoArguAna
          type: NanoArguAna
        metrics:
          - type: MaxSim_accuracy@1
            value: 0.2
            name: Maxsim Accuracy@1
          - type: MaxSim_accuracy@3
            value: 0.44
            name: Maxsim Accuracy@3
          - type: MaxSim_accuracy@5
            value: 0.5
            name: Maxsim Accuracy@5
          - type: MaxSim_accuracy@10
            value: 0.64
            name: Maxsim Accuracy@10
          - type: MaxSim_precision@1
            value: 0.2
            name: Maxsim Precision@1
          - type: MaxSim_precision@3
            value: 0.14666666666666664
            name: Maxsim Precision@3
          - type: MaxSim_precision@5
            value: 0.1
            name: Maxsim Precision@5
          - type: MaxSim_precision@10
            value: 0.064
            name: Maxsim Precision@10
          - type: MaxSim_recall@1
            value: 0.2
            name: Maxsim Recall@1
          - type: MaxSim_recall@3
            value: 0.44
            name: Maxsim Recall@3
          - type: MaxSim_recall@5
            value: 0.5
            name: Maxsim Recall@5
          - type: MaxSim_recall@10
            value: 0.64
            name: Maxsim Recall@10
          - type: MaxSim_ndcg@10
            value: 0.4151392430544827
            name: Maxsim Ndcg@10
          - type: MaxSim_mrr@10
            value: 0.3440555555555555
            name: Maxsim Mrr@10
          - type: MaxSim_map@100
            value: 0.3521906424035335
            name: Maxsim Map@100
      - task:
          type: py-late-information-retrieval
          name: Py Late Information Retrieval
        dataset:
          name: NanoSciFact
          type: NanoSciFact
        metrics:
          - type: MaxSim_accuracy@1
            value: 0.58
            name: Maxsim Accuracy@1
          - type: MaxSim_accuracy@3
            value: 0.76
            name: Maxsim Accuracy@3
          - type: MaxSim_accuracy@5
            value: 0.82
            name: Maxsim Accuracy@5
          - type: MaxSim_accuracy@10
            value: 0.86
            name: Maxsim Accuracy@10
          - type: MaxSim_precision@1
            value: 0.58
            name: Maxsim Precision@1
          - type: MaxSim_precision@3
            value: 0.2733333333333333
            name: Maxsim Precision@3
          - type: MaxSim_precision@5
            value: 0.18
            name: Maxsim Precision@5
          - type: MaxSim_precision@10
            value: 0.09399999999999999
            name: Maxsim Precision@10
          - type: MaxSim_recall@1
            value: 0.555
            name: Maxsim Recall@1
          - type: MaxSim_recall@3
            value: 0.735
            name: Maxsim Recall@3
          - type: MaxSim_recall@5
            value: 0.8
            name: Maxsim Recall@5
          - type: MaxSim_recall@10
            value: 0.84
            name: Maxsim Recall@10
          - type: MaxSim_ndcg@10
            value: 0.7153590631749926
            name: Maxsim Ndcg@10
          - type: MaxSim_mrr@10
            value: 0.6798333333333333
            name: Maxsim Mrr@10
          - type: MaxSim_map@100
            value: 0.6760413640032285
            name: Maxsim Map@100
      - task:
          type: py-late-information-retrieval
          name: Py Late Information Retrieval
        dataset:
          name: NanoTouche2020
          type: NanoTouche2020
        metrics:
          - type: MaxSim_accuracy@1
            value: 0.7551020408163265
            name: Maxsim Accuracy@1
          - type: MaxSim_accuracy@3
            value: 1
            name: Maxsim Accuracy@3
          - type: MaxSim_accuracy@5
            value: 1
            name: Maxsim Accuracy@5
          - type: MaxSim_accuracy@10
            value: 1
            name: Maxsim Accuracy@10
          - type: MaxSim_precision@1
            value: 0.7551020408163265
            name: Maxsim Precision@1
          - type: MaxSim_precision@3
            value: 0.6734693877551019
            name: Maxsim Precision@3
          - type: MaxSim_precision@5
            value: 0.6000000000000001
            name: Maxsim Precision@5
          - type: MaxSim_precision@10
            value: 0.5285714285714286
            name: Maxsim Precision@10
          - type: MaxSim_recall@1
            value: 0.050375728116040484
            name: Maxsim Recall@1
          - type: MaxSim_recall@3
            value: 0.13379303377518686
            name: Maxsim Recall@3
          - type: MaxSim_recall@5
            value: 0.19744749683082305
            name: Maxsim Recall@5
          - type: MaxSim_recall@10
            value: 0.3328396127707909
            name: Maxsim Recall@10
          - type: MaxSim_ndcg@10
            value: 0.5927407647152685
            name: Maxsim Ndcg@10
          - type: MaxSim_mrr@10
            value: 0.8639455782312924
            name: Maxsim Mrr@10
          - type: MaxSim_map@100
            value: 0.4115661843314275
            name: Maxsim Map@100
      - task:
          type: nano-beir
          name: Nano BEIR
        dataset:
          name: NanoBEIR mean
          type: NanoBEIR_mean
        metrics:
          - type: MaxSim_accuracy@1
            value: 0.5350078492935635
            name: Maxsim Accuracy@1
          - type: MaxSim_accuracy@3
            value: 0.7000000000000001
            name: Maxsim Accuracy@3
          - type: MaxSim_accuracy@5
            value: 0.743076923076923
            name: Maxsim Accuracy@5
          - type: MaxSim_accuracy@10
            value: 0.8107692307692307
            name: Maxsim Accuracy@10
          - type: MaxSim_precision@1
            value: 0.5350078492935635
            name: Maxsim Precision@1
          - type: MaxSim_precision@3
            value: 0.33026687598116167
            name: Maxsim Precision@3
          - type: MaxSim_precision@5
            value: 0.24923076923076928
            name: Maxsim Precision@5
          - type: MaxSim_precision@10
            value: 0.1795824175824176
            name: Maxsim Precision@10
          - type: MaxSim_recall@1
            value: 0.3058804107585258
            name: Maxsim Recall@1
          - type: MaxSim_recall@3
            value: 0.45537049602453755
            name: Maxsim Recall@3
          - type: MaxSim_recall@5
            value: 0.5031511327862271
            name: Maxsim Recall@5
          - type: MaxSim_recall@10
            value: 0.5851700658324468
            name: Maxsim Recall@10
          - type: MaxSim_ndcg@10
            value: 0.5599166277934665
            name: Maxsim Ndcg@10
          - type: MaxSim_mrr@10
            value: 0.6282656549799407
            name: Maxsim Mrr@10
          - type: MaxSim_map@100
            value: 0.4817505346586929
            name: Maxsim Map@100

ColBERT MUVERA Micro

This is a PyLate model fine-tuned from google/bert_uncased_L-2_H-128_A-2 on the lightonai/ms-marco-en-bge-gemma-unnormalized dataset. It maps sentences & paragraphs to sequences of 128-dimensional dense vectors and can be used for semantic textual similarity using the MaxSim operator.

This model is trained with un-normalized scores, making it compatible with MUVERA fixed-dimensional encoding.

Usage (txtai)

This model can be used to build embeddings databases with txtai for semantic search and/or as a knowledge source for retrieval augmented generation (RAG).

Note: txtai 9.0+ is required for late interaction model support

import txtai

# Load the late interaction model as the index's vectors model via `path`.
# The `sparse` setting configures a sparse-vector encoder, which is not what
# a ColBERT-style model is. `content=True` stores document content in the index.
embeddings = txtai.Embeddings(
  path="neuml/colbert-muvera-micro",
  content=True
)

# documents() is a placeholder - supply your own iterable of documents to index
embeddings.index(documents())

# Run a query
embeddings.search("query to run")

Late interaction models also excel as reranker pipelines. The example below reuses the `embeddings` instance created above.

from txtai.pipeline import Reranker
from txtai.pipeline import Similarity

# Similarity pipeline backed by the late interaction model
similarity = Similarity(
  path="neuml/colbert-muvera-micro",
  lateencode=True
)

# Combine the `embeddings` instance from the previous snippet with the
# similarity pipeline, then run a query through the reranker
ranker = Reranker(embeddings, similarity)
ranker("query to run")

Usage (PyLate)

Alternatively, the model can be loaded with PyLate.

from pylate import models, rank

# Load the ColBERT checkpoint by name
model = models.ColBERT(model_name_or_path="neuml/colbert-muvera-micro")

# One entry per query
queries = ["query A", "query B"]

# documents[i] lists the candidate documents to rerank for queries[i]
documents = [
    ["document A", "document B"],
    ["document 1", "document C", "document B"],
]

# documents_ids[i][j] is the identifier of documents[i][j]
documents_ids = [[1, 2], [1, 3, 2]]

# Encode both sides; is_query selects query vs. document encoding
queries_embeddings = model.encode(queries, is_query=True)
documents_embeddings = model.encode(documents, is_query=False)

# Rerank each query's candidate documents from the computed embeddings
reranked_documents = rank.rerank(
    documents_ids=documents_ids,
    queries_embeddings=queries_embeddings,
    documents_embeddings=documents_embeddings,
)

Full Model Architecture

ColBERT(
  (0): Transformer({'max_seq_length': 299, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Dense({'in_features': 128, 'out_features': 128, 'bias': False, 'activation_function': 'torch.nn.modules.linear.Identity'})
)

Evaluation

BEIR Subset

The following tables show a subset of BEIR scored with the txtai benchmarks script.

Scores reported are ndcg@10 and grouped into the following three categories.

FULL multi-vector maxsim

Model	Parameters	ArguAna	NFCorpus	SciFact	Average
AnswerAI ColBERT Small v1	33M	0.4440	0.3649	0.7423	0.5171
ColBERT v2	110M	0.4595	0.3165	0.6456	0.4739
ColBERT MUVERA Micro	4M	0.3947	0.3235	0.6676	0.4619
ColBERT MUVERA Small	33M	0.4455	0.3502	0.7145	0.5034
GTE ModernColBERT v1	149M	0.4946	0.3717	0.7529	0.5397

MUVERA encoding + maxsim re-ranking of the top 100 results per MUVERA paper

Model	Parameters	ArguAna	NFCorpus	SciFact	Average
AnswerAI ColBERT Small v1	33M	0.0317	0.1135	0.0836	0.0763
ColBERT v2	110M	0.4562	0.3025	0.6278	0.4622
ColBERT MUVERA Micro	4M	0.3849	0.3095	0.6464	0.4469
ColBERT MUVERA Small	33M	0.4451	0.3537	0.7148	0.5045
GTE ModernColBERT v1	149M	0.0265	0.1052	0.0556	0.0624

MUVERA encoding only

Model	Parameters	ArguAna	NFCorpus	SciFact	Average
AnswerAI ColBERT Small v1	33M	0.0024	0.0201	0.0047	0.0091
ColBERT v2	110M	0.3463	0.2356	0.5002	0.3607
ColBERT MUVERA Micro	4M	0.2795	0.2348	0.4875	0.3339
ColBERT MUVERA Small	33M	0.3850	0.2928	0.6357	0.4378
GTE ModernColBERT v1	149M	0.0003	0.0203	0.0013	0.0073

Note: The scores reported don't match scores reported in the respective papers due to different default settings in the txtai benchmark scripts.

As noted earlier, models trained with min-max score normalization don't perform well with MUVERA encoding. See this GitHub Issue for more.

In reviewing the scores, this model is surprisingly and unreasonably competitive with the original ColBERT v2 model at under 4% of the size (4M vs. 110M parameters)!

Nano BEIR

Dataset: NanoBEIR_mean
Evaluated with pylate.evaluation.nano_beir_evaluator.NanoBEIREvaluator

Metric	Value
MaxSim_accuracy@1	0.535
MaxSim_accuracy@3	0.7
MaxSim_accuracy@5	0.7431
MaxSim_accuracy@10	0.8108
MaxSim_precision@1	0.535
MaxSim_precision@3	0.3303
MaxSim_precision@5	0.2492
MaxSim_precision@10	0.1796
MaxSim_recall@1	0.3059
MaxSim_recall@3	0.4554
MaxSim_recall@5	0.5032
MaxSim_recall@10	0.5852
MaxSim_ndcg@10	0.5599
MaxSim_mrr@10	0.6283
MaxSim_map@100	0.4818

Training Details

Training Hyperparameters

Non-Default Hyperparameters

eval_strategy: steps
per_device_train_batch_size: 32
learning_rate: 0.0003
num_train_epochs: 1
warmup_ratio: 0.05
bf16: True

All Hyperparameters

Click to expand

overwrite_output_dir: False
do_predict: False
eval_strategy: steps
prediction_loss_only: True
per_device_train_batch_size: 32
per_device_eval_batch_size: 8
per_gpu_train_batch_size: None
per_gpu_eval_batch_size: None
gradient_accumulation_steps: 1
eval_accumulation_steps: None
torch_empty_cache_steps: None
learning_rate: 0.0003
weight_decay: 0.0
adam_beta1: 0.9
adam_beta2: 0.999
adam_epsilon: 1e-08
max_grad_norm: 1.0
num_train_epochs: 1
max_steps: -1
lr_scheduler_type: linear
lr_scheduler_kwargs: {}
warmup_ratio: 0.05
warmup_steps: 0
log_level: passive
log_level_replica: warning
log_on_each_node: True
logging_nan_inf_filter: True
save_safetensors: True
save_on_each_node: False
save_only_model: False
restore_callback_states_from_checkpoint: False
no_cuda: False
use_cpu: False
use_mps_device: False
seed: 42
data_seed: None
jit_mode_eval: False
use_ipex: False
bf16: True
fp16: False
fp16_opt_level: O1
half_precision_backend: auto
bf16_full_eval: False
fp16_full_eval: False
tf32: None
local_rank: 0
ddp_backend: None
tpu_num_cores: None
tpu_metrics_debug: False
debug: []
dataloader_drop_last: False
dataloader_num_workers: 0
dataloader_prefetch_factor: None
past_index: -1
disable_tqdm: False
remove_unused_columns: True
label_names: None
load_best_model_at_end: False
ignore_data_skip: False
fsdp: []
fsdp_min_num_params: 0
fsdp_config: {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}
fsdp_transformer_layer_cls_to_wrap: None
accelerator_config: {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}
deepspeed: None
label_smoothing_factor: 0.0
optim: adamw_torch
optim_args: None
adafactor: False
group_by_length: False
length_column_name: length
ddp_find_unused_parameters: None
ddp_bucket_cap_mb: None
ddp_broadcast_buffers: False
dataloader_pin_memory: True
dataloader_persistent_workers: False
skip_memory_metrics: True
use_legacy_prediction_loop: False
push_to_hub: False
resume_from_checkpoint: None
hub_model_id: None
hub_strategy: every_save
hub_private_repo: None
hub_always_push: False
gradient_checkpointing: False
gradient_checkpointing_kwargs: None
include_inputs_for_metrics: False
include_for_metrics: []
eval_do_concat_batches: True
fp16_backend: auto
push_to_hub_model_id: None
push_to_hub_organization: None
mp_parameters:
auto_find_batch_size: False
full_determinism: False
torchdynamo: None
ray_scope: last
ddp_timeout: 1800
torch_compile: False
torch_compile_backend: None
torch_compile_mode: None
include_tokens_per_second: False
include_num_input_tokens_seen: False
neftune_noise_alpha: None
optim_target_modules: None
batch_eval_metrics: False
eval_on_start: False
use_liger_kernel: False
eval_use_gather_object: False
average_tokens_across_devices: False
prompts: None
batch_sampler: batch_sampler
multi_dataset_batch_sampler: proportional

Framework Versions

Python: 3.10.18
Sentence Transformers: 4.0.2
PyLate: 1.3.0
Transformers: 4.52.3
PyTorch: 2.8.0+cu128
Accelerate: 1.10.1
Datasets: 4.0.0
Tokenizers: 0.21.4

Citation

BibTeX

Sentence Transformers

@inproceedings{reimers-2019-sentence-bert,
    title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
    author = "Reimers, Nils and Gurevych, Iryna",
    booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
    month = "11",
    year = "2019",
    publisher = "Association for Computational Linguistics",
    url = "https://arxiv.org/abs/1908.10084"
}

PyLate

@misc{PyLate,
title={PyLate: Flexible Training and Retrieval for Late Interaction Models},
author={Chaffin, Antoine and Sourty, Raphaël},
url={https://github.com/lightonai/pylate},
year={2024}
}