Commit 76a78a0 by Christina Theodoris
Parent: a67c9c0

add option for relabeling data from prior label class dict, update dict paths in manifest

MANIFEST.in CHANGED
@@ -1,9 +1,9 @@
-include geneformer/gene_median_dictionary_gc104m.pkl
-include geneformer/gene_name_id_dict_gc104m.pkl
-include geneformer/ensembl_mapping_dict_gc104m.pkl
-include geneformer/token_dictionary_gc104m.pkl
+include geneformer/gene_median_dictionary_gc104M.pkl
+include geneformer/gene_name_id_dict_gc104M.pkl
+include geneformer/ensembl_mapping_dict_gc104M.pkl
+include geneformer/token_dictionary_gc104M.pkl
 
-include geneformer/gene_dictionaries_30m/gene_median_dictionary_gc30m.pkl
-include geneformer/gene_dictionaries_30m/gene_name_id_dict_gc30m.pkl
-include geneformer/gene_dictionaries_30m/ensembl_mapping_dict_gc30m.pkl
-include geneformer/gene_dictionaries_30m/token_dictionary_gc30m.pkl
+include geneformer/gene_dictionaries_30m/gene_median_dictionary_gc30M.pkl
+include geneformer/gene_dictionaries_30m/gene_name_id_dict_gc30M.pkl
+include geneformer/gene_dictionaries_30m/ensembl_mapping_dict_gc30M.pkl
+include geneformer/gene_dictionaries_30m/token_dictionary_gc30M.pkl
README.md CHANGED
@@ -13,9 +13,9 @@ Geneformer is a foundational transformer model pretrained on a large-scale corpu
 - See [geneformer.readthedocs.io](https://geneformer.readthedocs.io) for documentation.
 
 # Model Description
-Geneformer is a foundational transformer model pretrained on a large-scale corpus of single cell transcriptomes representing a broad range of human tissues. Geneformer V1 was originally pretrained in June 2021 on [Genecorpus-30M](https://huggingface.co/datasets/ctheodoris/Genecorpus-30M), a corpus comprised of ~30 million human single cell transcriptomes. We excluded cells with high mutational burdens (e.g. malignant cells and immortalized cell lines) that could lead to substantial network rewiring without companion genome sequencing to facilitate interpretation. The current updated Geneformer V2 is pretrained on ~104 million human single cell transcriptomes (non-cancer). The cancer continual learning V2 variant was continually pretrrained on ~14 million cancer transcriptomes to yield a cancer domain-tuned model.
+Geneformer is a foundational transformer model pretrained on a large-scale corpus of single cell transcriptomes representing a broad range of human tissues. Geneformer V1 was originally pretrained in June 2021 on [Genecorpus-30M](https://huggingface.co/datasets/ctheodoris/Genecorpus-30M), a corpus comprised of ~30 million human single cell transcriptomes. We excluded cells with high mutational burdens (e.g. malignant cells and immortalized cell lines) that could lead to substantial network rewiring without companion genome sequencing to facilitate interpretation. The current updated Geneformer V2 is pretrained on ~104 million human single cell transcriptomes (non-cancer). The cancer continual learning V2 variant was continually pretrained on ~14 million cancer transcriptomes to yield a cancer domain-tuned model.
 
-Each single cell’s transcriptome is presented to the model as a rank value encoding where genes are ranked by their expression in that cell scaled by their expression across the entire Genecorpus (-30M for V1, -104M for V2). The rank value encoding provides a nonparametric representation of that cell’s transcriptome and takes advantage of the many observations of each gene’s expression across the pretraining corpus to prioritize genes that distinguish cell state. Specifically, this method will deprioritize ubiquitously highly-expressed housekeeping genes by scaling them to a lower rank. Conversely, genes such as transcription factors that may be lowly expressed when they are expressed but highly distinguish cell state will move to a higher rank within the encoding. Furthermore, this rank-based approach may be more robust against technical artifacts that may systematically bias the absolute transcript counts value while the overall relative ranking of genes within each cell remains more stable.
+Each single cell’s transcriptome is presented to the model as a rank value encoding where genes are ranked by their expression in that cell scaled by their expression across the entire Genecorpus (~30M for V1, ~104M for V2). The rank value encoding provides a nonparametric representation of that cell’s transcriptome and takes advantage of the many observations of each gene’s expression across the pretraining corpus to prioritize genes that distinguish cell state. Specifically, this method will deprioritize ubiquitously highly-expressed housekeeping genes by scaling them to a lower rank. Conversely, genes such as transcription factors that may be lowly expressed when they are expressed but highly distinguish cell state will move to a higher rank within the encoding. Furthermore, this rank-based approach may be more robust against technical artifacts that may systematically bias the absolute transcript counts value while the overall relative ranking of genes within each cell remains more stable.
 
 The rank value encoding of each single cell’s transcriptome then proceeds through N layers of transformer encoder units, where N varies dependent on the model size. Pretraining was accomplished using a masked learning objective where 15% of the genes within each transcriptome were masked and the model was trained to predict which gene should be within each masked position in that specific cell state using the context of the remaining unmasked genes. A major strength of this approach is that it is entirely self-supervised and can be accomplished on completely unlabeled data, which allows the inclusion of large amounts of training data without being restricted to samples with accompanying labels.
 
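As background for the rank value encoding described in the README text above, the following is a rough, illustrative sketch of the idea, assuming raw counts and a precomputed corpus-wide gene median dictionary (analogous in role to the `gene_median_dictionary_*.pkl` files listed in MANIFEST.in); it is not the actual Geneformer tokenizer implementation.

```python
import numpy as np

def rank_value_encode(cell_counts, gene_ids, gene_median_dict, max_len=2048):
    """Illustrative rank value encoding: scale each gene's expression in the
    cell by its corpus-wide median expression, then rank genes from highest
    to lowest scaled value. Simplified sketch, not the Geneformer tokenizer."""
    # normalize within the cell, then scale by corpus-wide expression
    norm = cell_counts / cell_counts.sum()
    scaled = np.array(
        [norm[i] / gene_median_dict[g] for i, g in enumerate(gene_ids)]
    )
    # keep only expressed genes, ranked by descending scaled expression
    expressed = scaled > 0
    order = np.argsort(-scaled[expressed])
    ranked_genes = np.asarray(gene_ids)[expressed][order]
    return ranked_genes[:max_len].tolist()

# toy example: a transcription-factor-like gene with low absolute counts but a
# low corpus-wide median outranks a highly expressed housekeeping gene
counts = np.array([500.0, 5.0, 20.0])
genes = ["HOUSEKEEPING", "TF_GENE", "OTHER"]
medians = {"HOUSEKEEPING": 400.0, "TF_GENE": 1.0, "OTHER": 10.0}
print(rank_value_encode(counts, genes, medians))  # TF_GENE ranks first
```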
geneformer/classifier.py CHANGED
@@ -368,6 +368,7 @@ class Classifier:
         attr_to_balance=None,
         max_trials=100,
         pval_threshold=0.1,
+        id_class_dict_path=None,
     ):
         """
         Prepare data for cell state or gene classification.
@@ -410,6 +411,10 @@
         pval_threshold : None, float
             | P-value threshold to use for attribute balancing across splits
             | E.g. if set to 0.1, will accept trial if p >= 0.1 for all attributes in attr_to_balance
+        id_class_dict_path : Path
+            | Path to *_id_class_dict.pkl from prior run of prepare_data to reuse for labeling new data
+            | Dictionary with keys being numeric class labels and values being original dataset class labels
+            | Note: only available for CellClassifiers
         """
 
         if test_size is None:
@@ -453,8 +458,13 @@
             data = cu.rename_cols(data, self.cell_state_dict["state_key"])
 
             # convert classes to numerical labels and save as id_class_dict
+            if id_class_dict_path is not None:
+                with open(id_class_dict_path,"rb") as fp:
+                    id_class_dict = pickle.load(fp)
+            else:
+                id_class_dict = None
             data, id_class_dict = cu.label_classes(
-                self.classifier, data, self.cell_state_dict, self.nproc
+                self.classifier, data, self.cell_state_dict, self.nproc, id_class_dict,
             )
 
         elif self.classifier == "gene":
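
A minimal usage sketch of the new option follows. Only `id_class_dict_path` comes from this commit; the `Classifier` constructor arguments, file names, and other `prepare_data` parameters shown here are illustrative assumptions about a typical cell classification setup rather than a definitive call sequence.

```python
from geneformer import Classifier

# assumed example setup; only id_class_dict_path is the argument added in this commit
cc = Classifier(
    classifier="cell",
    cell_state_dict={"state_key": "disease", "states": "all"},  # illustrative
    nproc=8,
)

# Reuse the *_id_class_dict.pkl saved by a prior prepare_data run so the new
# dataset is labeled with the same numeric class ids (CellClassifiers only).
cc.prepare_data(
    input_data_file="new_data.dataset",                 # hypothetical path
    output_directory="prepared",                        # hypothetical path
    output_prefix="new_data",
    id_class_dict_path="prior_run_id_class_dict.pkl",   # from the earlier run
)
```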
geneformer/classifier_utils.py CHANGED
@@ -94,7 +94,7 @@ def remove_rare(data, rare_threshold, label, nproc):
     return data
 
 
-def label_classes(classifier, data, gene_class_dict, nproc):
+def label_classes(classifier, data, gene_class_dict, nproc, id_class_dict):
     if classifier == "cell":
         label_set = set(data["label"])
     elif classifier == "gene":
@@ -113,8 +113,11 @@ def label_classes(classifier, data, gene_class_dict, nproc):
             )
             raise
 
-    class_id_dict = dict(zip(label_set, [i for i in range(len(label_set))]))
-    id_class_dict = {v: k for k, v in class_id_dict.items()}
+    if id_class_dict is None:
+        class_id_dict = dict(zip(label_set, [i for i in range(len(label_set))]))
+        id_class_dict = {v: k for k, v in class_id_dict.items()}
+    else:
+        class_id_dict = {v: k for k, v in id_class_dict.items()}
 
     if classifier == "gene":
        inverse_gene_class_dict = {}
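
The dictionary handling added above can be illustrated in isolation. This is a simplified sketch of the same logic, not the full `label_classes` function: on a first run the mapping is built from the labels observed in the data, while on a reuse run the supplied `id_class_dict` is inverted so that new data receives the same numeric ids as the earlier run.

```python
def build_label_maps(label_set, id_class_dict=None):
    """Simplified mirror of the dictionary handling in label_classes."""
    if id_class_dict is None:
        # first run: assign a fresh numeric id to each observed class label
        class_id_dict = dict(zip(label_set, range(len(label_set))))
        id_class_dict = {v: k for k, v in class_id_dict.items()}
    else:
        # reuse: invert the prior id -> class mapping for consistent labeling
        class_id_dict = {v: k for k, v in id_class_dict.items()}
    return class_id_dict, id_class_dict

# first run on training data
class_id, id_class = build_label_maps({"healthy", "disease"})
# later run on new data, reusing the saved mapping
class_id_2, _ = build_label_maps({"healthy", "disease"}, id_class_dict=id_class)
assert class_id_2 == class_id
```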
geneformer/perturber_utils.py CHANGED
@@ -1,5 +1,6 @@
 import itertools as it
 import logging
+import os
 import pickle
 from collections import defaultdict
 from pathlib import Path