Commit 76a78a0 by Christina Theodoris
Parent: a67c9c0

add option for relabeling data from prior label class dict, update dict paths in manifest

MANIFEST.in CHANGED
@@ -1,9 +1,9 @@
-include geneformer/gene_median_dictionary_gc104m.pkl
-include geneformer/gene_name_id_dict_gc104m.pkl
-include geneformer/ensembl_mapping_dict_gc104m.pkl
-include geneformer/token_dictionary_gc104m.pkl
+include geneformer/gene_median_dictionary_gc104M.pkl
+include geneformer/gene_name_id_dict_gc104M.pkl
+include geneformer/ensembl_mapping_dict_gc104M.pkl
+include geneformer/token_dictionary_gc104M.pkl
 
-include geneformer/gene_dictionaries_30m/gene_median_dictionary_gc30m.pkl
-include geneformer/gene_dictionaries_30m/gene_name_id_dict_gc30m.pkl
-include geneformer/gene_dictionaries_30m/ensembl_mapping_dict_gc30m.pkl
-include geneformer/gene_dictionaries_30m/token_dictionary_gc30m.pkl
+include geneformer/gene_dictionaries_30m/gene_median_dictionary_gc30M.pkl
+include geneformer/gene_dictionaries_30m/gene_name_id_dict_gc30M.pkl
+include geneformer/gene_dictionaries_30m/ensembl_mapping_dict_gc30M.pkl
+include geneformer/gene_dictionaries_30m/token_dictionary_gc30M.pkl
README.md CHANGED
@@ -13,9 +13,9 @@ Geneformer is a foundational transformer model pretrained on a large-scale corpu
 - See [geneformer.readthedocs.io](https://geneformer.readthedocs.io) for documentation.
 
 # Model Description
-Geneformer is a foundational transformer model pretrained on a large-scale corpus of single cell transcriptomes representing a broad range of human tissues. Geneformer V1 was originally pretrained in June 2021 on [Genecorpus-30M](https://huggingface.co/datasets/ctheodoris/Genecorpus-30M), a corpus comprised of ~30 million human single cell transcriptomes. We excluded cells with high mutational burdens (e.g. malignant cells and immortalized cell lines) that could lead to substantial network rewiring without companion genome sequencing to facilitate interpretation. The current updated Geneformer V2 is pretrained on ~104 million human single cell transcriptomes (non-cancer). The cancer continual learning V2 variant was continually pretrrained on ~14 million cancer transcriptomes to yield a cancer domain-tuned model.
+Geneformer is a foundational transformer model pretrained on a large-scale corpus of single cell transcriptomes representing a broad range of human tissues. Geneformer V1 was originally pretrained in June 2021 on [Genecorpus-30M](https://huggingface.co/datasets/ctheodoris/Genecorpus-30M), a corpus comprised of ~30 million human single cell transcriptomes. We excluded cells with high mutational burdens (e.g. malignant cells and immortalized cell lines) that could lead to substantial network rewiring without companion genome sequencing to facilitate interpretation. The current updated Geneformer V2 is pretrained on ~104 million human single cell transcriptomes (non-cancer). The cancer continual learning V2 variant was continually pretrained on ~14 million cancer transcriptomes to yield a cancer domain-tuned model.
 
-Each single cell’s transcriptome is presented to the model as a rank value encoding where genes are ranked by their expression in that cell scaled by their expression across the entire Genecorpus (-30M for V1, -104M for V2). The rank value encoding provides a nonparametric representation of that cell’s transcriptome and takes advantage of the many observations of each gene’s expression across the pretraining corpus to prioritize genes that distinguish cell state. Specifically, this method will deprioritize ubiquitously highly-expressed housekeeping genes by scaling them to a lower rank. Conversely, genes such as transcription factors that may be lowly expressed when they are expressed but highly distinguish cell state will move to a higher rank within the encoding. Furthermore, this rank-based approach may be more robust against technical artifacts that may systematically bias the absolute transcript counts value while the overall relative ranking of genes within each cell remains more stable.
+Each single cell’s transcriptome is presented to the model as a rank value encoding where genes are ranked by their expression in that cell scaled by their expression across the entire Genecorpus (~30M for V1, ~104M for V2). The rank value encoding provides a nonparametric representation of that cell’s transcriptome and takes advantage of the many observations of each gene’s expression across the pretraining corpus to prioritize genes that distinguish cell state. Specifically, this method will deprioritize ubiquitously highly-expressed housekeeping genes by scaling them to a lower rank. Conversely, genes such as transcription factors that may be lowly expressed when they are expressed but highly distinguish cell state will move to a higher rank within the encoding. Furthermore, this rank-based approach may be more robust against technical artifacts that may systematically bias the absolute transcript counts value while the overall relative ranking of genes within each cell remains more stable.
 
 The rank value encoding of each single cell’s transcriptome then proceeds through N layers of transformer encoder units, where N varies dependent on the model size. Pretraining was accomplished using a masked learning objective where 15% of the genes within each transcriptome were masked and the model was trained to predict which gene should be within each masked position in that specific cell state using the context of the remaining unmasked genes. A major strength of this approach is that it is entirely self-supervised and can be accomplished on completely unlabeled data, which allows the inclusion of large amounts of training data without being restricted to samples with accompanying labels.
 
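As background for the rank value encoding described in the README text above, the following is a rough, illustrative sketch of the idea, assuming raw counts and a precomputed corpus-wide gene median dictionary (analogous in role to the `gene_median_dictionary_*.pkl` files listed in MANIFEST.in); it is not the actual Geneformer tokenizer implementation.

```python
import numpy as np

def rank_value_encode(cell_counts, gene_ids, gene_median_dict, max_len=2048):
    """Illustrative rank value encoding: scale each gene's expression in the
    cell by its corpus-wide median expression, then rank genes from highest
    to lowest scaled value. Simplified sketch, not the Geneformer tokenizer."""
    # normalize within the cell, then scale by corpus-wide expression
    norm = cell_counts / cell_counts.sum()
    scaled = np.array(
        [norm[i] / gene_median_dict[g] for i, g in enumerate(gene_ids)]
    )
    # keep only expressed genes, ranked by descending scaled expression
    expressed = scaled > 0
    order = np.argsort(-scaled[expressed])
    ranked_genes = np.asarray(gene_ids)[expressed][order]
    return ranked_genes[:max_len].tolist()

# toy example: a transcription-factor-like gene with low absolute counts but a
# low corpus-wide median outranks a highly expressed housekeeping gene
counts = np.array([500.0, 5.0, 20.0])
genes = ["HOUSEKEEPING", "TF_GENE", "OTHER"]
medians = {"HOUSEKEEPING": 400.0, "TF_GENE": 1.0, "OTHER": 10.0}
print(rank_value_encode(counts, genes, medians))  # TF_GENE ranks first
```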
geneformer/classifier.py CHANGED
@@ -368,6 +368,7 @@ class Classifier:
         attr_to_balance=None,
         max_trials=100,
         pval_threshold=0.1,
+        id_class_dict_path=None,
     ):
         """
         Prepare data for cell state or gene classification.
@@ -410,6 +411,10 @@
         pval_threshold : None, float
             | P-value threshold to use for attribute balancing across splits
             | E.g. if set to 0.1, will accept trial if p >= 0.1 for all attributes in attr_to_balance
+        id_class_dict_path : Path
+            | Path to *_id_class_dict.pkl from prior run of prepare_data to reuse for labeling new data
+            | Dictionary with keys being numeric class labels and values being original dataset class labels
+            | Note: only available for CellClassifiers
         """
 
         if test_size is None:
@@ -453,8 +458,13 @@
             data = cu.rename_cols(data, self.cell_state_dict["state_key"])
 
             # convert classes to numerical labels and save as id_class_dict
+            if id_class_dict_path is not None:
+                with open(id_class_dict_path,"rb") as fp:
+                    id_class_dict = pickle.load(fp)
+            else:
+                id_class_dict = None
             data, id_class_dict = cu.label_classes(
-                self.classifier, data, self.cell_state_dict, self.nproc
+                self.classifier, data, self.cell_state_dict, self.nproc, id_class_dict,
             )
 
         elif self.classifier == "gene":
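
A minimal usage sketch of the new option follows. Only `id_class_dict_path` comes from this commit; the `Classifier` constructor arguments, file names, and other `prepare_data` parameters shown here are illustrative assumptions about a typical cell classification setup rather than a definitive call sequence.

```python
from geneformer import Classifier

# assumed example setup; only id_class_dict_path is the argument added in this commit
cc = Classifier(
    classifier="cell",
    cell_state_dict={"state_key": "disease", "states": "all"},  # illustrative
    nproc=8,
)

# Reuse the *_id_class_dict.pkl saved by a prior prepare_data run so the new
# dataset is labeled with the same numeric class ids (CellClassifiers only).
cc.prepare_data(
    input_data_file="new_data.dataset",                 # hypothetical path
    output_directory="prepared",                        # hypothetical path
    output_prefix="new_data",
    id_class_dict_path="prior_run_id_class_dict.pkl",   # from the earlier run
)
```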
geneformer/classifier_utils.py CHANGED
@@ -94,7 +94,7 @@ def remove_rare(data, rare_threshold, label, nproc):
     return data
 
 
-def label_classes(classifier, data, gene_class_dict, nproc):
+def label_classes(classifier, data, gene_class_dict, nproc, id_class_dict):
     if classifier == "cell":
         label_set = set(data["label"])
     elif classifier == "gene":
@@ -113,8 +113,11 @@ def label_classes(classifier, data, gene_class_dict, nproc):
             )
             raise
 
-    class_id_dict = dict(zip(label_set, [i for i in range(len(label_set))]))
-    id_class_dict = {v: k for k, v in class_id_dict.items()}
+    if id_class_dict is None:
+        class_id_dict = dict(zip(label_set, [i for i in range(len(label_set))]))
+        id_class_dict = {v: k for k, v in class_id_dict.items()}
+    else:
+        class_id_dict = {v: k for k, v in id_class_dict.items()}
 
     if classifier == "gene":
        inverse_gene_class_dict = {}
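
The dictionary handling added above can be illustrated in isolation. This is a simplified sketch of the same logic, not the full `label_classes` function: on a first run the mapping is built from the labels observed in the data, while on a reuse run the supplied `id_class_dict` is inverted so that new data receives the same numeric ids as the earlier run.

```python
def build_label_maps(label_set, id_class_dict=None):
    """Simplified mirror of the dictionary handling in label_classes."""
    if id_class_dict is None:
        # first run: assign a fresh numeric id to each observed class label
        class_id_dict = dict(zip(label_set, range(len(label_set))))
        id_class_dict = {v: k for k, v in class_id_dict.items()}
    else:
        # reuse: invert the prior id -> class mapping for consistent labeling
        class_id_dict = {v: k for k, v in id_class_dict.items()}
    return class_id_dict, id_class_dict

# first run on training data
class_id, id_class = build_label_maps({"healthy", "disease"})
# later run on new data, reusing the saved mapping
class_id_2, _ = build_label_maps({"healthy", "disease"}, id_class_dict=id_class)
assert class_id_2 == class_id
```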
geneformer/perturber_utils.py CHANGED
@@ -1,5 +1,6 @@
 import itertools as it
 import logging
+import os
 import pickle
 from collections import defaultdict
 from pathlib import Path