Christina Theodoris committed
Commit 596f1c2 · 1 Parent(s): f3ff19d

update dist multitask example name

examples/distributed_multitask_cell_classification.ipynb ADDED
@@ -0,0 +1,149 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b3266a7b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "import torch\n",
+ "from geneformer import MTLClassifier"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "3e12ac9f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Define paths\n",
+ "pretrained_path = \"/path/to/pretrained/Geneformer/model\"\n",
+ "# Input data is tokenized rank value encodings generated by the Geneformer tokenizer (see tokenizing_scRNAseq_data.ipynb)\n",
+ "train_path = \"/path/to/train/data.dataset\"\n",
+ "val_path = \"/path/to/val/data.dataset\"\n",
+ "test_path = \"/path/to/test/data.dataset\"\n",
+ "results_dir = \"/path/to/results/directory\"\n",
+ "model_save_path = \"/path/to/model/save/path\"\n",
+ "tensorboard_log_dir = \"/path/to/tensorboard/log/dir\"\n",
+ "\n",
+ "# Define tasks and hyperparameters\n",
+ "# task_columns should be a list of column names from your dataset\n",
+ "# Each column represents a specific classification task (e.g. cell type, disease state)\n",
+ "task_columns = [\"cell_type\", \"disease_state\"] # Example task columns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c9bd7562",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Check GPU environment\n",
+ "num_gpus = torch.cuda.device_count()\n",
+ "use_distributed = num_gpus > 1\n",
+ "print(f\"Number of GPUs detected: {num_gpus}\")\n",
+ "print(f\"Using distributed training: {use_distributed}\")\n",
+ "\n",
+ "# Set environment variables for distributed training when multiple GPUs are available\n",
+ "if use_distributed:\n",
+ " os.environ[\"MASTER_ADDR\"] = \"localhost\" # hostname\n",
+ " os.environ[\"MASTER_PORT\"] = \"12355\" # Choose an available port\n",
+ " print(\"Distributed environment variables set.\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b6ff3618",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Define hyperparameters for optimization\n",
+ "hyperparameters = {\n",
+ " \"learning_rate\": {\"type\": \"float\", \"low\": 1e-5, \"high\": 1e-3, \"log\": True},\n",
+ " \"warmup_ratio\": {\"type\": \"float\", \"low\": 0.005, \"high\": 0.01},\n",
+ " \"weight_decay\": {\"type\": \"float\", \"low\": 0.01, \"high\": 0.1},\n",
+ " \"dropout_rate\": {\"type\": \"float\", \"low\": 0.0, \"high\": 0.7},\n",
+ " \"lr_scheduler_type\": {\"type\": \"categorical\", \"choices\": [\"cosine\"]},\n",
+ " \"task_weights\": {\"type\": \"float\", \"low\": 0.1, \"high\": 2.0},\n",
+ "}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f665c5a7",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "mc = MTLClassifier(\n",
+ " task_columns=task_columns, # Our defined classification tasks\n",
+ " study_name=\"MTLClassifier_distributed\",\n",
+ " pretrained_path=pretrained_path,\n",
+ " train_path=train_path,\n",
+ " val_path=val_path,\n",
+ " test_path=test_path,\n",
+ " model_save_path=model_save_path,\n",
+ " results_dir=results_dir,\n",
+ " tensorboard_log_dir=tensorboard_log_dir,\n",
+ " hyperparameters=hyperparameters,\n",
+ " # Distributed training parameters\n",
+ " distributed_training=use_distributed, # Enable distributed training if multiple GPUs available\n",
+ " master_addr=\"localhost\" if use_distributed else None,\n",
+ " master_port=\"12355\" if use_distributed else None,\n",
+ " # Other training parameters\n",
+ " n_trials=15, # Number of trials for hyperparameter optimization\n",
+ " epochs=1, # Number of training epochs (1 suggested to prevent overfitting)\n",
+ " batch_size=8, # Adjust based on available GPU memory\n",
+ " gradient_accumulation_steps=4, # Accumulate gradients over multiple steps\n",
+ " gradient_clipping=True, # Enable gradient clipping for stability\n",
+ " max_grad_norm=1.0, # Set maximum gradient norm\n",
+ " seed=42\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f69f7b6a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Run hyperparameter optimization with distributed training\n",
+ "if __name__ == \"__main__\":\n",
+ " # This guard is required for distributed training to prevent\n",
+ " # infinite subprocess spawning when using torch.multiprocessing\n",
+ " mc.run_optuna_study()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "3affd5dd",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Evaluate the model on test data\n",
+ "if __name__ == \"__main__\":\n",
+ " mc.load_and_evaluate_test_model()"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "bio",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "name": "python",
+ "version": "3.12.8"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+ }
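
Before launching the study, the tokenized splits can be inspected directly. A minimal pre-flight sketch, assuming the .dataset paths are Hugging Face datasets written with save_to_disk (as the Geneformer tokenizer produces) and reusing the placeholder paths and example task_columns from the notebook above:

from datasets import load_from_disk

# Confirm each split loads and carries every task column passed to MTLClassifier.
# Paths and column names are the notebook's placeholders, not real data.
for name, path in [("train", train_path), ("val", val_path), ("test", test_path)]:
    ds = load_from_disk(path)
    missing = [col for col in task_columns if col not in ds.column_names]
    print(f"{name}: {len(ds)} cells, columns: {ds.column_names}")
    assert not missing, f"{name} split is missing task columns: {missing}"

If a task column is missing here, the MTLClassifier run would fail much later, after model loading, so this check saves a wasted trial.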