# IonNTxPred: LLM-based Prediction and Design of Ion Channel Impairing Proteins
IonNTxPred is a fine-tuned transformer model built on top of the esm2_t33_650M_UR50D protein language model. It is specifically trained for binary classification of peptide sequences, predicting whether a peptide is ion channel modulating or non-modulating.
**Use Case:** Accelerating the identification and design of safe peptide therapeutics by filtering out ion channel impairing/modulating candidates early in the drug development pipeline.
## IonNTxPred Workflow
## Model Highlights

- Base Model: Facebook's ESM2-t33 (650M parameters)
- Fine-Tuning Task: Ion channel toxin prediction (binary classification)
- Input: Protein/peptide sequences
- Output: Binary label, `1` (ion channel modulating) or `0` (non-modulating)
- Architecture: ESM2 encoder + linear classification head
## Files Included

- `config.json`: configuration settings for the model architecture, hyperparameters, and training details.
- `model.safetensors`: the trained model weights in the SafeTensors format, which is safer and faster to load than traditional `.bin` files.
- `special_tokens_map.json`: mappings for special tokens such as `[CLS]`, `[SEP]`, or any custom tokens used by the tokenizer.
- `tokenizer_config.json`: tokenizer settings (e.g., vocabulary size and tokenization method).
- `vocab.txt`: all tokens and their corresponding IDs; essential for sequence tokenization.
## How to Use
### Predict Sodium Channel Modulating Proteins

#### Install Dependencies

```bash
pip install torch esm transformers safetensors biopython huggingface_hub
```
#### Loading the Model from Hugging Face

```python
import torch
import esm
from huggingface_hub import hf_hub_download
from safetensors.torch import load_file
from transformers import AutoTokenizer, EsmForSequenceClassification

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print("Downloading fine-tuned models & weights...")
repo_id = "anandr88/IonNTxPred"
subfolder = "saved_model_t33_na"

# Load the tokenizer and model from Hugging Face
tokenizer = AutoTokenizer.from_pretrained(repo_id, subfolder=subfolder)
model = EsmForSequenceClassification.from_pretrained(repo_id, subfolder=subfolder)
weights_path = hf_hub_download(repo_id=repo_id, filename="saved_model_t33_na/model.safetensors")

# Create a simple classifier model
class ProteinClassifier(torch.nn.Module):
    def __init__(self, esm_model):
        super().__init__()
        self.esm_model = esm_model
        # The classifier layer size is determined dynamically from the checkpoint
        self.classifier = None

    def forward(self, tokens):
        with torch.no_grad():
            # Mean-pool the final-layer (layer 33) hidden states of the ESM2 encoder
            outputs = self.esm_model.esm(input_ids=tokens)
            embeddings = outputs.last_hidden_state.mean(1)
        return self.classifier(embeddings)

# Initialize model
classifier = ProteinClassifier(model)

# Load the state dict and determine architecture
state_dict = load_file(weights_path, device=str(device))

# Find the classifier layer (look for a weight matrix)
for key, tensor in state_dict.items():
    if len(tensor.shape) == 2:  # This should be the weight matrix
        num_classes = tensor.shape[0]
        embedding_dim = tensor.shape[1]
        print(f"Found classifier layer: {key} (input_dim={embedding_dim}, output_dim={num_classes})")

        # Initialize the classifier layer
        classifier.classifier = torch.nn.Linear(embedding_dim, num_classes).to(device)

        # Create new state dict with proper names
        new_state_dict = {
            'classifier.weight': state_dict[key],
            'classifier.bias': state_dict[key.replace('weight', 'bias')]
        }
        classifier.load_state_dict(new_state_dict, strict=False)
        break

# Move to device and set to eval mode
classifier = classifier.to(device)
classifier.eval()
print(f"\nModel successfully loaded on {device} and ready for inference!")
```
#### Example Usage (Optional)

```python
from transformers import AutoTokenizer, EsmForSequenceClassification
import torch

# Define the repository ID and subfolder
repo_id = "anandr88/IonNTxPred"
subfolder = "saved_model_t33_na"

# Load the tokenizer and model from Hugging Face
tokenizer = AutoTokenizer.from_pretrained(repo_id, subfolder=subfolder)
model = EsmForSequenceClassification.from_pretrained(repo_id, subfolder=subfolder)

# Move the model to the appropriate device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# Function to make predictions
def make_predictions(model, inputs, device):
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        probs = torch.softmax(logits, dim=1)[:, 1].cpu().numpy()
    return probs

# Example protein sequence
protein_sequence = "MKASTLVVIFIVIFITISSFSIHDVQASGVEKREQKDCLKKLKLCKENKDCCSKSCKRRGTNIEKRCR"

# Tokenize the input sequence
inputs = tokenizer(protein_sequence, return_tensors="pt", truncation=True, padding=True)
inputs = {key: value.to(device) for key, value in inputs.items()}

# Make predictions
prediction = make_predictions(model, inputs, device)

# Apply threshold for final classification
threshold = 0.5
final_prediction = "Na+ channel modulating" if prediction[0] > threshold else "Not Na+ channel modulating"

print(f"Prediction Probability: {prediction[0]:.4f}")
print(f"Final Prediction: {final_prediction}")
```
### Predict Potassium Channel Modulating Proteins

#### Install Dependencies

```bash
pip install torch esm transformers safetensors biopython huggingface_hub
```
#### Loading the Model from Hugging Face

```python
import torch
import esm
from huggingface_hub import hf_hub_download
from safetensors.torch import load_file
from transformers import AutoTokenizer, EsmForSequenceClassification

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print("Downloading fine-tuned models & weights...")
repo_id = "anandr88/IonNTxPred"
subfolder = "saved_model_t33_k"

# Load the tokenizer and model from Hugging Face
tokenizer = AutoTokenizer.from_pretrained(repo_id, subfolder=subfolder)
model = EsmForSequenceClassification.from_pretrained(repo_id, subfolder=subfolder)
weights_path = hf_hub_download(repo_id=repo_id, filename="saved_model_t33_k/model.safetensors")

# Create a simple classifier model
class ProteinClassifier(torch.nn.Module):
    def __init__(self, esm_model):
        super().__init__()
        self.esm_model = esm_model
        # The classifier layer size is determined dynamically from the checkpoint
        self.classifier = None

    def forward(self, tokens):
        with torch.no_grad():
            # Mean-pool the final-layer (layer 33) hidden states of the ESM2 encoder
            outputs = self.esm_model.esm(input_ids=tokens)
            embeddings = outputs.last_hidden_state.mean(1)
        return self.classifier(embeddings)

# Initialize model
classifier = ProteinClassifier(model)

# Load the state dict and determine architecture
state_dict = load_file(weights_path, device=str(device))

# Find the classifier layer (look for a weight matrix)
for key, tensor in state_dict.items():
    if len(tensor.shape) == 2:  # This should be the weight matrix
        num_classes = tensor.shape[0]
        embedding_dim = tensor.shape[1]
        print(f"Found classifier layer: {key} (input_dim={embedding_dim}, output_dim={num_classes})")

        # Initialize the classifier layer
        classifier.classifier = torch.nn.Linear(embedding_dim, num_classes).to(device)

        # Create new state dict with proper names
        new_state_dict = {
            'classifier.weight': state_dict[key],
            'classifier.bias': state_dict[key.replace('weight', 'bias')]
        }
        classifier.load_state_dict(new_state_dict, strict=False)
        break

# Move to device and set to eval mode
classifier = classifier.to(device)
classifier.eval()
print(f"\nModel successfully loaded on {device} and ready for inference!")
```
#### Example Usage (Optional)

```python
from transformers import AutoTokenizer, EsmForSequenceClassification
import torch

# Define the repository ID and subfolder
repo_id = "anandr88/IonNTxPred"
subfolder = "saved_model_t33_k"

# Load the tokenizer and model from Hugging Face
tokenizer = AutoTokenizer.from_pretrained(repo_id, subfolder=subfolder)
model = EsmForSequenceClassification.from_pretrained(repo_id, subfolder=subfolder)

# Move the model to the appropriate device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# Function to make predictions
def make_predictions(model, inputs, device):
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        probs = torch.softmax(logits, dim=1)[:, 1].cpu().numpy()
    return probs

# Example protein sequence
protein_sequence = "MKASTLVVIFIVIFITISSFSIHDVQASGVEKREQKDCLKKLKLCKENKDCCSKSCKRRGTNIEKRCR"

# Tokenize the input sequence
inputs = tokenizer(protein_sequence, return_tensors="pt", truncation=True, padding=True)
inputs = {key: value.to(device) for key, value in inputs.items()}

# Make predictions
prediction = make_predictions(model, inputs, device)

# Apply threshold for final classification
threshold = 0.5
final_prediction = "K+ channel modulating" if prediction[0] > threshold else "Not K+ channel modulating"

print(f"Prediction Probability: {prediction[0]:.4f}")
print(f"Final Prediction: {final_prediction}")
```
### Predict Calcium Channel Modulating Proteins

#### Install Dependencies

```bash
pip install torch esm transformers safetensors biopython huggingface_hub
```
#### Loading the Model from Hugging Face

```python
import torch
import esm
from huggingface_hub import hf_hub_download
from safetensors.torch import load_file
from transformers import AutoTokenizer, EsmForSequenceClassification

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print("Downloading fine-tuned models & weights...")
repo_id = "anandr88/IonNTxPred"
subfolder = "saved_model_t33_ca"

# Load the tokenizer and model from Hugging Face
tokenizer = AutoTokenizer.from_pretrained(repo_id, subfolder=subfolder)
model = EsmForSequenceClassification.from_pretrained(repo_id, subfolder=subfolder)
weights_path = hf_hub_download(repo_id=repo_id, filename="saved_model_t33_ca/model.safetensors")

# Create a simple classifier model
class ProteinClassifier(torch.nn.Module):
    def __init__(self, esm_model):
        super().__init__()
        self.esm_model = esm_model
        # The classifier layer size is determined dynamically from the checkpoint
        self.classifier = None

    def forward(self, tokens):
        with torch.no_grad():
            # Mean-pool the final-layer (layer 33) hidden states of the ESM2 encoder
            outputs = self.esm_model.esm(input_ids=tokens)
            embeddings = outputs.last_hidden_state.mean(1)
        return self.classifier(embeddings)

# Initialize model
classifier = ProteinClassifier(model)

# Load the state dict and determine architecture
state_dict = load_file(weights_path, device=str(device))

# Find the classifier layer (look for a weight matrix)
for key, tensor in state_dict.items():
    if len(tensor.shape) == 2:  # This should be the weight matrix
        num_classes = tensor.shape[0]
        embedding_dim = tensor.shape[1]
        print(f"Found classifier layer: {key} (input_dim={embedding_dim}, output_dim={num_classes})")

        # Initialize the classifier layer
        classifier.classifier = torch.nn.Linear(embedding_dim, num_classes).to(device)

        # Create new state dict with proper names
        new_state_dict = {
            'classifier.weight': state_dict[key],
            'classifier.bias': state_dict[key.replace('weight', 'bias')]
        }
        classifier.load_state_dict(new_state_dict, strict=False)
        break

# Move to device and set to eval mode
classifier = classifier.to(device)
classifier.eval()
print(f"\nModel successfully loaded on {device} and ready for inference!")
```
#### Example Usage (Optional)

```python
from transformers import AutoTokenizer, EsmForSequenceClassification
import torch

# Define the repository ID and subfolder
repo_id = "anandr88/IonNTxPred"
subfolder = "saved_model_t33_ca"

# Load the tokenizer and model from Hugging Face
tokenizer = AutoTokenizer.from_pretrained(repo_id, subfolder=subfolder)
model = EsmForSequenceClassification.from_pretrained(repo_id, subfolder=subfolder)

# Move the model to the appropriate device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# Function to make predictions
def make_predictions(model, inputs, device):
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        probs = torch.softmax(logits, dim=1)[:, 1].cpu().numpy()
    return probs

# Example protein sequence
protein_sequence = "MKASTLVVIFIVIFITISSFSIHDVQASGVEKREQKDCLKKLKLCKENKDCCSKSCKRRGTNIEKRCR"

# Tokenize the input sequence
inputs = tokenizer(protein_sequence, return_tensors="pt", truncation=True, padding=True)
inputs = {key: value.to(device) for key, value in inputs.items()}

# Make predictions
prediction = make_predictions(model, inputs, device)

# Apply threshold for final classification
threshold = 0.5
final_prediction = "Ca++ channel modulating" if prediction[0] > threshold else "Not Ca++ channel modulating"

print(f"Prediction Probability: {prediction[0]:.4f}")
print(f"Final Prediction: {final_prediction}")
```
### Predict Other Ion Channel Modulating Proteins

#### Install Dependencies

```bash
pip install torch esm transformers safetensors biopython huggingface_hub
```
#### Loading the Model from Hugging Face

```python
import torch
import esm
from huggingface_hub import hf_hub_download
from safetensors.torch import load_file
from transformers import AutoTokenizer, EsmForSequenceClassification

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print("Downloading fine-tuned models & weights...")
repo_id = "anandr88/IonNTxPred"
subfolder = "saved_model_t33_other"

# Load the tokenizer and model from Hugging Face
tokenizer = AutoTokenizer.from_pretrained(repo_id, subfolder=subfolder)
model = EsmForSequenceClassification.from_pretrained(repo_id, subfolder=subfolder)
weights_path = hf_hub_download(repo_id=repo_id, filename="saved_model_t33_other/model.safetensors")

# Create a simple classifier model
class ProteinClassifier(torch.nn.Module):
    def __init__(self, esm_model):
        super().__init__()
        self.esm_model = esm_model
        # The classifier layer size is determined dynamically from the checkpoint
        self.classifier = None

    def forward(self, tokens):
        with torch.no_grad():
            # Mean-pool the final-layer (layer 33) hidden states of the ESM2 encoder
            outputs = self.esm_model.esm(input_ids=tokens)
            embeddings = outputs.last_hidden_state.mean(1)
        return self.classifier(embeddings)

# Initialize model
classifier = ProteinClassifier(model)

# Load the state dict and determine architecture
state_dict = load_file(weights_path, device=str(device))

# Find the classifier layer (look for a weight matrix)
for key, tensor in state_dict.items():
    if len(tensor.shape) == 2:  # This should be the weight matrix
        num_classes = tensor.shape[0]
        embedding_dim = tensor.shape[1]
        print(f"Found classifier layer: {key} (input_dim={embedding_dim}, output_dim={num_classes})")

        # Initialize the classifier layer
        classifier.classifier = torch.nn.Linear(embedding_dim, num_classes).to(device)

        # Create new state dict with proper names
        new_state_dict = {
            'classifier.weight': state_dict[key],
            'classifier.bias': state_dict[key.replace('weight', 'bias')]
        }
        classifier.load_state_dict(new_state_dict, strict=False)
        break

# Move to device and set to eval mode
classifier = classifier.to(device)
classifier.eval()
print(f"\nModel successfully loaded on {device} and ready for inference!")
```
#### Example Usage (Optional)

```python
from transformers import AutoTokenizer, EsmForSequenceClassification
import torch

# Define the repository ID and subfolder
repo_id = "anandr88/IonNTxPred"
subfolder = "saved_model_t33_other"

# Load the tokenizer and model from Hugging Face
tokenizer = AutoTokenizer.from_pretrained(repo_id, subfolder=subfolder)
model = EsmForSequenceClassification.from_pretrained(repo_id, subfolder=subfolder)

# Move the model to the appropriate device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# Function to make predictions
def make_predictions(model, inputs, device):
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        probs = torch.softmax(logits, dim=1)[:, 1].cpu().numpy()
    return probs

# Example protein sequence
protein_sequence = "MKASTLVVIFIVIFITISSFSIHDVQASGVEKREQKDCLKKLKLCKENKDCCSKSCKRRGTNIEKRCR"

# Tokenize the input sequence
inputs = tokenizer(protein_sequence, return_tensors="pt", truncation=True, padding=True)
inputs = {key: value.to(device) for key, value in inputs.items()}

# Make predictions
prediction = make_predictions(model, inputs, device)

# Apply threshold for final classification
threshold = 0.5
final_prediction = "Other ion channel modulating" if prediction[0] > threshold else "Not other ion channel modulating"

print(f"Prediction Probability: {prediction[0]:.4f}")
print(f"Final Prediction: {final_prediction}")
```
## Applications

- Filtering of ion channel impairing proteins in therapeutic design
- Toxicity scanning of synthetic peptides
- Dataset annotation for bioactivity studies
- Educational use in bioinformatics and deep learning for proteins
## Related Links

- Project Web Server: IonNTxPred Web Tool
- Documentation & Source: raghavagps/IonNTxPred on GitHub
## Citation

Rathore et al.
*A Large Language Model for Predicting Neurotoxic Peptides and Neurotoxins.*
(Coming soon)
Start using IonNTxPred today to enhance your protein/peptide screening pipeline with the power of transformer-based intelligence!