David Prihoda committed
Commit 5500725 · 1 Parent(s): fb77734
Files changed (4)
  1. Dockerfile +0 -2
  2. requirements.txt +2 -2
  3. src/example.py +0 -34
  4. src/streamlit_app.py +156 -67
Dockerfile CHANGED
@@ -14,8 +14,6 @@ COPY src/ ./src/
 
 RUN pip3 install -r requirements.txt
 
-ENV HF_HUB_CACHE="/tmp/huggingface"
-
 EXPOSE 8501
 
 HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
requirements.txt CHANGED
@@ -1,5 +1,5 @@
 altair
 pandas
 streamlit
-transformers
-torch
+logomaker
+promb
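
The new requirements swap the transformers/torch stack for two lighter packages. A quick sanity check that both resolve and expose the names the app imports (names taken from src/streamlit_app.py below; this snippet is illustrative, not part of the commit):

    # Both names below are imported verbatim by src/streamlit_app.py
    import logomaker
    from promb import init_db, print_nearest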
src/example.py DELETED
@@ -1,34 +0,0 @@
-import torch
-import pandas as pd
-from transformers import RobertaForMaskedLM, RobertaTokenizer
-
-tokenizer = RobertaTokenizer.from_pretrained('prihodad/biophi-sapiens1-tokenizer')
-vh_model = RobertaForMaskedLM.from_pretrained('prihodad/biophi-sapiens1-vh')
-vl_model = RobertaForMaskedLM.from_pretrained('prihodad/biophi-sapiens1-vl')
-
-def sapiens_predict(tokenizer, model, seq, probs=True):
-    encoded_input = tokenizer(seq, return_tensors='pt')
-    with torch.no_grad():
-        logits = model(**encoded_input).logits[0][1:-1].cpu()
-
-    index_to_token = {idx: token for token, idx in tokenizer.get_vocab().items()}
-    return pd.DataFrame(
-        logits.numpy() if not probs else torch.softmax(logits, dim=-1).numpy(),
-        columns=[index_to_token[i] for i in range(logits.shape[1])]
-    )[list('ACDEFGHIKLMNPQRSTVWY')]
-
-seq = "QVQLQQSGAELARPGASVKMSCKASGYTFTRYTMHWVKQRPGQGLEWIGYINPSRGYTNYNQKFKDKATLTTDKSSSTAYMQLSSLTSEDSAVYYCARYYDDHYCLDYWGQGTTLTVSS"
-probs = sapiens_predict(
-    tokenizer,
-    vh_model,
-    seq,
-)
-
-top = probs.idxmax(axis=1)
-
-print(" Input:", seq)
-print("       ", ''.join('|' if aa == bb else ' ' for aa, bb in zip(seq, top)))
-print("Output:", ''.join(top))
-
-print("Probs:")
-print(probs.head())
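
The deleted quick-start depended on transformers and torch. A hypothetical minimal equivalent using promb (a sketch assuming the API exercised by src/streamlit_app.py below; not part of this commit):

    # Sketch: humanness quick-start with promb, mirroring calls from the app
    from promb import init_db

    seq = "SPLQKASDSLINIAIKMLRNGINPELAKKLWDIAYKISMSHIDPSSFYEALKELKKLIEEQEEELIEA"

    db = init_db("human-reference", 9)    # database name and 9mer length match the app's defaults
    peptides = db.chop_seq_peptides(seq)  # overlapping 9mer peptides

    # Fraction of peptides found in the human database
    print("{:.1%} human".format(db.compute_peptide_content(seq)))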
src/streamlit_app.py CHANGED
@@ -2,97 +2,186 @@ import altair as alt
 import numpy as np
 import pandas as pd
 import streamlit as st
-from transformers import RobertaForMaskedLM, RobertaTokenizer
-import torch
-import torch.nn.functional as F
+from promb import init_db, print_nearest
+from io import StringIO
+import logomaker
 
 st.set_page_config(layout="wide")
 
-@st.cache_resource
-def get_tokenizer():
-    return RobertaTokenizer.from_pretrained('prihodad/biophi-sapiens1-tokenizer')
-
-
-@st.cache_resource
-def get_vh_model():
-    return RobertaForMaskedLM.from_pretrained('prihodad/biophi-sapiens1-vh')
-
-
-@st.cache_resource
-def get_vl_model():
-    return RobertaForMaskedLM.from_pretrained('prihodad/biophi-sapiens1-vl')
-
-
-def sapiens_predict(tokenizer, model, seq, probs=True):
-    encoded_input = tokenizer(seq, return_tensors='pt')
-    with torch.no_grad():
-        logits = model(**encoded_input).logits[0][1:-1].cpu()
-
-    index_to_token = {idx: token for token, idx in tokenizer.get_vocab().items()}
-    df = pd.DataFrame(
-        logits.numpy() if not probs else torch.softmax(logits, dim=-1).numpy(),
-        index=range(1, len(seq) + 1),
-        columns=[index_to_token[i] for i in range(logits.shape[1])]
-    )[list('ACDEFGHIKLMNPQRSTVWY')]
-    df.index.name = "position"
-    return df
-
-
-def display_prediction(probs, seq):
-    top = probs.idxmax(axis=1)
-
-    out = f" Input: {seq}" + "\n"
-    out += "        " + "".join(' ' if aa == bb else '#' for aa, bb in zip(seq, top)) + "\n"
-    out += "Output: " + "".join(top)
-
-    st.code(out, language="text")
-
-    probs_long = probs.reset_index().melt(id_vars="position", var_name="aa", value_name="probability")
-
-    heatmap = alt.Chart(probs_long).mark_rect().encode(
-        x=alt.X("position:O", title="Sequence Position"),
-        y=alt.Y("aa:N", title="Amino Acid", sort=None),
-        color=alt.Color("probability:Q", scale=alt.Scale(scheme="viridis")),
-        tooltip=["position", "aa", "probability"]
-    ).properties(
-        height=600,
-        title="Amino Acid Probabilities"
-    )
-    st.altair_chart(heatmap, use_container_width=True)
-
-
-st.write('# Sapiens human antibody language model')
-
-vh = st.text_area("Heavy chain", value="QVQLQQSGAELARPGASVKMSCKASGYTFTRYTMHWVKQRPGQGLEWIGYINPSRGYTNYNQKFKDKATLTTDKSSSTAYMQLSSLTSEDSAVYYCARYYDDHYCLDYWGQGTTLTVSS")
-
-vl = st.text_area("Light chain")
-
-if len(vh) > 180 or len(vl) > 180:
-    st.error("Please enter only the variable region - got sequence of length > 180")
-    st.stop()
-
-with st.spinner("Predicting..."):
-    tokenizer = get_tokenizer()
-    vh_model = get_vh_model()
-    vl_model = get_vl_model()
-
-    st.write("### VH")
-    if vh:
-        vh_probs = sapiens_predict(tokenizer, vh_model, vh)
-        display_prediction(vh_probs, vh)
-    else:
-        st.write("No heavy chain provided")
-
-    st.write("### VL")
-    if vl:
-        vl_probs = sapiens_predict(tokenizer, vl_model, vl)
-        display_prediction(vl_probs, vl)
-    else:
-        st.write("No light chain provided")
-
-st.write("## Try it out yourself")
-
-st.write("Install dependencies `pip install transformers torch` and run:")
-
-with open("src/example.py") as f:
-    st.code(f.read(), language="python")
+@st.cache_resource(show_spinner=False)
+def init_db_cached(db_name, *args, **kwargs):
+    return init_db(db_name, *args, **kwargs)
+
+
+st.write('# promb - protein humanness evaluation')
+
+st.code('''pip install promb
+* █
+▄▄▄▄ ▄▄▄ ▄█▄ ▄▄▄▄ █▄▄▄
+█ █ █ █▓███ █ █ █ █ █
+█▄▄▄▀ █ ▀███▀ █ █ █▄▄▄▀
+
+▀ protein mutation burden
+''')
+
+if st.button("Load example"):
+    st.session_state["seq"] = "SPLQKASDSLINIAIKMLRNGINPELAKKLWDIAYKISMSHIDPSSFYEALKELKKLIEEQEEELIEA"
+
+with st.form(border=False, key="input"):
+    seq = st.text_area(
+        "Amino acid sequence",
+        key="seq"
+    )
+    seq = "".join(seq.split())
+
+    left, mid, right = st.columns(3)
+    with left:
+        database_name = st.selectbox("Database", options=["human-reference", "human-swissprot", "human-oas"])
+    with mid:
+        peptide_length = st.number_input("Peptide length", value=9, min_value=2, max_value=20, disabled=database_name == "human-oas")
+    with right:
+        num_nearest = st.number_input("Nearest peptides", value=1, min_value=1, max_value=5, help="Number of nearest human peptides used for visualization and to compute PSSM and suggest humanizing mutations")
+
+    st.form_submit_button("Run", type="primary")
+
+if not seq:
+    st.stop()
+
+with st.spinner(f"Finding nearest {peptide_length}mer peptides in {database_name}..."):
+    db = init_db_cached(database_name, peptide_length if database_name != "human-oas" else None)
+    peptides = db.chop_seq_peptides(seq)
+    nearest = db.find_nearest_peptides(peptides, n=num_nearest)
+
+st.write("## Result")
+
+st.metric("Human Peptide Content", "{:.1%} human".format(db.compute_peptide_content(seq)))
+num_mutations = 0
+for peptide, hits in zip(peptides, nearest):
+    num_mutations += sum(aa != bb for aa, bb in zip(peptide, hits[0]))
+
+st.metric("Mutation Burden", "{:.1f} mutations per {}mer".format(num_mutations / len(peptides), peptide_length))
+
+likelihood = pd.DataFrame({
+    "likelihood": db.compute_positional_likelihood(seq, nearest_peptides=nearest),
+    "metric": "likelihood",
+    "aa": list(seq),
+    "position": range(1, len(seq)+1)
+})
+
+wrap = 50
+st.write("### Positional likelihood")
+st.write(f"Fraction of nearest overlapping {peptide_length}mers that contain the input amino acid at that position. Positions with values close to 0 can be considered non-human, and values close to 1 human.")
+for start in range(0, len(seq), wrap):
+    chunk = likelihood.iloc[start:start+wrap]
+    heatmap = alt.Chart(chunk).mark_rect().encode(
+        x=alt.X("position:O", title="Sequence Position"),
+        y=alt.Y("metric:N", title="Metric"),
+        color=alt.Color("likelihood:Q", scale=alt.Scale(scheme="reds", reverse=True, domain=(0, 1))),
+        tooltip=["position", "likelihood", "aa"]
+    )
+    text = alt.Chart(chunk).mark_text(baseline="middle", fontSize=12).encode(
+        x=alt.X("position:O", title="Sequence Position"),
+        y=alt.Y("metric:N", title="Metric"),
+        text=alt.Text("aa:N"),
+        tooltip=["position", "likelihood", "aa"]
+    )
+    chart = (heatmap + text).properties(
+        width=250 + (15 * len(chunk)),
+        height=180,
+        title=f"Positions {start+1}-{start+wrap}"
+    )
+    st.altair_chart(chart, use_container_width=False)
+
+st.write("### Position-specific scoring matrix")
+st.write("A PSSM (PWM) computed by counting occurrences of amino acids in nearest overlapping human peptides at each position.")
+pssm = db.compute_pssm(seq, nearest_peptides=nearest)
+freqs_long = pssm.reset_index().melt(id_vars="position", var_name="aa", value_name="count")
+heatmap = alt.Chart(freqs_long).mark_rect().encode(
+    x=alt.X("position:O", title="Sequence Position"),
+    y=alt.Y("aa:N", title="Amino Acid", sort=None),
+    color=alt.Color("count:Q", scale=alt.Scale(scheme="viridis")),
+    tooltip=["position", "aa", "count"]
+).properties(
+    height=600,
+    title="Amino Acid Frequencies"
+)
+st.altair_chart(heatmap, use_container_width=True)
+
+st.write("#### Sequence logo")
+st.write("PSSM computed from the nearest human peptides, visualized using the logomaker library")
+
+logo = logomaker.Logo(pssm, figsize=(min(50, 2 + 0.2 * len(seq)), 2))
+st.pyplot(logo.fig)
+
+st.write("#### Suggested mutations")
+st.write("PSSM computed without counting the amino acid found in the input sequence at each position")
+
+pssm_mutations = db.compute_pssm(seq, nearest_peptides=nearest, ignore_wildtype=True)
+logo = logomaker.Logo(pssm_mutations, figsize=(min(50, 2 + 0.2 * len(seq)), 2))
+st.pyplot(logo.fig)
+
+st.write("### Nearest human peptides")
+
+stream = StringIO()
+print_nearest(peptides, nearest, file=stream)
+with st.container(height=400):
+    st.code(stream.getvalue())
+
+st.write("## Humanization (naive approach)")
+
+st.write("Generate **slightly** humanized variants by applying 1-3 mutations based on nearest overlapping peptides")
+
+if st.button("Generate humanized mutants", type="primary"):
+
+    st.write("### Point mutant candidates")
+
+    with st.spinner("Generating point mutants..."):
+        for candidate in db.suggest_point_mutant_candidates(seq, nearest_peptides=nearest)[:5]:
+            mutations = " ".join(f"{aa}{pos}{bb}" for pos, (aa, bb) in enumerate(zip(seq, candidate), start=1) if aa != bb)
+            pmb = db.compute_pmb(candidate)
+            st.code(f">{mutations} PMB={pmb:.2f}\n{candidate}")
+
+    st.write("### Double mutant candidates")
+
+    with st.spinner("Generating double mutants..."):
+        for candidate in db.suggest_double_mutant_candidates(seq, nearest_peptides=nearest)[:5]:
+            mutations = " ".join(f"{aa}{pos}{bb}" for pos, (aa, bb) in enumerate(zip(seq, candidate), start=1) if aa != bb)
+            pmb = db.compute_pmb(candidate)
+            st.code(f">{mutations} PMB={pmb:.2f}\n{candidate}")
+
+    st.write("### Triple mutant candidates")
+
+    with st.spinner("Generating triple mutants..."):
+        for candidate in db.suggest_triple_mutant_candidates(seq, nearest_peptides=nearest)[:5]:
+            mutations = " ".join(f"{aa}{pos}{bb}" for pos, (aa, bb) in enumerate(zip(seq, candidate), start=1) if aa != bb)
+            pmb = db.compute_pmb(candidate)
+            st.code(f">{mutations} PMB={pmb:.2f}\n{candidate}")
+
+st.divider()
+
+st.write("## Run locally")
+
+st.write("Install and run `promb` locally:")
+
+st.code("""
+# Install promb
+pip install promb
+# See CLI commands
+promb --help
+""", language="text")
+
+st.write("More instructions in the GitHub repo: https://github.com/MSDLLCpapers/promb")
+
+st.write("You can also clone this space as a git repository and run it locally:")
+
+st.code("""
+# Clone the Hugging Face Spaces repository
+git clone https://huggingface.co/spaces/prihodad/promb-humanness
+# Open the directory
+cd promb-humanness
+# Install dependencies (you should do this in a separate conda/venv environment)
+pip install -r requirements.txt
+# Run the Streamlit app
+streamlit run src/streamlit_app.py
+""", language="text")