Spaces:
Running
Running
David Prihoda
committed on
Commit
·
5500725
1
Parent(s):
fb77734
Promb
Browse files
- Dockerfile +0 -2
- requirements.txt +2 -2
- src/example.py +0 -34
- src/streamlit_app.py +156 -67
Dockerfile
CHANGED
@@ -14,8 +14,6 @@ COPY src/ ./src/
|
|
14 |
|
15 |
RUN pip3 install -r requirements.txt
|
16 |
|
17 |
-
ENV HF_HUB_CACHE="/tmp/huggingface"
|
18 |
-
|
19 |
EXPOSE 8501
|
20 |
|
21 |
HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
|
|
|
14 |
|
15 |
RUN pip3 install -r requirements.txt
|
16 |
|
|
|
|
|
17 |
EXPOSE 8501
|
18 |
|
19 |
HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
|
requirements.txt
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
altair
|
2 |
pandas
|
3 |
streamlit
|
4 |
-
|
5 |
-
|
|
|
1 |
altair
|
2 |
pandas
|
3 |
streamlit
|
4 |
+
logomaker
|
5 |
+
promb
|
src/example.py
DELETED
@@ -1,34 +0,0 @@
|
|
1 |
-
import torch
|
2 |
-
import pandas as pd
|
3 |
-
from transformers import RobertaForMaskedLM, RobertaTokenizer
|
4 |
-
|
5 |
-
tokenizer = RobertaTokenizer.from_pretrained('prihodad/biophi-sapiens1-tokenizer')
|
6 |
-
vh_model = RobertaForMaskedLM.from_pretrained('prihodad/biophi-sapiens1-vh')
|
7 |
-
vl_model = RobertaForMaskedLM.from_pretrained('prihodad/biophi-sapiens1-vl')
|
8 |
-
|
9 |
-
def sapiens_predict(tokenizer, model, seq, probs=True):
    """Score every position of ``seq`` with a masked language model.

    Args:
        tokenizer: Callable that encodes ``seq`` into model inputs (it is
            called with ``return_tensors='pt'``) and exposes ``get_vocab()``.
        model: Callable model whose output carries a ``logits`` attribute.
        seq: Amino acid sequence to score.
        probs: When true, return softmax probabilities; otherwise return the
            raw logits.

    Returns:
        A ``pandas.DataFrame`` with one row per scored position and one
        column for each of the 20 amino acids in ``ACDEFGHIKLMNPQRSTVWY``.
    """
    model_inputs = tokenizer(seq, return_tensors='pt')
    with torch.no_grad():
        # Take the first batch item; the [1:-1] slice drops the first and
        # last positions (presumably the tokenizer's start/end tokens --
        # TODO confirm against the tokenizer configuration).
        scores = model(**model_inputs).logits[0][1:-1].cpu()

    # Invert the token -> index vocabulary so columns can be named by token.
    token_by_index = {}
    for token, index in tokenizer.get_vocab().items():
        token_by_index[index] = token
    vocab_columns = [token_by_index[i] for i in range(scores.shape[1])]

    if probs:
        values = torch.softmax(scores, dim=-1).numpy()
    else:
        values = scores.numpy()

    table = pd.DataFrame(values, columns=vocab_columns)
    # Keep only the amino acid columns, in this fixed order.
    return table[list('ACDEFGHIKLMNPQRSTVWY')]
|
19 |
-
|
20 |
-
seq = "QVQLQQSGAELARPGASVKMSCKASGYTFTRYTMHWVKQRPGQGLEWIGYINPSRGYTNYNQKFKDKATLTTDKSSSTAYMQLSSLTSEDSAVYYCARYYDDHYCLDYWGQGTTLTVSS"
|
21 |
-
probs = sapiens_predict(
|
22 |
-
tokenizer,
|
23 |
-
vh_model,
|
24 |
-
seq,
|
25 |
-
)
|
26 |
-
|
27 |
-
top = probs.idxmax(axis=1)
|
28 |
-
|
29 |
-
print(" Input:", seq)
|
30 |
-
print(" ", ''.join('|' if aa == bb else ' ' for aa, bb in zip(seq, top)))
|
31 |
-
print("Output:", ''.join(top))
|
32 |
-
|
33 |
-
print("Probs:")
|
34 |
-
print(probs.head())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/streamlit_app.py
CHANGED
@@ -2,97 +2,186 @@ import altair as alt
|
|
2 |
import numpy as np
|
3 |
import pandas as pd
|
4 |
import streamlit as st
|
5 |
-
from
|
6 |
-
import
|
7 |
-
import
|
8 |
|
9 |
st.set_page_config(layout="wide")
|
10 |
|
11 |
-
@st.cache_resource
|
12 |
-
def
|
13 |
-
return
|
14 |
|
15 |
|
16 |
-
|
17 |
-
def get_vh_model():
|
18 |
-
return RobertaForMaskedLM.from_pretrained('prihodad/biophi-sapiens1-vh')
|
19 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
20 |
|
21 |
-
|
22 |
-
|
23 |
-
return RobertaForMaskedLM.from_pretrained('prihodad/biophi-sapiens1-vl')
|
24 |
|
|
|
|
|
|
|
|
|
|
|
|
|
25 |
|
26 |
-
def sapiens_predict(tokenizer, model, seq, probs=True):
|
27 |
-
encoded_input = tokenizer(seq, return_tensors='pt')
|
28 |
-
with torch.no_grad():
|
29 |
-
logits = model(**encoded_input).logits[0][1:-1].cpu()
|
30 |
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
return df
|
39 |
|
|
|
40 |
|
41 |
-
|
42 |
-
|
43 |
|
44 |
-
|
45 |
-
|
46 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
47 |
|
48 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
49 |
|
50 |
-
|
|
|
51 |
|
52 |
-
|
53 |
-
|
54 |
-
y=alt.Y("aa:N", title="Amino Acid", sort=None),
|
55 |
-
color=alt.Color("probability:Q", scale=alt.Scale(scheme="viridis")),
|
56 |
-
tooltip=["position", "aa", "probability"]
|
57 |
-
).properties(
|
58 |
-
height=600,
|
59 |
-
title="Amino Acid Probabilities"
|
60 |
-
)
|
61 |
-
st.altair_chart(heatmap, use_container_width=True)
|
62 |
|
|
|
|
|
63 |
|
64 |
-
|
|
|
|
|
65 |
|
66 |
-
|
67 |
|
68 |
-
|
|
|
|
|
|
|
69 |
|
70 |
-
|
71 |
-
|
72 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
73 |
|
74 |
-
|
75 |
-
tokenizer = get_tokenizer()
|
76 |
-
vh_model = get_vh_model()
|
77 |
-
vl_model = get_vl_model()
|
78 |
|
79 |
-
|
80 |
-
if vh:
|
81 |
-
vh_probs = sapiens_predict(tokenizer, vh_model, vh)
|
82 |
-
display_prediction(vh_probs, vh)
|
83 |
-
else:
|
84 |
-
st.write("No heavy chain provided")
|
85 |
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
|
93 |
-
st.write("
|
94 |
|
95 |
-
st.write("
|
96 |
|
97 |
-
|
98 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
import numpy as np
|
3 |
import pandas as pd
|
4 |
import streamlit as st
|
5 |
+
from promb import init_db, print_nearest
|
6 |
+
from io import StringIO
|
7 |
+
import logomaker
|
8 |
|
9 |
st.set_page_config(layout="wide")
|
10 |
|
11 |
+
@st.cache_resource(show_spinner=False)
def init_db_cached(db_name, *args, **kwargs):
    """Call ``init_db`` through Streamlit's resource cache.

    The built-in cache spinner is turned off (``show_spinner=False``).

    Args:
        db_name: Database name, passed to ``init_db`` as first argument.
        *args: Extra positional arguments forwarded to ``init_db`` unchanged.
        **kwargs: Extra keyword arguments forwarded to ``init_db`` unchanged.

    Returns:
        Whatever ``init_db`` returns for the given arguments.
    """
    database = init_db(db_name, *args, **kwargs)
    return database
|
14 |
|
15 |
|
16 |
+
st.write('# promb - protein humanness evaluation')
|
|
|
|
|
17 |
|
18 |
+
st.code('''pip install promb
|
19 |
+
* █
|
20 |
+
▄▄▄▄ ▄▄▄ ▄█▄ ▄▄▄▄ █▄▄▄
|
21 |
+
█ █ █ █▓███ █ █ █ █ █
|
22 |
+
█▄▄▄▀ █ ▀███▀ █ █ █▄▄▄▀
|
23 |
+
█
|
24 |
+
▀ protein mutation burden
|
25 |
+
''')
|
26 |
|
27 |
+
# Pressing the button pre-fills the sequence box through its session-state
# key ("seq"), which the text area below is bound to.
if st.button("Load example"):
    st.session_state["seq"] = "SPLQKASDSLINIAIKMLRNGINPELAKKLWDIAYKISMSHIDPSSFYEALKELKKLIEEQEEELIEA"

with st.form(border=False, key="input"):
    raw_sequence = st.text_area("Amino acid sequence", key="seq")
    # Drop every whitespace character (spaces, tabs, line breaks) so that
    # pasted multi-line sequences become one contiguous string.
    seq = "".join(raw_sequence.split())

    database_column, length_column, nearest_column = st.columns(3)
    database_name = database_column.selectbox(
        "Database",
        options=["human-reference", "human-swissprot", "human-oas"],
    )
    peptide_length = length_column.number_input(
        "Peptide length",
        value=9,
        min_value=2,
        max_value=20,
        # The length input is disabled whenever human-oas is selected.
        disabled=database_name == "human-oas",
    )
    num_nearest = nearest_column.number_input(
        "Nearest peptides",
        value=1,
        min_value=1,
        max_value=5,
        help="Number of nearest human peptides used for visualization and to compute PSSM and suggest humanizing mutations",
    )

    st.form_submit_button("Run", type="primary")

# Without a sequence there is nothing to analyse: end this script run here.
if not seq:
    st.stop()
|
50 |
|
51 |
+
# Look up the nearest database peptides for every peptide of the input
# sequence, showing a spinner while the lookup runs.
with st.spinner(f"Finding nearest {peptide_length}mer peptides in {database_name}..."):
    # human-oas is initialised with None instead of the selected peptide length.
    db = init_db_cached(database_name, peptide_length if database_name != "human-oas" else None)
    peptides = db.chop_seq_peptides(seq)
    # Guard: the mutation burden below divides by len(peptides), so an empty
    # result would raise ZeroDivisionError. len() is used rather than
    # truthiness because the container type returned by promb is not visible
    # here.
    if len(peptides) == 0:
        st.warning("No peptides could be extracted from the input sequence, please enter a longer sequence.")
        st.stop()
    nearest = db.find_nearest_peptides(peptides, n=num_nearest)

st.write("## Result")

st.metric("Human Peptide Content", "{:.1%} human".format(db.compute_peptide_content(seq)))

# Count, over all peptides, the positions where the input peptide differs
# from the first entry of its hit list.
num_mutations = 0
for peptide, hits in zip(peptides, nearest):
    num_mutations += sum(aa != bb for aa, bb in zip(peptide, hits[0]))

st.metric("Mutation Burden", "{:.1f} mutations per {}mer".format(num_mutations / len(peptides), peptide_length))

# One row per sequence position. "metric" is a constant label that the
# positional likelihood chart uses as its single y-axis category.
likelihood = pd.DataFrame({
    "likelihood": db.compute_positional_likelihood(seq, nearest_peptides=nearest),
    "metric": "likelihood",
    "aa": list(seq),
    "position": range(1, len(seq) + 1),
})
|
71 |
+
|
72 |
+
# Number of sequence positions drawn per chart row.
wrap = 50
st.write("### Positional likelihood")
st.write(f"Fraction of nearest overlapping {peptide_length}mers that contain the input amino acid at that position. Positions with values close to 0 can be considered to be non-human, and values close to 1 to be human.")
for start in range(0, len(seq), wrap):
    chunk = likelihood.iloc[start:start+wrap]
    # iloc clips the slice at the end of the table, so the last chunk can be
    # shorter than `wrap`; label the chart with the real last position
    # instead of start+wrap, which overshoots the sequence length.
    end = start + len(chunk)
    # Encodings shared by the coloured cells and the residue letters.
    base = alt.Chart(chunk).encode(
        x=alt.X("position:O", title="Sequence Position"),
        y=alt.Y("metric:N", title="Metric"),
        tooltip=["position", "likelihood", "aa"],
    )
    heatmap = base.mark_rect().encode(
        # Reversed "reds" scheme over a fixed 0-1 domain.
        color=alt.Color("likelihood:Q", scale=alt.Scale(scheme="reds", reverse=True, domain=(0, 1))),
    )
    text = base.mark_text(baseline="middle", fontSize=12).encode(
        text=alt.Text("aa:N"),
    )
    chart = (heatmap + text).properties(
        # Fixed margin plus 15 px per displayed position.
        width=250 + (15 * len(chunk)),
        height=180,
        title=f"Positions {start+1}-{end}",
    )
    st.altair_chart(chart, use_container_width=False)
|
95 |
|
96 |
+
st.write("### Position-specific scoring matrix")
st.write("A PSSM (PWM) computed by counting occurrences of amino acids in nearest overlapping human peptides at each position.")
pssm = db.compute_pssm(seq, nearest_peptides=nearest)
# Reshape the matrix to long form (position, aa, count) for Altair.
# NOTE(review): melt(id_vars="position") requires reset_index() to yield a
# "position" column, i.e. the PSSM index is assumed to be named "position".
freqs_long = pssm.reset_index().melt(id_vars="position", var_name="aa", value_name="count")
heatmap = alt.Chart(freqs_long).mark_rect().encode(
    x=alt.X("position:O", title="Sequence Position"),
    y=alt.Y("aa:N", title="Amino Acid", sort=None),
    color=alt.Color("count:Q", scale=alt.Scale(scheme="viridis")),
    tooltip=["position", "aa", "count"],
).properties(
    height=600,
    title="Amino Acid Frequencies",
)
st.altair_chart(heatmap, use_container_width=True)

# Both logos share one figure size: width grows with the sequence length
# but is capped at 50.
logo_figsize = (min(50, 2 + 0.2 * len(seq)), 2)

st.write("#### Sequence logo")
st.write("PSSM computed from nearest human peptides visualized using logomaker library")

logo = logomaker.Logo(pssm, figsize=logo_figsize)
st.pyplot(logo.fig)

st.write("#### Suggested mutations")
st.write("PSSM but without counting amino acids found in input sequence at each position")

pssm_mutations = db.compute_pssm(seq, nearest_peptides=nearest, ignore_wildtype=True)
logo = logomaker.Logo(pssm_mutations, figsize=logo_figsize)
st.pyplot(logo.fig)
|
123 |
|
124 |
+
st.write("### Nearest human peptides")

# print_nearest writes to a file-like object, so capture its output in an
# in-memory text buffer and show the captured text on the page.
with StringIO() as report_buffer:
    print_nearest(peptides, nearest, file=report_buffer)
    nearest_report = report_buffer.getvalue()

# Render the report inside a fixed-height container.
st.container(height=400).code(nearest_report)
|
130 |
|
131 |
+
st.write("## Humanization (naive approach)")

st.write("Generate **slightly** humanized variants by applying 1-3 mutations based on nearest overlapping peptides")

if st.button("Generate humanized mutants", type="primary"):
    # (heading label, spinner label, candidate generator) for each mutant
    # class; the sections are rendered in this order.
    mutant_kinds = (
        ("Point", "point", db.suggest_point_mutant_candidates),
        ("Double", "double", db.suggest_double_mutant_candidates),
        ("Triple", "triple", db.suggest_triple_mutant_candidates),
    )
    for heading, noun, suggest_candidates in mutant_kinds:
        st.write(f"### {heading} mutant candidates")

        with st.spinner(f"Generating {noun} mutants..."):
            # Only the first five suggested candidates are shown.
            for candidate in suggest_candidates(seq, nearest_peptides=nearest)[:5]:
                # Describe each difference from the input as
                # <input residue><1-based position><candidate residue>.
                changes = [
                    f"{aa}{pos}{bb}"
                    for pos, (aa, bb) in enumerate(zip(seq, candidate), start=1)
                    if aa != bb
                ]
                mutations = " ".join(changes)
                pmb = db.compute_pmb(candidate)
                # FASTA-like record: header with mutations and PMB, then
                # the candidate sequence.
                st.code(f">{mutations} PMB={pmb:.2f}\n{candidate}")
|
160 |
+
|
161 |
+
st.divider()
|
162 |
|
163 |
+
st.write("## Run locally")

st.write("Install and run `promb` locally:")

# Shell commands for installing the promb package itself.
install_commands = """
# Install promb
pip install promb
# See cli commands
promb --help
"""
st.code(install_commands, language="text")

st.write("More instructions in the GitHub repo: https://github.com/MSDLLCpapers/promb")

st.write("You can also clone this space as a git repository and run it locally:")

# Shell commands for running this Streamlit app from a local clone.
clone_commands = """
# Clone huggingface spaces repository
git clone https://huggingface.co/spaces/prihodad/promb-humanness
# Open the directory
cd promb-humanness
# Install dependencies (you should do this in a separate conda/venv environment)
pip install -r requirements.txt
# Run Streamlit app
streamlit run src/streamlit_app.py
"""
st.code(clone_commands, language="text")
|