Spaces:
Sleeping
Sleeping
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,366 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Interactive demonstration of FINCHnmr.
|
3 |
+
|
4 |
+
Author: Nathan A. Mahynski
|
5 |
+
"""
|
6 |
+
import finchnmr
|
7 |
+
import os
|
8 |
+
import shutil
|
9 |
+
import zipfile
|
10 |
+
|
11 |
+
import numpy as np
|
12 |
+
import streamlit as st
|
13 |
+
|
14 |
+
from datasets import load_dataset
|
15 |
+
from finchnmr import analysis, library, model, substance
|
16 |
+
from streamlit_extras.add_vertical_space import add_vertical_space
|
17 |
+
|
18 |
+
UPLOAD_FOLDER = "uploaded_nmr"
|
19 |
+
|
20 |
+
# ----------------------------- CACHED FUNCTIONS -----------------------------
|
21 |
+
@st.cache_data
def build_library():
    """
    Assemble the reference HSQC library from the HuggingFace dataset.

    The "train" split of ``mahynski/bmrb-hsqc-nmr-1H13C`` is loaded using the
    token found in the ``HF_TOKEN`` environment variable (``None`` when it is
    unset).  Each record contributes one ``Substance`` built from its
    ``"pathname"`` and ``"name"`` fields.

    Returns
    -------
    finchnmr.library.Library
        Library built from every record in the dataset.
    """
    hf_token = os.getenv("HF_TOKEN")
    records = load_dataset(
        "mahynski/bmrb-hsqc-nmr-1H13C",
        split="train",
        token=hf_token,
        trust_remote_code=True,
    )

    reference_spectra = []
    for record in records:
        reference_spectra.append(
            finchnmr.substance.Substance(
                pathname=record["pathname"],
                name=record["name"],
                warning="ignore",
            )
        )

    return finchnmr.library.Library(reference_spectra)
|
38 |
+
|
39 |
+
|
40 |
+
# @st.cache_data
|
41 |
+
def build_model(_target, _lib, _param_grid, _nmr_model, _model_kw):
    """
    Optimize a model of a single target spectrum against the library.

    Parameters
    ----------
    _target
        Spectrum to model; it is passed on as a one-element ``targets`` list.
    _lib
        Reference library, passed on as ``nmr_library``.
    _param_grid
        Hyperparameter grid, passed on as ``param_grid``.
    _nmr_model
        Model class, passed on as ``nmr_model``.
    _model_kw
        Extra model keyword arguments, passed on as ``model_kw``.

    Returns
    -------
    tuple
        The two values returned by ``finchnmr.model.optimize_models``:
        the optimized models and their analyses.

    Notes
    -----
    NOTE(review): the leading underscores presumably exist so that Streamlit
    would skip hashing these arguments if the commented-out cache decorator
    above this function were re-enabled - confirm before renaming.
    """
    search_kwargs = {
        "targets": [_target],
        "nmr_library": _lib,
        "nmr_model": _nmr_model,
        "param_grid": _param_grid,
        "model_kw": _model_kw,
    }
    fitted, reports = finchnmr.model.optimize_models(**search_kwargs)
    return fitted, reports
|
51 |
+
|
52 |
+
|
53 |
+
# --------------------------------- SIDEBAR ----------------------------------
|
54 |
+
# Page-level configuration: wide layout, title and the clickable logo.
logo_path = "docs/_static/logo_small.png"
docs_url = "https://finchnmr.readthedocs.io/"

st.set_page_config(layout="wide")
st.header("Analyze an HSQC NMR Spectra with FINCHnmr")
st.logo(logo_path, size="large", link=docs_url)

# Text of the "about" panel shown in the sidebar.
about_markdown = """
## About this application
:heavy_check_mark: This tool is intended to demonstrate the use of [finchnmr](https://github.com/mahynski/finchnmr) to characterize the composition of mixture of compounds.

:x: It is not intended to be used in production. Instead, use the Jupyter notebooks provided in the [finchnmr documentation](https://finchnmr.readthedocs.io/en/latest/index.html) for reproducible, high-quality analysis.

This tool is provided "as-is" without warranty. See our [License](https://github.com/mahynski/finchnmr/blob/a9c3504ea012fbd2452218fb2cd6924972bb88dc/LICENSE.md) for more details.
"""

# Sidebar: logo, about text, then author credit below a spacer.
with st.sidebar:
    st.image(logo_path)
    st.markdown(about_markdown)

    add_vertical_space(1)
    for credit_line in ("Made by ***Nate Mahynski***", "[email protected]"):
        st.write(credit_line)
|
78 |
+
|
79 |
+
# Directory listing shown to the user as an example of what the uploaded
# archive should contain.
example_tree = "example/\n├── acqu\n├── acqu2\n├── acqu2s\n├── acqus\n├── audita.txt\n├── cpdprg2\n├── format.temp\n├── fq1list\n├── pdata\n│ └── 1\n│ ├── 2ii\n│ ├── 2ir\n│ ├── 2ri\n│ ├── 2rr\n│ ├── assocs\n│ ├── auditp.txt\n│ ├── clevels\n│ ├── curdat2\n│ ├── outd\n│ ├── proc\n│ ├── proc2\n│ ├── proc2s\n│ ├── procs\n│ ├── thumb.png\n│ └── title\n├── prosol_History\n├── pulseprogram\n├── scon2\n├── ser\n├── specpar\n├── spnam14\n├── spnam3\n├── spnam31\n├── spnam7\n├── uxnmr.info\n└── uxnmr.par\n"

example_popover = st.popover("Example Upload Directory")
with example_popover:
    st.text(example_tree)
|
83 |
+
|
84 |
+
# ----------------------------------- MAIN -----------------------------------
|
85 |
+
# Single-file uploader restricted to .zip archives; the result is None until
# the user provides a file.
upload_prompt = "Upload a directory output by a Bruker HSQC NMR instrument to start. This should be provided as .zip file. Refer to the dropdown above for an example of the directory structure which should be provided, e.g., as example.zip."
uploader_options = {
    "type": ["zip"],
    "accept_multiple_files": False,
    "key": None,
    "help": "",
    "on_change": None,
    "label_visibility": "visible",
}
uploaded_file = st.file_uploader(label=upload_prompt, **uploader_options)
|
94 |
+
|
95 |
+
if uploaded_file is not None:
    # Extract the upload into a fresh folder so files left over from an
    # earlier upload cannot leak into this analysis.
    upload_dir = f"./{UPLOAD_FOLDER}/"
    if os.path.isdir(upload_dir):
        shutil.rmtree(upload_dir)

    with zipfile.ZipFile(uploaded_file, "r") as z:
        z.extractall(upload_dir)

    head = os.listdir(upload_dir)
    if len(head) != 1:
        # Show the problem in the page and end this run rather than raising,
        # which would put a raw traceback in front of the user.
        st.error("Uploaded zip file should contain exactly 1 folder.")
        st.stop()
    head = head[0]

    # Create the target substance from pdata/1 inside the uploaded folder.
    target = finchnmr.substance.Substance(
        pathname=os.path.abspath(f"./{UPLOAD_FOLDER}/{head}/pdata/1"),
        name=head,
        warning="ignore",
    )

    # Streamlit re-executes this script on every widget interaction, and the
    # form's submit button is only True on the run right after it is clicked.
    # The fit is therefore kept in session state so that the results tab
    # survives the reruns triggered by its own widgets.  A different upload
    # discards the stored fit.
    upload_id = getattr(
        uploaded_file, "file_id", (uploaded_file.name, uploaded_file.size)
    )
    if st.session_state.get("fit_upload_id") != upload_id:
        st.session_state["fit_upload_id"] = upload_id
        st.session_state["fit_results"] = ([], [])
    optimized_models, analyses = st.session_state["fit_results"]

    # Colormap choices offered next to every spectrum plot.
    colormaps = ("Reds", "Blues", "Viridis", "Plasma", "RdBu")

    tab1_, tab2_ = st.tabs(["Configure Model", "Analyze Results"])
    with tab1_:
        st.subheader("Configure Model")

        col1_, col2_ = st.columns(2)

        with col1_:
            # Plot the substance with plotly
            cmap_option = st.selectbox("Colormap", colormaps, index=0)
            st.plotly_chart(
                target.plot(
                    absolute_values=True, backend="plotly", cmap=cmap_option
                )
            )

        with col2_:
            # Load reference library from HF
            with st.spinner(
                text="Building HSQC Library from [HuggingFace](https://huggingface.co/datasets/mahynski/bmrb-hsqc-nmr-1H13C) (this can take a minute the first time)..."
            ):
                lib = build_library()
            st.success("Library has been built and cached!")

            # Select model type
            model_choice = st.selectbox(
                label="Choose a model", options=["Lasso"], index=0
            )

            if model_choice:
                nmr_model = None
                param_grid = {}
                model_kw = {}

                # Set parameters and model kwargs
                with st.form(key="model_settings"):
                    if model_choice.lower() == "lasso":
                        nmr_model = finchnmr.model.LASSO

                        # Set of alphas to check; the two number inputs are
                        # base-10 exponents handed to np.logspace.
                        st.write("Hyperparameters")
                        start_alpha_ = st.number_input(
                            label="Smallest alpha (log base)",
                            min_value=-16,
                            max_value=16,
                            value="min",
                            step=1,
                        )
                        stop_alpha_ = st.number_input(
                            label="Largest alpha (log base)",
                            min_value=-16,
                            max_value=16,
                            value=0,
                            step=1,
                        )
                        n_ = st.slider(
                            label="Number of alpha values in logscale",
                            min_value=1,
                            max_value=100,
                            value=1,
                            step=1,
                        )
                        param_grid = {
                            "alpha": np.logspace(
                                start_alpha_, stop_alpha_, int(n_)
                            )
                        }  # Select a range of alpha values to examine sparsity

                        # Lasso configuration
                        st.divider()
                        st.write("Model Configuration")
                        max_iter_ = st.number_input(
                            label="Max number of iterations to converge, see [Lasso documentation](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Lasso.html)",
                            min_value=1,
                            max_value=100000,
                            value=1000,
                            step=1,
                        )
                        # scikit-learn's Lasso accepts only "cyclic" (its
                        # default) or "random" for `selection`.
                        selection_ = st.selectbox(
                            label="Selection scheme, see [Lasso documentation](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Lasso.html)",
                            options=["cyclic", "random"],
                            index=0,
                        )
                        tol_ = st.number_input(
                            label="Convergence tolerance, see [Lasso documentation](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Lasso.html)",
                            min_value=None,
                            max_value=None,
                            value=0.0001,
                            format="%0.4f",
                            step=0.0001,
                        )
                        model_kw = {
                            "max_iter": int(max_iter_),
                            "selection": selection_,
                            "random_state": 42,
                            "tol": tol_,
                        }

                    submit_button = st.form_submit_button(
                        "Start Building Model", icon=":material/start:"
                    )

                # Build the model
                if submit_button:
                    # The return value is not needed: pressing the button
                    # makes Streamlit rerun the script, and on that rerun
                    # `submit_button` is False, so the build is not restarted.
                    st.button(
                        "Stop Building Model",
                        type="primary",
                        icon=":material/block:",
                    )
                    with st.spinner(text="Building..."):
                        optimized_models, analyses = build_model(
                            _target=target,
                            _lib=lib,
                            _param_grid=param_grid,
                            _nmr_model=nmr_model,
                            _model_kw=model_kw,
                        )
                    # Remember the fit so later reruns can still display it.
                    st.session_state["fit_results"] = (
                        optimized_models,
                        analyses,
                    )
                    st.success("Model has been built and cached!", icon="✅")

    # Now present the analysis / results
    with tab2_:
        if len(optimized_models) > 0:
            st.subheader(
                "Observe how well the model fits the original spectrum."
            )

            fitted_model = optimized_models[0]  # We only fit one model
            analysis_ = analyses[0]

            # Plot original vs. reconstructed and residual
            col3_, col4_ = st.columns(2)
            with col3_:
                # Plot the substance with plotly
                cmap_option3 = st.selectbox(
                    "Colormap", colormaps, index=0, key="compare_orig"
                )
                st.plotly_chart(
                    target.plot(
                        absolute_values=True,
                        backend="plotly",
                        cmap=cmap_option3,
                        title="Original Spectrum",
                    ),
                    key="compare_orig_plot",
                )

            with col4_:
                cmap_option4 = st.selectbox(
                    "Colormap", colormaps, index=0, key="compare_recon"
                )
                st.plotly_chart(
                    fitted_model.reconstruct().plot(
                        absolute_values=True,
                        backend="plotly",
                        cmap=cmap_option4,
                        title="Model Reconstruction",
                    ),
                    key="compare_recon_plot",
                )

            col5_, col6_ = st.columns(2)
            with col5_:
                cmap_option5 = st.selectbox(
                    "Colormap", colormaps, index=0, key="compare_resid"
                )
                st.plotly_chart(
                    analysis_.build_residual().plot(
                        absolute_values=True,
                        backend="plotly",
                        cmap=cmap_option5,
                        title="Residual",
                    ),
                    key="compare_resid_plot",
                )

            # Plot most important spectra
            max_n_ = len(analysis_._model.importances())
            default_n_ = min(10, max_n_)
            with col6_:
                n_imp_ = st.slider(
                    label="Visualize the most important N spectra in the library",
                    value=default_n_,
                    min_value=1,
                    max_value=max_n_,
                    step=1,
                )
                st.plotly_chart(
                    analysis_.plot_top_importances(
                        k=n_imp_, by_name=True, backend="plotly"
                    ),
                    use_container_width=True,
                )

            st.divider()
            st.subheader(
                "Visualize the most important substances from the library used"
            )

            # Now plot the important spectra themselves
            top_substances, top_importances = analysis_.get_top_substances(
                k=n_imp_
            )
            n_cols_ = st.slider(
                label="Number of columns",
                # Keep the default inside the slider's range when fewer than
                # three spectra are being shown.
                value=min(3, n_imp_),
                min_value=1,
                max_value=n_imp_,
                step=1,
            )
            n_rows_ = max(1, int(np.ceil(n_imp_ / n_cols_)))

            # Fill the grid row by row; cells past the last spectrum in the
            # final row are left empty.
            ctr = 0
            for _ in range(n_rows_):
                for col_ in st.columns(n_cols_):
                    with col_:
                        if ctr < n_imp_:
                            cmap_option_ = st.selectbox(
                                "Colormap",
                                colormaps,
                                index=0,
                                key=f"cmap_option_{ctr}_",
                            )
                            st.plotly_chart(
                                top_substances[ctr].plot(
                                    absolute_values=True,
                                    backend="plotly",
                                    cmap=cmap_option_,
                                )
                            )
                    ctr += 1
|