mahynski committed on
Commit
5cc7389
·
verified ·
1 Parent(s): af6d76b

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +366 -0
app.py ADDED
@@ -0,0 +1,366 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Interactive demonstration of FINCHnmr.
3
+
4
+ Author: Nathan A. Mahynski
5
+ """
6
+ import finchnmr
7
+ import os
8
+ import shutil
9
+ import zipfile
10
+
11
+ import numpy as np
12
+ import streamlit as st
13
+
14
+ from datasets import load_dataset
15
+ from finchnmr import analysis, library, model, substance
16
+ from streamlit_extras.add_vertical_space import add_vertical_space
17
+
18
# Folder (relative to the working directory) into which the uploaded zip
# archive is extracted; it is deleted and re-created for each upload.
UPLOAD_FOLDER = "uploaded_nmr"
19
+
20
+ # ----------------------------- CACHED FUNCTIONS -----------------------------
21
@st.cache_data
def build_library():
    """
    Assemble the reference HSQC NMR library from a HuggingFace dataset.

    The "train" split of the `mahynski/bmrb-hsqc-nmr-1H13C` dataset is
    loaded (using the `HF_TOKEN` environment variable as the access token,
    if it is set) and every record is wrapped in a `Substance` built from
    the record's "pathname" and "name" fields.

    The result is memoized by `st.cache_data`, so the dataset is only
    processed the first time this function is called.

    Returns
    -------
    finchnmr.library.Library
        Library containing one `Substance` per dataset record.
    """
    records = load_dataset(
        "mahynski/bmrb-hsqc-nmr-1H13C",
        split="train",
        token=os.getenv("HF_TOKEN"),
        trust_remote_code=True,
    )

    # Convert each dataset row into a Substance, preserving dataset order.
    reference_spectra = []
    for record in records:
        reference_spectra.append(
            finchnmr.substance.Substance(
                pathname=record["pathname"],
                name=record["name"],
                warning="ignore",
            )
        )

    return finchnmr.library.Library(reference_spectra)
38
+
39
+
40
+ # @st.cache_data
41
def build_model(_target, _lib, _param_grid, _nmr_model, _model_kw):
    """
    Fit and optimize a model for a single target spectrum.

    This forwards its arguments to `finchnmr.model.optimize_models`, wrapping
    the target in a one-element list.

    Parameters
    ----------
    _target
        Target to fit; passed as `targets=[_target]`.
    _lib
        Reference library; passed as `nmr_library`.
    _param_grid
        Hyperparameter grid; passed as `param_grid`.
    _nmr_model
        Model class to optimize; passed as `nmr_model`.
    _model_kw
        Extra model keyword arguments; passed as `model_kw`.

    Returns
    -------
    tuple
        The `(optimized_models, analyses)` pair produced by
        `finchnmr.model.optimize_models`.

    Notes
    -----
    NOTE(review): the leading underscores on the parameter names look like
    Streamlit's convention for excluding arguments from cache hashing;
    this function is not currently wrapped in a cache decorator - confirm
    before renaming.
    """
    fitted_models, fitted_analyses = finchnmr.model.optimize_models(
        targets=[_target],
        nmr_library=_lib,
        nmr_model=_nmr_model,
        param_grid=_param_grid,
        model_kw=_model_kw,
    )
    return fitted_models, fitted_analyses
51
+
52
+
53
+ # --------------------------------- SIDEBAR ----------------------------------
54
# Page-wide configuration, title and logo. The same logo image is reused in
# the sidebar below, so its path is named once.
logo_path_ = "docs/_static/logo_small.png"

st.set_page_config(layout="wide")
st.header("Analyze an HSQC NMR Spectra with FINCHnmr")
st.logo(
    logo_path_,
    size="large",
    link="https://finchnmr.readthedocs.io/",
)

# Sidebar: logo, a short description of the application, and author contact.
about_markdown_ = """
## About this application
:heavy_check_mark: This tool is intended to demonstrate the use of [finchnmr](https://github.com/mahynski/finchnmr) to characterize the composition of mixture of compounds.

:x: It is not intended to be used in production. Instead, use the Jupyter notebooks provided in the [finchnmr documentation](https://finchnmr.readthedocs.io/en/latest/index.html) for reproducible, high-quality analysis.

This tool is provided "as-is" without warranty. See our [License](https://github.com/mahynski/finchnmr/blob/a9c3504ea012fbd2452218fb2cd6924972bb88dc/LICENSE.md) for more details.
"""

with st.sidebar:
    st.image(logo_path_)
    st.markdown(about_markdown_)

    add_vertical_space(1)
    st.write("Made by ***Nate Mahynski***")
    st.write("[email protected]")

# Text rendering of the directory tree expected inside the uploaded archive.
example_tree_ = "example/\n├── acqu\n├── acqu2\n├── acqu2s\n├── acqus\n├── audita.txt\n├── cpdprg2\n├── format.temp\n├── fq1list\n├── pdata\n│       └── 1\n│                  ├── 2ii\n│                  ├── 2ir\n│                  ├── 2ri\n│                  ├── 2rr\n│                  ├── assocs\n│                  ├── auditp.txt\n│                  ├── clevels\n│                  ├── curdat2\n│                  ├── outd\n│                  ├── proc\n│                  ├── proc2\n│                  ├── proc2s\n│                  ├── procs\n│                  ├── thumb.png\n│                  └── title\n├── prosol_History\n├── pulseprogram\n├── scon2\n├── ser\n├── specpar\n├── spnam14\n├── spnam3\n├── spnam31\n├── spnam7\n├── uxnmr.info\n└── uxnmr.par\n"

# NOTE(review): nesting inferred - the uploader label below refers to "the
# dropdown above", so the popover is placed on the main page rather than
# inside the sidebar; confirm against the deployed layout.
with st.popover("Example Upload Directory"):
    st.text(example_tree_)
83
+
84
+ # ----------------------------------- MAIN -----------------------------------
85
# Main page flow: upload a zipped Bruker directory, configure and fit a
# model against the reference library, then inspect the fit.
uploaded_file = st.file_uploader(
    label="Upload a directory output by a Bruker HSQC NMR instrument to start. This should be provided as .zip file. Refer to the dropdown above for an example of the directory structure which should be provided, e.g., as example.zip.",
    type=["zip"],
    accept_multiple_files=False,
    key=None,
    help="",
    on_change=None,
    label_visibility="visible",
)

if uploaded_file is not None:
    # Start from an empty extraction folder so that files from a previous
    # upload cannot be mixed with this one.
    if os.path.isdir(f"./{UPLOAD_FOLDER}/"):
        shutil.rmtree(f"./{UPLOAD_FOLDER}/")

    try:
        with zipfile.ZipFile(uploaded_file, "r") as z:
            z.extractall(f"./{UPLOAD_FOLDER}/")
    except zipfile.BadZipFile:
        # The uploader only filters on the file extension, so the content
        # may still not be a readable zip archive.
        st.error("Uploaded file is not a valid zip archive.")
        st.stop()

    # Archives created on macOS carry an extra "__MACOSX" metadata folder;
    # it and hidden entries are not part of the instrument output.
    head = [
        entry
        for entry in os.listdir(f"./{UPLOAD_FOLDER}/")
        if entry != "__MACOSX" and not entry.startswith(".")
    ]
    if len(head) != 1:
        # Report the problem in the page and halt this run instead of
        # raising a bare Exception with a traceback.
        st.error("Uploaded zip file should contain exactly 1 folder.")
        st.stop()
    head = head[0]

    # Create substance
    target = finchnmr.substance.Substance(
        pathname=os.path.abspath(f"./{UPLOAD_FOLDER}/{head}/pdata/1"),
        name=head,
        warning="ignore",
    )

    # Streamlit re-executes this script on every widget interaction and the
    # form's submit button is only True on the run right after the click.
    # Fitted results are therefore kept in st.session_state so they survive
    # reruns caused by the widgets in the results tab. They are reset when a
    # different file is uploaded.
    upload_id_ = (uploaded_file.name, uploaded_file.size)
    if st.session_state.get("fit_upload_id") != upload_id_:
        st.session_state["fit_upload_id"] = upload_id_
        st.session_state["fit_models"] = []
        st.session_state["fit_analyses"] = []
    optimized_models = st.session_state["fit_models"]
    analyses = st.session_state["fit_analyses"]

    # Colormaps offered by every colormap selector on the page.
    cmap_choices_ = ("Reds", "Blues", "Viridis", "Plasma", "RdBu")

    tab1_, tab2_ = st.tabs(["Configure Model", "Analyze Results"])
    with tab1_:
        st.subheader("Configure Model")

        col1_, col2_ = st.columns(2)

        with col1_:
            # Plot the substance with plotly
            cmap_option = st.selectbox(
                "Colormap",
                cmap_choices_,
                index=0,
            )
            st.plotly_chart(
                target.plot(
                    absolute_values=True, backend="plotly", cmap=cmap_option
                )
            )

        with col2_:
            # Load reference library from HF
            with st.spinner(
                text="Building HSQC Library from [HuggingFace](https://huggingface.co/datasets/mahynski/bmrb-hsqc-nmr-1H13C) (this can take a minute the first time)..."
            ):
                lib = build_library()
            st.success("Library has been built and cached!")

            # Select model type
            model_ = st.selectbox(
                label="Choose a model", options=["Lasso"], index=0
            )

            if model_:
                nmr_model = None
                param_grid = {}
                model_kw = {}

                # Set parameters and model kwargs
                with st.form(key="model_settings"):
                    if model_.lower() == "lasso":
                        nmr_model = finchnmr.model.LASSO

                        # Set of alphas to check
                        st.write("Hyperparameters")
                        start_alpha_ = st.number_input(
                            label="Smallest alpha (log base)",
                            min_value=-16,
                            max_value=16,
                            value="min",
                            step=1,
                        )
                        stop_alpha_ = st.number_input(
                            label="Largest alpha (log base)",
                            min_value=-16,
                            max_value=16,
                            value=0,
                            step=1,
                        )
                        n_ = st.slider(
                            label="Number of alpha values in logscale",
                            min_value=1,
                            max_value=100,
                            value=1,
                            step=1,
                        )
                        param_grid = {
                            "alpha": np.logspace(
                                start_alpha_, stop_alpha_, int(n_)
                            )
                        }  # Select a range of alpha values to examine sparsity

                        # Lasso configuration
                        st.divider()
                        st.write("Model Configuration")
                        max_iter_ = st.number_input(
                            label="Max number of iterations to converge, see [Lasso documentation](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Lasso.html)",
                            min_value=1,
                            max_value=100000,
                            value=1000,
                            step=1,
                        )
                        # scikit-learn's Lasso only accepts "cyclic" or
                        # "random" for `selection`.
                        selection_ = st.selectbox(
                            label="Selection scheme, see [Lasso documentation](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Lasso.html)",
                            options=["cyclic", "random"],
                            index=0,
                        )
                        tol_ = st.number_input(
                            label="Convergence tolerance, see [Lasso documentation](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Lasso.html)",
                            min_value=None,
                            max_value=None,
                            value=0.0001,
                            format="%0.4f",
                            step=0.0001,
                        )
                        model_kw = {
                            "max_iter": int(max_iter_),
                            "selection": selection_,
                            "random_state": 42,
                            "tol": tol_,
                        }

                    submit_button = st.form_submit_button(
                        "Start Building Model", icon=":material/start:"
                    )

                # Build the model
                if submit_button:
                    # NOTE(review): the button's return value is never read;
                    # presumably the rerun it triggers is what interrupts the
                    # build - confirm.
                    st.button(
                        "Stop Building Model",
                        type="primary",
                        icon=":material/block:",
                    )
                    with st.spinner(text="Building..."):
                        optimized_models, analyses = build_model(
                            _target=target,
                            _lib=lib,
                            _param_grid=param_grid,
                            _nmr_model=nmr_model,
                            _model_kw=model_kw,
                        )
                    # Persist the fit so later reruns can still display it.
                    st.session_state["fit_models"] = optimized_models
                    st.session_state["fit_analyses"] = analyses
                    st.success("Model has been built and cached!", icon="✅")

    # Now present the analysis / results
    with tab2_:
        if optimized_models:
            st.subheader(
                "Observe how well the model fits the original spectrum."
            )

            fitted_model_ = optimized_models[0]  # We only fit one model
            analysis_ = analyses[0]

            # Plot original vs. reconstructed and residual
            col3_, col4_ = st.columns(2)
            with col3_:
                # Plot the substance with plotly
                cmap_option3 = st.selectbox(
                    "Colormap",
                    cmap_choices_,
                    index=0,
                    key="compare_orig",
                )
                st.plotly_chart(
                    target.plot(
                        absolute_values=True,
                        backend="plotly",
                        cmap=cmap_option3,
                        title="Original Spectrum",
                    ),
                    key="compare_orig_plot",
                )

            with col4_:
                cmap_option4 = st.selectbox(
                    "Colormap",
                    cmap_choices_,
                    index=0,
                    key="compare_recon",
                )
                st.plotly_chart(
                    fitted_model_.reconstruct().plot(
                        absolute_values=True,
                        backend="plotly",
                        cmap=cmap_option4,
                        title="Model Reconstruction",
                    ),
                    key="compare_recon_plot",
                )

            col5_, col6_ = st.columns(2)
            with col5_:
                cmap_option5 = st.selectbox(
                    "Colormap",
                    cmap_choices_,
                    index=0,
                    key="compare_resid",
                )
                st.plotly_chart(
                    analysis_.build_residual().plot(
                        absolute_values=True,
                        backend="plotly",
                        cmap=cmap_option5,
                        title="Residual",
                    ),
                    key="compare_resid_plot",
                )

            # Plot most important spectra
            max_n_ = len(analysis_._model.importances())
            # Builtin min keeps the slider default a plain Python int.
            default_n_ = min(10, max_n_)
            with col6_:
                n_imp_ = st.slider(
                    label="Visualize the most important N spectra in the library",
                    value=default_n_,
                    min_value=1,
                    max_value=max_n_,
                    step=1,
                )
                st.plotly_chart(
                    analysis_.plot_top_importances(
                        k=n_imp_, by_name=True, backend="plotly"
                    ),
                    use_container_width=True,
                )

            st.divider()
            st.subheader(
                "Visualize the most important substances from the library used"
            )

            # Now plot the important spectra themselves
            top_substances, _ = analysis_.get_top_substances(k=n_imp_)
            n_cols_ = st.slider(
                label="Number of columns",
                value=3,
                min_value=1,
                max_value=n_imp_,
                step=1,
            )
            n_rows_ = max(1, int(np.ceil(n_imp_ / n_cols_)))

            # Fill a grid row by row; `ctr` indexes the substance shown in
            # the current cell and trailing cells of the last row stay empty.
            ctr = 0
            for _ in range(n_rows_):
                for col_ in st.columns(n_cols_):
                    with col_:
                        if ctr < n_imp_:
                            cmap_option_ = st.selectbox(
                                "Colormap",
                                cmap_choices_,
                                index=0,
                                key=f"cmap_option_{ctr}_",
                            )
                            st.plotly_chart(
                                top_substances[ctr].plot(
                                    absolute_values=True,
                                    backend="plotly",
                                    cmap=cmap_option_,
                                ),
                                key=f"top_substance_plot_{ctr}_",
                            )
                        ctr += 1