Upload 2 files
Browse files- pages/Dashboard.py +338 -0
- pages/predict page.py +204 -0
pages/Dashboard.py
ADDED
@@ -0,0 +1,338 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import pandas as pd
|
3 |
+
import seaborn as sns
|
4 |
+
import matplotlib.pyplot as plt
|
5 |
+
from Utility.data_loader import load_train_series,load_train_events,load_sample_submission,load_test_series
|
6 |
+
from sklearn.model_selection import train_test_split
|
7 |
+
from sklearn.preprocessing import LabelEncoder, StandardScaler
|
8 |
+
from xgboost import XGBClassifier # or XGBRegressor depending on your task
|
9 |
+
import xgboost as xgb
|
10 |
+
import numpy as np
|
11 |
+
|
12 |
+
@st.cache_data
def load_sampled_data():
    """Read the event annotations from ``train_events.csv``.

    The result is memoised by ``st.cache_data``, so the CSV is parsed once
    and reused across Streamlit reruns.

    Returns:
        pandas.DataFrame: the contents of ``train_events.csv``.
    """
    return pd.read_csv("train_events.csv")
|
23 |
+
|
24 |
+
@st.cache_data
def _load_merged_data():
    """Read ``merged_df.csv`` once and reuse it across Streamlit reruns.

    Streamlit re-executes the whole script on every interaction; without the
    cache the CSV would be re-parsed each time. ``st.cache_data`` hands back
    a fresh copy per call, so callers may mutate the returned frame freely.

    Returns:
        pandas.DataFrame: the contents of ``merged_df.csv``.
    """
    return pd.read_csv("merged_df.csv")


# Event annotations (used for the step boxplot).
df2 = load_sampled_data()

# Pre-merged sensor + event table. ``rename`` skips labels that are absent
# (errors='ignore' is the default), so no explicit column check is needed.
merged_df = _load_merged_data().rename(
    columns={
        'timestamp_x': 'sensor_timestamp',
        'timestamp_y': 'event_timestamp',
    }
)
|
37 |
+
|
38 |
+
st.title("๐ Step Distribution Analysis")

# Two equal-width columns: the plot on the left, commentary on the right.
col1, col2 = st.columns([1, 1])

# ----- Column 1: Boxplot -----
with col1:
    st.subheader("๐ฆ Boxplot of Step")
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.boxplot(x=df2['step'], ax=ax, color='steelblue')
    ax.set_title("Distribution of Step Count", fontsize=14)
    ax.set_xlabel("Step", fontsize=12)
    st.pyplot(fig)
    # pyplot keeps every figure alive until it is closed; the script reruns
    # on each interaction, so release the figure once it has been rendered.
    plt.close(fig)

# ----- Column 2: Insights -----
# NOTE(review): the text below calls the distribution left-skewed while also
# describing mostly low values with high outliers on the right — confirm
# against the rendered plot and reconcile the wording.
with col2:
    st.subheader("๐ง  Insights from the Boxplot")
    st.markdown("""
<small>
<b>Central Tendency:</b><br>
- The <b>median</b> is close to the center of the box, suggesting a fairly symmetric distribution within the interquartile range (IQR).<br>
<b>Spread:</b><br>
- A <b>wide IQR</b> indicates significant variability in the step counts across sessions.<br>
<b>Outliers:</b><br>
- The <b>dots on the right</b> are outliers โ representing very high step counts.<br>
- These could reflect either:<br>
- <b>Legitimate long-duration recordings</b><br>
- Or <b>data quality issues</b> (e.g., duplication or sensor errors)
<b>Distribution Shape:</b><br>
- A <b>longer left whisker</b> implies a <b>left-skewed</b> distribution.<br>
- Most sessions have <b>lower step values</b>, with a few very high outliers.
</small>
""", unsafe_allow_html=True)
|
70 |
+
|
71 |
+
|
72 |
+
# Scatter plot of the two sensor features.
# The full merged frame is plotted; switch to merged_df.sample(n=50000) here
# if rendering becomes too slow.
df_sample = merged_df

st.subheader("Scatter Plot: anglez vs enmo")

col1, col2 = st.columns([1, 1])

with col1:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.scatterplot(x='anglez', y='enmo', data=df_sample, ax=ax)
    ax.set_title("Scatter Plot: anglez vs enmo")
    st.pyplot(fig)
    # Release the figure after rendering so reruns do not accumulate figures.
    plt.close(fig)

with col2:
    st.markdown("""
<small>
<b>1. Clustered Points:</b> Most `enmo` values are near 0, suggesting low movement.<br>
<b>2. Symmetry:</b> Spread is balanced on both sides of anglez (ยฑ), indicating no directional bias.<br>
<b>3. Weak Correlation:</b> No visible trend, suggesting independence between `anglez` and `enmo`.<br>
<b>4. Outliers:</b> A few high `enmo` points may indicate sudden or intense movement.<br>
<b>5. Interpretation:</b> Most data reflects light activity or rest, regardless of body orientation.
</small>
""", unsafe_allow_html=True)
|
105 |
+
|
106 |
+
|
107 |
+
# Pair plot of the three numeric features next to its written commentary.
col1, col2 = st.columns([1, 1])

# Column 1: Pair Plot
with col1:
    st.subheader("๐ Pair Plot of Features")
    # sns.pairplot returns a PairGrid, not a Figure; hand Streamlit the
    # underlying Matplotlib figure and close it once rendered so reruns do
    # not accumulate open figures.
    pair_grid = sns.pairplot(merged_df[['anglez', 'enmo', 'step']])
    fig = pair_grid.figure
    st.pyplot(fig)
    plt.close(fig)

# Column 2: Insights
with col2:
    st.subheader("๐ง  Insights from Pair Plot")
    st.markdown("""
<div style='font-size: 14px'>

### ๐ Distribution Insights:
- **anglez**: Symmetric distribution peaking near -50 to 0.
- **enmo**: Right-skewed, most values below 0.1.
- **step**: Right-skewed, with a few large outliers.

### ๐ Pairwise Relationships:
- **anglez vs enmo**: No clear trend; cone-like shape.
- **anglez vs step**: No correlation; looks uniformly scattered.
- **enmo vs step**: Clustered at low values. High steps sometimes with low enmo.

### ๐ก Summary:
- Features appear largely **uncorrelated**.
- Helps identify **data distributions** and potential **outliers**.
- Can assist in **feature selection/engineering**.

</div>
""", unsafe_allow_html=True)
|
151 |
+
|
152 |
+
# Side-by-side histograms (with KDE overlay) of the two sensor features.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

sns.histplot(df_sample['anglez'], kde=True, bins=50, ax=axes[0])
axes[0].set_title("Distribution of anglez")

sns.histplot(df_sample['enmo'], kde=True, bins=50, ax=axes[1])
axes[1].set_title("Distribution of enmo")

fig.tight_layout()
st.pyplot(fig)
# Release the figure after rendering so reruns do not accumulate figures.
plt.close(fig)

# Show insights side by side
col1, col2 = st.columns(2)

with col1:
    st.markdown("""
<div style='font-size: 14px'>
<h3> ๐ Distribution of `anglez`: </h3>
- The distribution is **roughly symmetric**, centered around **-50 to 0**.
- It resembles a **left-heavy bell shape**, suggesting:
- Most sensor angles were **tilted negatively**.
- Indicates a **natural resting position** or specific posture.
</div>
""", unsafe_allow_html=True)

with col2:
    st.markdown("""
<div style='font-size: 14px'>
<h3> ๐ Distribution of `enmo`: </h3>
- Highly **right-skewed** (sharp peak near zero).
- The majority of `enmo` values are **very small** (< 0.05), indicating:
- **Minimal movement or low activity** in most sessions.
- Few data points reflect **moderate to high movement**.
</div>
""", unsafe_allow_html=True)
|
213 |
+
|
214 |
+
|
215 |
+
|
216 |
+
st.subheader("Multicollinearity Check - Correlation Matrix")

# Pairwise correlation between the selected columns.
features = ['anglez', 'enmo', 'step', 'night']
df_subset = merged_df[features]
corr_matrix = df_subset.corr()

# Heatmap with a diverging palette centred on zero.
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0, fmt=".3f", ax=ax)
ax.set_title("Correlation Matrix")

# Layout in two columns
col1, col2 = st.columns(2)

# Column 1: Heatmap
with col1:
    st.pyplot(fig)
    # Release the figure after rendering so reruns do not accumulate figures.
    plt.close(fig)

# Column 2: Textual Insights
# NOTE(review): the coefficients quoted below are hard-coded text, while the
# heatmap is computed from the data at run time — confirm they still match.
with col2:
    st.markdown("""
### ๐ Insights from Correlation Matrix

- **`anglez` & `enmo`**:
๐ธ Weak negative correlation (**-0.11**) โ suggests minimal linear relationship.

- **`step` & `night`**:
โ ๏ธ Perfect correlation (**1.00**) โ indicates **redundancy**, likely representing the same event in different forms.

- **Overall**:
โ
Low multicollinearity across most features โ safe for modeling.
๐ Recommend removing either `step` or `night` to reduce feature duplication.
""")
|
271 |
+
|
272 |
+
|
273 |
+
@st.cache_resource
def _fit_importance_model(train_features, train_target):
    """Fit the XGBoost classifier used for the feature-importance chart.

    Cached with ``st.cache_resource`` so the model is trained once per
    distinct training set instead of on every Streamlit rerun.

    Args:
        train_features: numeric feature frame to fit on.
        train_target: integer-encoded target aligned with ``train_features``.

    Returns:
        XGBClassifier: the fitted model (shared across reruns; treat as
        read-only).
    """
    model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
    model.fit(train_features, train_target)
    return model


# Encode the string columns as integers. Each column gets its own encoder so
# fitting one does not overwrite the classes learned for the other; ``le``
# ends up fitted on ``event``, exactly as before.
series_encoder = LabelEncoder()
merged_df['series_id'] = series_encoder.fit_transform(merged_df['series_id'])
le = LabelEncoder()
merged_df['event'] = le.fit_transform(merged_df['event'])

# Columns excluded from the model inputs (ignored when absent).
drop_cols = ['sensor_timestamp', 'event_timestamp', 'night', 'step', 'sleep_duration_hrs', 'series_id']
df_cleaned = merged_df.drop(columns=drop_cols, errors='ignore')

# Ensure only numeric features in X
X = df_cleaned.drop('event', axis=1).select_dtypes(include=[np.number])
y = merged_df['event']

# Hold out 20% of the rows; only the training part is used below (no scaling
# is applied on this page).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=27)

st.subheader("Feature Importance")
xgb_model = _fit_importance_model(X_train, y_train)

# Plot feature importance
fig, ax = plt.subplots(figsize=(6, 4))
xgb.plot_importance(xgb_model, ax=ax)
ax.set_title("XGBoost Feature Importance")

# Show in Streamlit
st.subheader("XGBoost Feature Importance")

col1, col2 = st.columns(2)

# Column 1: Plot
with col1:
    st.pyplot(fig)
    # Release the figure after rendering so reruns do not accumulate figures.
    plt.close(fig)
    st.markdown("""
#### ๐ซ Low-Impact Features:
- Features like `step` and `night` (excluded in this plot) showed **minimal or redundant contribution**.
- ๐ You may consider **removing** them to simplify the model.
""")

# Column 2: Insights
# NOTE(review): the importance scores quoted below are hard-coded text, while
# the chart is computed from the fitted model — confirm they still match.
with col2:
    st.markdown("""
<small>
<h3> ๐ XGBoost Feature Importance: Key Insights </h3>

#### ๐ Top Features:
- ๐น **`anglez`** โ Highest importance score (**1557**)
- ๐น **`enmo`** โ Close second with score (**1546**)

#### โ
Summary:
- Both `anglez` and `enmo` contribute **significantly** to the model.
- Their high scores reflect **strong influence** in predicting the target variable.

#### ๐ก Interpretation:
- These features likely capture **activity level** or **sleep posture** patterns.
- Keeping both is **recommended** for accurate classification.
</small>

""", unsafe_allow_html=True)
|
336 |
+
|
337 |
+
|
338 |
+
|
pages/predict page.py
ADDED
@@ -0,0 +1,204 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# app.py
|
2 |
+
import streamlit as st
|
3 |
+
import pandas as pd
|
4 |
+
import numpy as np
|
5 |
+
from sklearn.preprocessing import LabelEncoder, StandardScaler
|
6 |
+
from sklearn.model_selection import train_test_split
|
7 |
+
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
|
8 |
+
from xgboost import XGBClassifier
|
9 |
+
|
10 |
+
|
11 |
+
@st.cache_data
def _load_merged_data():
    """Read ``merged_df.csv`` once and reuse it across Streamlit reruns.

    ``st.cache_data`` returns a fresh copy on each call, so the caller may
    modify the returned frame.
    """
    return pd.read_csv("merged_df.csv")


@st.cache_resource
def _train_and_evaluate(features, target):
    """Fit the scaler and classifier, then score them on a held-out split.

    Cached with ``st.cache_resource`` so training runs once per distinct
    dataset rather than on every widget interaction.

    Args:
        features: frame holding the model inputs.
        target: integer-encoded labels aligned with ``features``.

    Returns:
        tuple: ``(model, scaler, accuracy, f1, roc)`` — the fitted
        classifier, the fitted ``StandardScaler``, and the accuracy,
        macro-F1 and ROC AUC measured on the 30% test split.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.3, random_state=42
    )

    # Fit the scaler on the training rows only, then apply it to both parts.
    fitted_scaler = StandardScaler()
    X_train_scaled = fitted_scaler.fit_transform(X_train)
    X_test_scaled = fitted_scaler.transform(X_test)

    classifier = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
    classifier.fit(X_train_scaled, y_train)

    y_pred = classifier.predict(X_test_scaled)
    y_proba = classifier.predict_proba(X_test_scaled)

    acc = accuracy_score(y_test, y_pred)
    macro_f1 = f1_score(y_test, y_pred, average='macro')

    # Binary problems score the positive-class column; otherwise use
    # one-vs-one macro-averaged AUC.
    if y_proba.shape[1] == 2:
        auc = roc_auc_score(y_test, y_proba[:, 1])
    else:
        auc = roc_auc_score(y_test, y_proba, multi_class='ovo', average='macro')

    return classifier, fitted_scaler, acc, macro_f1, auc


st.title("๐ง  Sleep Event Prediction")

# --- Load and preprocess data ---
merged_df = _load_merged_data()
st.subheader("Raw Data Sample")
st.dataframe(merged_df.head())

# Drop nulls in important columns
merged_df = merged_df.dropna(subset=['night', 'event', 'event_timestamp'])

# Convert timestamps
merged_df['event_timestamp'] = pd.to_datetime(merged_df['event_timestamp'], format='%Y-%m-%dT%H:%M:%S%z', utc=True)
merged_df['sensor_timestamp'] = pd.to_datetime(merged_df['sensor_timestamp'], format='%Y-%m-%dT%H:%M:%S%z', utc=True)

# Hours between the sensor reading and the event timestamp.
merged_df['sleep_duration_hrs'] = (merged_df['sensor_timestamp'] - merged_df['event_timestamp']).dt.total_seconds() / 3600

# Encode categorical columns; ``le_event`` is kept to map predictions back to
# event names.
le_event = LabelEncoder()
merged_df['event_encoded'] = le_event.fit_transform(merged_df['event'])

le_series = LabelEncoder()
merged_df['series_id_encoded'] = le_series.fit_transform(merged_df['series_id'])

# Select features
X = merged_df[['anglez', 'enmo']]
y = merged_df['event_encoded']

# Train (or fetch the cached) model and its evaluation metrics.
model, scaler, accuracy, f1, roc = _train_and_evaluate(X, y)
|
63 |
+
|
64 |
+
|
65 |
+
|
66 |
+
|
67 |
+
# --- Predict User Input ---
st.subheader("๐ฎ Predict Sleep Event")
anglez = st.number_input("Enter anglez:", value=27.88, format="%.4f")
enmo = st.number_input("Enter enmo:", value=0.00, format="%.4f")

if st.button("Predict Sleep Event"):
    # The scaler was fitted on a DataFrame with these column names; passing
    # a frame with the same names avoids sklearn's "X does not have valid
    # feature names" warning and makes the column order explicit.
    input_data = pd.DataFrame([[anglez, enmo]], columns=['anglez', 'enmo'])
    input_scaled = scaler.transform(input_data)
    prediction = model.predict(input_scaled)[0]
    # Map the integer class back to the original event name.
    predicted_label = le_event.inverse_transform([prediction])[0]
    st.success(f"Predicted Sleep Event: {predicted_label}")
|
78 |
+
|
79 |
+
|
80 |
+
# # app.py (your Streamlit file)
|
81 |
+
# import streamlit as st
|
82 |
+
# import numpy as np
|
83 |
+
# # import pickle
|
84 |
+
# from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
|
85 |
+
# import pandas as pd
|
86 |
+
# from sklearn.preprocessing import LabelEncoder,StandardScaler
|
87 |
+
# from sklearn.model_selection import train_test_split
|
88 |
+
# from xgboost import XGBClassifier
|
89 |
+
|
90 |
+
# st.title("๐ง Sleep Event Prediction")
|
91 |
+
|
92 |
+
# # --- Load Pickles ---
|
93 |
+
# # @st.cache_resource
|
94 |
+
# # def load_all():
|
95 |
+
# # with open("model.pkl", "rb") as f: model = pickle.load(f)
|
96 |
+
# # with open("scaler.pkl", "rb") as f: scaler = pickle.load(f)
|
97 |
+
# # with open("label_encoder.pkl", "rb") as f: le = pickle.load(f)
|
98 |
+
# # with open("X_test.pkl", "rb") as f: X_test = pickle.load(f)
|
99 |
+
# # with open("y_test.pkl", "rb") as f: y_test = pickle.load(f)
|
100 |
+
# # return model, scaler, le, X_test, y_test
|
101 |
+
|
102 |
+
# merged_df=pd.read_csv("merged_df.csv")
|
103 |
+
# st.dataframe(merged_df.head())
|
104 |
+
# # Step 1: Drop rows with nulls in key columns
|
105 |
+
# merged_df = merged_df.dropna(subset=['night', 'event', 'event_timestamp'])
|
106 |
+
|
107 |
+
# # Step 2: Reset index (also avoid inplace)
|
108 |
+
# merged_df = merged_df.reset_index(drop=True)
|
109 |
+
# merged_df['event_timestamp'] = pd.to_datetime(merged_df['event_timestamp'], format='%Y-%m-%dT%H:%M:%S%z',utc=True)
|
110 |
+
# merged_df['sensor_timestamp'] = pd.to_datetime(merged_df['sensor_timestamp'], format='%Y-%m-%dT%H:%M:%S%z',utc=True)
|
111 |
+
# merged_df['sleep_duration_hrs'] = (merged_df['sensor_timestamp'] - merged_df['event_timestamp']).dt.total_seconds() / 3600
|
112 |
+
|
113 |
+
# le = LabelEncoder()
|
114 |
+
# merged_df['series_id'] = le.fit_transform(merged_df['series_id'])
|
115 |
+
# merged_df['event'] = le.fit_transform(merged_df['event']) # Target label
|
116 |
+
|
117 |
+
# # columns_to_drop = ['sensor_timestamp', 'series_id', 'event_timestamp','night','sleep_duration_hrs','step']
|
118 |
+
|
119 |
+
# # Drop specified columns and define features (X) and target (y)
|
120 |
+
# # df_cleaned = merged_df.drop([col for col in columns_to_drop if col in merged_df.columns], axis=1)
|
121 |
+
|
122 |
+
# # X = df_cleaned.drop('event', axis=1)
|
123 |
+
# # y = df_cleaned['event']
|
124 |
+
|
125 |
+
# X = merged_df[['anglez', 'enmo']]
|
126 |
+
# y = merged_df['event']
|
127 |
+
|
128 |
+
# # Train-test split
|
129 |
+
# X_train, X_test, y_train, y_test = train_test_split(
|
130 |
+
# X, y, test_size=0.2
|
131 |
+
# )
|
132 |
+
|
133 |
+
# # 6. Scale features (optional for XGBoost but good practice)
|
134 |
+
# scaler = StandardScaler()
|
135 |
+
# X_train_scaled = scaler.fit_transform(X_train)
|
136 |
+
# X_test_scaled = scaler.transform(X_test)
|
137 |
+
|
138 |
+
# # 7. Train XGBoost model
|
139 |
+
# # model = XGBClassifier(n_estimators=100, max_depth=3, learning_rate=0.1, reg_alpha=1, reg_lambda=1, eval_metric='logloss')
|
140 |
+
# model = XGBClassifier()
|
141 |
+
# model.fit(X_train_scaled, y_train)
|
142 |
+
|
143 |
+
# # 8. Predict and Evaluate
|
144 |
+
# y_pred = model.predict(X_test_scaled)
|
145 |
+
# y_proba = model.predict_proba(X_test_scaled)
|
146 |
+
|
147 |
+
# accuracy = accuracy_score(y_test, y_pred)
|
148 |
+
# f1 = f1_score(y_test, y_pred, average='macro')
|
149 |
+
|
150 |
+
# if y_proba.shape[1] == 2:
|
151 |
+
# roc = roc_auc_score(y_test, y_proba[:, 1])
|
152 |
+
# else:
|
153 |
+
# roc = roc_auc_score(y_test, y_proba, multi_class='ovo', average='macro')
|
154 |
+
|
155 |
+
|
156 |
+
# # --- Display Metrics ---
|
157 |
+
# # st.subheader("Model Performance")
|
158 |
+
# # st.metric("Accuracy", f"{accuracy:.4f}")
|
159 |
+
# # st.metric("F1 Score", f"{f1:.4f}")
|
160 |
+
# # st.metric("ROC AUC Score", f"{roc:.4f}")
|
161 |
+
|
162 |
+
# # Create a DataFrame for metrics
|
163 |
+
# # import pandas as pd
|
164 |
+
|
165 |
+
# st.subheader("Model Performance")
|
166 |
+
|
167 |
+
# # Create a DataFrame for metrics
|
168 |
+
# metrics_df = pd.DataFrame({
|
169 |
+
# "Metric": ["Accuracy", "F1 Score", "ROC AUC Score"],
|
170 |
+
# "Value": [f"{accuracy:.4f}", f"{f1:.4f}", f"{roc:.4f}"]
|
171 |
+
# })
|
172 |
+
|
173 |
+
# # Display as table
|
174 |
+
# st.table(metrics_df)
|
175 |
+
|
176 |
+
# counts = merged_df["event"].value_counts()
|
177 |
+
# st.markdown("**Event Value Counts:**")
|
178 |
+
# st.markdown(counts.to_string())
|
179 |
+
|
180 |
+
# # --- Predict User Input ---
|
181 |
+
# st.subheader("Predict Sleep Event")
|
182 |
+
# anglez = st.number_input("Enter anglez:", value=27.8800,format="%.4f")
|
183 |
+
# enmo = st.number_input("Enter enmo:", value=0.0000,format="%.4f")
|
184 |
+
|
185 |
+
# if st.button("Predict Sleep Event"):
|
186 |
+
# input_data = np.array([[anglez, enmo]])
|
187 |
+
# input_scaled = scaler.transform(input_data)
|
188 |
+
# prediction = model.predict(input_scaled)[0]
|
189 |
+
# label = le.inverse_transform([prediction])[0]
|
190 |
+
# st.success(f"Predicted Event: {label}")
|
191 |
+
# Model metrics, formatted to four decimals and shown as a static table.
st.subheader("๐ Model Performance")
metric_rows = [
    ("Accuracy", accuracy),
    ("F1 Score", f1),
    ("ROC AUC Score", roc),
]
metrics_df = pd.DataFrame({
    "Metric": [label for label, _ in metric_rows],
    "Value": [f"{score:.4f}" for _, score in metric_rows],
})
st.table(metrics_df)

# Class balance: number of rows per event label.
st.subheader("๐ Event Value Counts")
value_counts_df = (
    merged_df["event"]
    .value_counts()
    .rename_axis("Event")
    .reset_index(name="Count")
)
st.dataframe(value_counts_df)
|