Spaces:

varalakshmi55
/

SleepDetectionApp

Sleeping

App Files Files Community

varalakshmi55 committed on May 23

Commit

ea9f040

verified ·

1 Parent(s): dd17fed

Upload 2 files

Browse files

Files changed (2) hide show

pages/Dashboard.py +338 -0
pages/predict page.py +204 -0

pages/Dashboard.py ADDED Viewed

	@@ -0,0 +1,338 @@

+import streamlit as st
+import pandas as pd
+import seaborn as sns
+import matplotlib.pyplot as plt
+from Utility.data_loader  import load_train_series,load_train_events,load_sample_submission,load_test_series
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import LabelEncoder, StandardScaler
+from xgboost import XGBClassifier  # or XGBRegressor depending on your task
+import xgboost as xgb
+import numpy as np
+@st.cache_data
+def load_sampled_data():
+    # df3 = pd.read_parquet("train_series.parquet", columns=['series_id', 'step', 'anglez', 'enmo'])
+    # df4 = pd.read_parquet("test_series.parquet", columns=['series_id', 'step', 'anglez', 'enmo'])
+    df2 = pd.read_csv("train_events.csv")
+    # Sample safely based on available data
+    # df3_sample = df3.sample(n=min(5_000_000, len(df3)), random_state=42)
+    # df4_sample = df4.sample(n=min(1_000_000, len(df4)), random_state=42)
+    return df2
+# Load
+# df3, df4, df2 = load_sampled_data()
+df2 = load_sampled_data()
+# df = pd.concat([df3, df4], axis=0, ignore_index=True)
+# merged_df = pd.merge(df, df2, on=['series_id', 'step'], how='inner')
+merged_df = pd.read_csv("merged_df.csv")
+# Rename timestamp columns if they exist
+if 'timestamp_x' in merged_df.columns:
+    merged_df.rename(columns={'timestamp_x': 'sensor_timestamp'}, inplace=True)
+if 'timestamp_y' in merged_df.columns:
+    merged_df.rename(columns={'timestamp_y': 'event_timestamp'}, inplace=True)
+st.title("📊 Step Distribution Analysis")
+# Layout: 2 columns
+col1, col2 = st.columns([1, 1])  # Equal width
+# ----- Column 1: Boxplot -----
+with col1:
+    st.subheader("📦 Boxplot of Step")
+    fig, ax = plt.subplots(figsize=(6, 4))  # Adjusted for better visibility
+    sns.boxplot(x=df2['step'], ax=ax, color='steelblue')
+    ax.set_title("Distribution of Step Count", fontsize=14)
+    ax.set_xlabel("Step", fontsize=12)
+    st.pyplot (fig)
+# ----- Column 2: Insights -----
+with col2:
+    st.subheader("🧠 Insights from the Boxplot")
+    st.markdown("""
+        <small>
+        <b>Central Tendency:</b><br>
+        - The <b>median</b> is close to the center of the box, suggesting a fairly symmetric distribution within the interquartile range (IQR).<br>
+        <b>Spread:</b><br>
+        - A <b>wide IQR</b> indicates significant variability in the step counts across sessions.<br>
+        <b>Outliers:</b><br>
+        - The <b>dots on the right</b> are outliers — representing very high step counts.<br>
+        - These could reflect either:<br>
+        - <b>Legitimate long-duration recordings</b><br>
+        - Or <b>data quality issues</b> (e.g., duplication or sensor errors)
+        <b>Distribution Shape:</b><br>
+        - A <b>longer left whisker</b> implies a <b>left-skewed</b> distribution.<br>
+        - Most sessions have <b>lower step values</b>, with a few very high outliers.
+        </small>
+        """, unsafe_allow_html=True)
+#st.write("1. Data Visualization - Scatter Plot (feature vs feature or vs target)")
+# Assume merged_df is already defined or loaded
+df_sample = merged_df  # or use df_sample = merged_df.sample(n=50000) to downsample
+st.subheader("Scatter Plot: anglez vs enmo")
+col1, col2 = st.columns([1, 1])
+with col1:
+    #st.subheader("Scatter Plot: anglez vs enmo")
+    # fig, ax = plt.subplots(figsize=(6, 4))
+    # sns.scatterplot(x=df['anglez'], y=df['enmo'], ax=ax)
+    # ax.set_title("Scatter Plot: anglez vs enmo")
+    # st.pyplot(fig)
+    # Create the plot
+    fig, ax = plt.subplots(figsize=(6, 4))
+    sns.scatterplot(x='anglez', y='enmo', data=df_sample, ax=ax)
+    ax.set_title("Scatter Plot: anglez vs enmo")
+    # Display in Streamlit
+    st.pyplot(fig)
+with col2:
+    st.markdown("""
+    <small>
+    <b>1. Clustered Points:</b> Most `enmo` values are near 0, suggesting low movement.<br>
+    <b>2. Symmetry:</b> Spread is balanced on both sides of anglez (±), indicating no directional bias.<br>
+    <b>3. Weak Correlation:</b> No visible trend, suggesting independence between `anglez` and `enmo`.<br>
+    <b>4. Outliers:</b> A few high `enmo` points may indicate sudden or intense movement.<br>
+    <b>5. Interpretation:</b> Most data reflects light activity or rest, regardless of body orientation.
+    </small>
+    """, unsafe_allow_html=True)
+# df_sample = merged_df.sample(n=10000)  # adjust sample size for performance
+# # Subheader
+# st.subheader("Pair Plot of Features")
+# # Create pairplot
+# fig = sns.pairplot(df_sample[['anglez', 'enmo', 'step']])
+# fig.fig.suptitle("Pair Plot of Features", y=1.02)
+# # Display in Streamlit
+# st.pyplot(fig)
+# Define columns to plot
+col1, col2 = st.columns([1, 1])  # Equal width
+# Column 1: Pair Plot
+with col1:
+    st.subheader("📈 Pair Plot of Features")
+    fig = sns.pairplot(merged_df[['anglez', 'enmo', 'step']])
+    st.pyplot(fig)
+# Column 2: Insights
+with col2:
+    st.subheader("🧠 Insights from Pair Plot")
+    st.markdown("""
+<div style='font-size: 14px'>
+### 📊 Distribution Insights:
+- **anglez**: Symmetric distribution peaking near -50 to 0.
+- **enmo**: Right-skewed, most values below 0.1.
+- **step**: Right-skewed, with a few large outliers.
+### 🔁 Pairwise Relationships:
+- **anglez vs enmo**: No clear trend; cone-like shape.
+- **anglez vs step**: No correlation; looks uniformly scattered.
+- **enmo vs step**: Clustered at low values. High steps sometimes with low enmo.
+### 💡 Summary:
+- Features appear largely **uncorrelated**.
+- Helps identify **data distributions** and potential **outliers**.
+- Can assist in **feature selection/engineering**.
+</div>
+""", unsafe_allow_html=True)
+# plot_columns = ['anglez', 'enmo', 'step']
+# # Safety check: make sure required columns exist
+# if all(col in merged_df.columns for col in plot_columns):
+#     # Check data size and sample accordingly
+#     max_rows = len(merged_df)
+#     sample_size = min(10000, max_rows)  # Don't exceed available rows
+#     df_sample = merged_df.sample(n=sample_size)
+#     # Subheader
+#     st.subheader("Pair Plot of Features")
+#     # Create pairplot
+#     fig = sns.pairplot(df_sample[plot_columns])
+#     fig.fig.suptitle("Pair Plot of Features", y=1.02)
+#     # Display in Streamlit
+#     st.pyplot(fig)
+# else:
+#     st.error("One or more required columns ('anglez', 'enmo', 'step') are missing in the dataset.")
+# Plot
+fig, axes = plt.subplots(1, 2, figsize=(14, 5))
+sns.histplot(df_sample['anglez'], kde=True, bins=50, ax=axes[0])
+axes[0].set_title("Distribution of anglez")
+sns.histplot(df_sample['enmo'], kde=True, bins=50, ax=axes[1])
+axes[1].set_title("Distribution of enmo")
+plt.tight_layout()
+st.pyplot(fig)
+# Show insights side by side
+col1, col2 = st.columns(2)
+with col1:
+    st.markdown("""
+    <div style='font-size: 14px'>
+    <h3> 📈 Distribution of `anglez`: </h3>
+    - The distribution is **roughly symmetric**, centered around **-50 to 0**.
+    - It resembles a **left-heavy bell shape**, suggesting:
+      - Most sensor angles were **tilted negatively**.
+      - Indicates a **natural resting position** or specific posture.
+    </div>
+    """, unsafe_allow_html=True)
+with col2:
+    st.markdown("""
+    <div style='font-size: 14px'>
+    <h3> 📉 Distribution of `enmo`: </h3>
+    - Highly **right-skewed** (sharp peak near zero).
+    - The majority of `enmo` values are **very small** (< 0.05), indicating:
+      - **Minimal movement or low activity** in most sessions.
+      - Few data points reflect **moderate to high movement**.
+    </div>
+    """, unsafe_allow_html=True)
+# st.write("Multicollinearity Check - Correlation Matrix")
+# features = ['anglez', 'enmo', 'step', 'night']
+# df_subset = merged_df[features]
+# # Streamlit title
+# st.subheader("Multicollinearity Check - Correlation Matrix")
+# # Calculate correlation matrix
+# corr_matrix = df_subset.corr()
+# # Plot heatmap
+# fig, ax = plt.subplots(figsize=(6, 4))
+# sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=ax)
+# ax.set_title("Correlation Matrix")
+# # Display in Streamlit
+# st.pyplot(fig)
+st.subheader("Multicollinearity Check - Correlation Matrix")
+# Select relevant features
+features = ['anglez', 'enmo', 'step', 'night']
+df_subset = merged_df[features]
+# Calculate correlation matrix
+corr_matrix = df_subset.corr()
+# Create plot
+fig, ax = plt.subplots(figsize=(6, 4))
+sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0, fmt=".3f", ax=ax)
+ax.set_title("Correlation Matrix")
+# Layout in two columns
+col1, col2 = st.columns(2)
+# Column 1: Heatmap
+with col1:
+    st.pyplot(fig)
+# Column 2: Textual Insights
+with col2:
+    st.markdown("""
+    ### 🔍 Insights from Correlation Matrix
+    - **`anglez` & `enmo`**:
+      🔸 Weak negative correlation (**-0.11**) — suggests minimal linear relationship.
+    - **`step` & `night`**:
+      ⚠️ Perfect correlation (**1.00**) — indicates **redundancy**, likely representing the same event in different forms.
+    - **Overall**:
+      ✅ Low multicollinearity across most features — safe for modeling.
+      📝 Recommend removing either `step` or `night` to reduce feature duplication.
+    """)
+# Encode
+le = LabelEncoder()
+merged_df['series_id'] = le.fit_transform(merged_df['series_id'])
+merged_df['event'] = le.fit_transform(merged_df['event'])
+# Drop columns with string or datetime values
+drop_cols = ['sensor_timestamp', 'event_timestamp', 'night', 'step', 'sleep_duration_hrs', 'series_id']
+df_cleaned = merged_df.drop(columns=[col for col in drop_cols if col in merged_df.columns])
+# Ensure only numeric features in X
+X = df_cleaned.drop('event', axis=1).select_dtypes(include=[np.number])
+y = merged_df['event']
+# Split and scale
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=27)
+st.subheader("Feature Importance")
+# Create model instance
+xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')  # example for classification
+# Fit the model
+xgb_model.fit(X_train, y_train)
+# Plot feature importance
+fig, ax = plt.subplots(figsize=(6, 4))
+xgb.plot_importance(xgb_model, ax=ax)
+ax.set_title("XGBoost Feature Importance")
+# Show in Streamlit
+st.subheader("XGBoost Feature Importance")
+col1, col2 = st.columns(2)
+# Column 1: Plot
+with col1:
+    st.pyplot(fig)
+    st.markdown("""
+#### 🚫 Low-Impact Features:
+- Features like `step` and `night` (excluded in this plot) showed **minimal or redundant contribution**.
+- 🔁 You may consider **removing** them to simplify the model.
+""")
+# Column 2: Insights
+with col2:
+ st.markdown("""
+<small>
+<h3> 🔍 XGBoost Feature Importance: Key Insights </h3>
+#### 📌 Top Features:
+- 🔹 **`anglez`** — Highest importance score (**1557**)
+- 🔹 **`enmo`** — Close second with score (**1546**)
+#### ✅ Summary:
+- Both `anglez` and `enmo` contribute **significantly** to the model.
+- Their high scores reflect **strong influence** in predicting the target variable.
+#### 💡 Interpretation:
+- These features likely capture **activity level** or **sleep posture** patterns.
+- Keeping both is **recommended** for accurate classification.
+</small>
+""", unsafe_allow_html=True)

pages/predict page.py ADDED Viewed

	@@ -0,0 +1,204 @@

+# app.py
+import streamlit as st
+import pandas as pd
+import numpy as np
+from sklearn.preprocessing import LabelEncoder, StandardScaler
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
+from xgboost import XGBClassifier
+st.title("🧠 Sleep Event Prediction")
+# --- Load and preprocess data ---
+merged_df = pd.read_csv("merged_df.csv")
+st.subheader("Raw Data Sample")
+st.dataframe(merged_df.head())
+# Drop nulls in important columns
+merged_df = merged_df.dropna(subset=['night', 'event', 'event_timestamp'])
+# Convert timestamps
+merged_df['event_timestamp'] = pd.to_datetime(merged_df['event_timestamp'], format='%Y-%m-%dT%H:%M:%S%z', utc=True)
+merged_df['sensor_timestamp'] = pd.to_datetime(merged_df['sensor_timestamp'], format='%Y-%m-%dT%H:%M:%S%z', utc=True)
+# Calculate duration
+merged_df['sleep_duration_hrs'] = (merged_df['sensor_timestamp'] - merged_df['event_timestamp']).dt.total_seconds() / 3600
+# Encode categorical columns
+le_event = LabelEncoder()
+merged_df['event_encoded'] = le_event.fit_transform(merged_df['event'])
+le_series = LabelEncoder()
+merged_df['series_id_encoded'] = le_series.fit_transform(merged_df['series_id'])
+# Select features
+X = merged_df[['anglez', 'enmo']]
+y = merged_df['event_encoded']
+# Train-test split
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
+# Scale features
+scaler = StandardScaler()
+X_train_scaled = scaler.fit_transform(X_train)
+X_test_scaled = scaler.transform(X_test)
+# Train model
+model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
+model.fit(X_train_scaled, y_train)
+# Evaluate model
+y_pred = model.predict(X_test_scaled)
+y_proba = model.predict_proba(X_test_scaled)
+accuracy = accuracy_score(y_test, y_pred)
+f1 = f1_score(y_test, y_pred, average='macro')
+# Handle binary or multiclass AUC
+if y_proba.shape[1] == 2:
+    roc = roc_auc_score(y_test, y_proba[:, 1])
+else:
+    roc = roc_auc_score(y_test, y_proba, multi_class='ovo', average='macro')
+# --- Predict User Input ---
+st.subheader("🔮 Predict Sleep Event")
+anglez = st.number_input("Enter anglez:", value=27.88, format="%.4f")
+enmo = st.number_input("Enter enmo:", value=0.00, format="%.4f")
+if st.button("Predict Sleep Event"):
+    input_data = np.array([[anglez, enmo]])
+    input_scaled = scaler.transform(input_data)
+    prediction = model.predict(input_scaled)[0]
+    predicted_label = le_event.inverse_transform([prediction])[0]
+    st.success(f"Predicted Sleep Event: {predicted_label}")
+# # app.py (your Streamlit file)
+# import streamlit as st
+# import numpy as np
+# # import pickle
+# from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
+# import pandas as pd
+# from sklearn.preprocessing import LabelEncoder,StandardScaler
+# from sklearn.model_selection import train_test_split
+# from xgboost import XGBClassifier
+# st.title("🧠 Sleep Event Prediction")
+# # --- Load Pickles ---
+# # @st.cache_resource
+# # def load_all():
+# #     with open("model.pkl", "rb") as f: model = pickle.load(f)
+# #     with open("scaler.pkl", "rb") as f: scaler = pickle.load(f)
+# #     with open("label_encoder.pkl", "rb") as f: le = pickle.load(f)
+# #     with open("X_test.pkl", "rb") as f: X_test = pickle.load(f)
+# #     with open("y_test.pkl", "rb") as f: y_test = pickle.load(f)
+# #     return model, scaler, le, X_test, y_test
+# merged_df=pd.read_csv("merged_df.csv")
+# st.dataframe(merged_df.head())
+# # Step 1: Drop rows with nulls in key columns
+# merged_df = merged_df.dropna(subset=['night', 'event', 'event_timestamp'])
+# # Step 2: Reset index (also avoid inplace)
+# merged_df = merged_df.reset_index(drop=True)
+# merged_df['event_timestamp'] = pd.to_datetime(merged_df['event_timestamp'], format='%Y-%m-%dT%H:%M:%S%z',utc=True)
+# merged_df['sensor_timestamp'] = pd.to_datetime(merged_df['sensor_timestamp'], format='%Y-%m-%dT%H:%M:%S%z',utc=True)
+# merged_df['sleep_duration_hrs'] = (merged_df['sensor_timestamp'] - merged_df['event_timestamp']).dt.total_seconds() / 3600
+# le = LabelEncoder()
+# merged_df['series_id'] = le.fit_transform(merged_df['series_id'])
+# merged_df['event'] = le.fit_transform(merged_df['event'])  # Target label
+# # columns_to_drop = ['sensor_timestamp', 'series_id', 'event_timestamp','night','sleep_duration_hrs','step']
+# # Drop specified columns and define features (X) and target (y)
+# # df_cleaned = merged_df.drop([col for col in columns_to_drop if col in merged_df.columns], axis=1)
+# # X = df_cleaned.drop('event', axis=1)
+# # y = df_cleaned['event']
+# X = merged_df[['anglez', 'enmo']]
+# y = merged_df['event']
+# # Train-test split
+# X_train, X_test, y_train, y_test = train_test_split(
+#     X, y, test_size=0.2
+# )
+# # 6. Scale features (optional for XGBoost but good practice)
+# scaler = StandardScaler()
+# X_train_scaled = scaler.fit_transform(X_train)
+# X_test_scaled = scaler.transform(X_test)
+# # 7. Train XGBoost model
+# # model = XGBClassifier(n_estimators=100, max_depth=3, learning_rate=0.1, reg_alpha=1, reg_lambda=1, eval_metric='logloss')
+# model = XGBClassifier()
+# model.fit(X_train_scaled, y_train)
+# # 8. Predict and Evaluate
+# y_pred = model.predict(X_test_scaled)
+# y_proba = model.predict_proba(X_test_scaled)
+# accuracy = accuracy_score(y_test, y_pred)
+# f1 = f1_score(y_test, y_pred, average='macro')
+# if y_proba.shape[1] == 2:
+#     roc = roc_auc_score(y_test, y_proba[:, 1])
+# else:
+#     roc = roc_auc_score(y_test, y_proba, multi_class='ovo', average='macro')
+# # --- Display Metrics ---
+# # st.subheader("Model Performance")
+# # st.metric("Accuracy", f"{accuracy:.4f}")
+# # st.metric("F1 Score", f"{f1:.4f}")
+# # st.metric("ROC AUC Score", f"{roc:.4f}")
+# # Create a DataFrame for metrics
+# # import pandas as pd
+# st.subheader("Model Performance")
+# # Create a DataFrame for metrics
+# metrics_df = pd.DataFrame({
+#     "Metric": ["Accuracy", "F1 Score", "ROC AUC Score"],
+#     "Value": [f"{accuracy:.4f}", f"{f1:.4f}", f"{roc:.4f}"]
+# })
+# # Display as table
+# st.table(metrics_df)
+# counts = merged_df["event"].value_counts()
+# st.markdown("**Event Value Counts:**")
+# st.markdown(counts.to_string())
+# # --- Predict User Input ---
+# st.subheader("Predict Sleep Event")
+# anglez = st.number_input("Enter anglez:", value=27.8800,format="%.4f")
+# enmo = st.number_input("Enter enmo:", value=0.0000,format="%.4f")
+# if st.button("Predict Sleep Event"):
+#     input_data = np.array([[anglez, enmo]])
+#     input_scaled = scaler.transform(input_data)
+#     prediction = model.predict(input_scaled)[0]
+#     label = le.inverse_transform([prediction])[0]
+#     st.success(f"Predicted Event: {label}")
+# Display class balance
+# Display metrics
+st.subheader("📊 Model Performance")
+metrics_df = pd.DataFrame({
+    "Metric": ["Accuracy", "F1 Score", "ROC AUC Score"],
+    "Value": [f"{accuracy:.4f}", f"{f1:.4f}", f"{roc:.4f}"]
+})
+st.table(metrics_df)
+st.subheader("📈 Event Value Counts")
+value_counts_df = merged_df["event"].value_counts().reset_index()
+value_counts_df.columns = ["Event", "Count"]
+st.dataframe(value_counts_df)