Canstralian's picture
Update app.py
5d05d60 verified
raw
history blame
4.41 kB
import streamlit as st
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
# Main-panel heading for the app.
st.title("Cybersecurity Model Training App")

# Elements created inside this context are placed in the sidebar.
with st.sidebar:
    st.header("Upload Dataset and Parameters")
    # Only .csv uploads are accepted; the rest of the script treats a None
    # value here as "no file uploaded yet".
    uploaded_file = st.file_uploader("Upload your CSV dataset", type=["csv"])
# Function to load and display dataset
def load_data(file):
    """Parse an uploaded CSV and show a short preview in the app.

    Args:
        file: A path or file-like object that ``pandas.read_csv`` accepts.

    Returns:
        The full parsed ``pandas.DataFrame``.
    """
    frame = pd.read_csv(file)

    # Only the first rows are rendered; the whole frame is still returned.
    st.write("Dataset Preview:")
    preview = frame.head()
    st.dataframe(preview)

    return frame
# Load dataset if file is uploaded
def _render_evaluation(model, X_test, y_test):
    """Show the classification report and confusion-matrix heatmap.

    Args:
        model: A fitted scikit-learn classifier exposing ``predict`` and
            ``classes_``.
        X_test: Held-out feature rows to predict on.
        y_test: True labels matching ``X_test``.
    """
    # Imported lazily, like the estimator imports in the training flow below.
    from sklearn.utils.multiclass import unique_labels

    y_pred = model.predict(X_test)

    st.subheader("Model Evaluation")
    st.text("Classification Report:")
    st.text(classification_report(y_test, y_pred))

    st.text("Confusion Matrix:")
    # Use one explicit label list for both the matrix and the tick labels.
    # Otherwise the matrix is sized from the labels present in y_test/y_pred,
    # which can differ from model.classes_ when a class is missing from one
    # side of the split, leaving the axes mislabelled.
    labels = unique_labels(model.classes_, y_test)
    cm = confusion_matrix(y_test, y_pred, labels=labels)

    fig, ax = plt.subplots()
    # Draw on this figure's own axes instead of pyplot's implicit current axes.
    sns.heatmap(
        cm,
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=labels,
        yticklabels=labels,
        ax=ax,
    )
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")
    st.pyplot(fig)
    # pyplot keeps every figure it creates alive until it is closed; release
    # this one so figures do not pile up across script reruns.
    plt.close(fig)


# Main flow: once a CSV is uploaded, collect the training configuration from
# the sidebar and, on request, train the chosen classifier and report results.
if uploaded_file is not None:
    data = load_data(uploaded_file)

    # Column used as the prediction target.
    target = st.sidebar.selectbox("Select the target variable", data.columns)

    # Candidate features are every column except the chosen target.
    features = st.sidebar.multiselect(
        "Select feature variables",
        [col for col in data.columns if col != target],
    )

    # Fraction of rows held out for evaluation (min 0.1, max 0.5, default 0.3).
    test_size = st.sidebar.slider("Test size ratio", 0.1, 0.5, 0.3)

    model_choice = st.sidebar.selectbox(
        "Select Model",
        ["Random Forest", "Support Vector Machine", "Logistic Regression"],
    )

    # Hyperparameter widgets; only those for the selected model are shown,
    # so only the matching variables exist afterwards.
    if model_choice == "Random Forest":
        n_estimators = st.sidebar.slider("Number of trees in the forest", 10, 100, 50)
        max_depth = st.sidebar.slider("Maximum depth of the tree", 1, 20, 10)
    elif model_choice == "Support Vector Machine":
        c_value = st.sidebar.slider("Regularization parameter (C)", 0.01, 10.0, 1.0)
        kernel = st.sidebar.selectbox("Kernel type", ["linear", "rbf", "poly"])
    elif model_choice == "Logistic Regression":
        c_value = st.sidebar.slider("Inverse of regularization strength (C)", 0.01, 10.0, 1.0)

    if st.sidebar.button("Train Model"):
        if not features:
            st.warning("Please select at least one feature.")
        else:
            X = data[features]
            y = data[target]

            # Fixed random_state keeps the split reproducible between runs.
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=test_size, random_state=42
            )

            # Build the estimator that matches the sidebar selection.
            if model_choice == "Random Forest":
                model = RandomForestClassifier(
                    n_estimators=n_estimators, max_depth=max_depth, random_state=42
                )
            elif model_choice == "Support Vector Machine":
                from sklearn.svm import SVC

                model = SVC(C=c_value, kernel=kernel, probability=True, random_state=42)
            elif model_choice == "Logistic Regression":
                from sklearn.linear_model import LogisticRegression

                model = LogisticRegression(C=c_value, max_iter=1000, random_state=42)

            try:
                model.fit(X_train, y_train)
            except ValueError as exc:
                # scikit-learn rejects data it cannot fit (for example
                # non-numeric or missing values) with ValueError; show the
                # reason in the app instead of a raw traceback.
                st.error(f"Model training failed: {exc}")
            else:
                _render_evaluation(model, X_test, y_test)

                # Feature importances are only reported for the random forest.
                if model_choice == "Random Forest":
                    st.subheader("Feature Importance")
                    feature_importance = pd.DataFrame(
                        {"Feature": features, "Importance": model.feature_importances_}
                    ).sort_values(by="Importance", ascending=False)
                    st.bar_chart(feature_importance.set_index("Feature"))
# Instructions when no file is uploaded
else:
    st.write("Please upload a CSV file to get started.")
# Reference links listed at the bottom of the sidebar, one markdown bullet each.
resource_links = (
    "- [Streamlit Documentation](https://docs.streamlit.io/)",
    "- [Scikit-learn Documentation](https://scikit-learn.org/stable/user_guide.html)",
    "- [Cybersecurity Datasets](https://www.kaggle.com/datasets?search=cybersecurity)",
)

st.sidebar.header("Additional Resources")
# Leading and trailing newlines keep the rendered markdown text unchanged.
st.sidebar.markdown("\n" + "\n".join(resource_links) + "\n")