|
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import streamlit as st
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.utils.multiclass import unique_labels
|
|
|
|
|
# App title.
st.title("Cybersecurity Model Training App")

# Sidebar heading followed by a CSV-only upload control; the result is
# compared against None further down before any data is read.
st.sidebar.header("Upload Dataset and Parameters")
uploaded_file = st.sidebar.file_uploader(
    label="Upload your CSV dataset",
    type=["csv"],
)
|
|
|
|
|
def load_data(file):
    """Read a CSV into a DataFrame, show a short preview, and return it.

    Args:
        file: Anything ``pandas.read_csv`` accepts (here, the uploaded file).

    Returns:
        The full DataFrame parsed from ``file``.
    """
    frame = pd.read_csv(file)

    # Only the head is displayed; the caller receives the complete frame.
    st.write("Dataset Preview:")
    preview = frame.head()
    st.dataframe(preview)

    return frame
|
|
|
|
|
def _collect_hyperparameters(model_choice):
    """Render the sidebar controls for ``model_choice`` and return their values.

    The returned dict is keyed by the estimator's constructor argument names so
    it can be unpacked directly in ``_build_model``. An unrecognised choice
    renders nothing and yields an empty dict.
    """
    if model_choice == "Random Forest":
        # Dict displays evaluate top to bottom, so the widgets keep this order.
        return {
            "n_estimators": st.sidebar.slider("Number of trees in the forest", 10, 100, 50),
            "max_depth": st.sidebar.slider("Maximum depth of the tree", 1, 20, 10),
        }
    if model_choice == "Support Vector Machine":
        return {
            "C": st.sidebar.slider("Regularization parameter (C)", 0.01, 10.0, 1.0),
            "kernel": st.sidebar.selectbox("Kernel type", ["linear", "rbf", "poly"]),
        }
    if model_choice == "Logistic Regression":
        return {
            "C": st.sidebar.slider("Inverse of regularization strength (C)", 0.01, 10.0, 1.0),
        }
    return {}


def _build_model(model_choice, params):
    """Return an unfitted classifier for ``model_choice`` configured with ``params``.

    Raises:
        ValueError: If ``model_choice`` is not one of the supported models.
    """
    if model_choice == "Random Forest":
        return RandomForestClassifier(random_state=42, **params)
    if model_choice == "Support Vector Machine":
        # NOTE(review): probability=True adds an internal cross-validation pass
        # during fit, and nothing here calls predict_proba -- consider dropping.
        return SVC(probability=True, random_state=42, **params)
    if model_choice == "Logistic Regression":
        return LogisticRegression(max_iter=1000, random_state=42, **params)
    raise ValueError(f"Unsupported model choice: {model_choice!r}")


def _render_confusion_matrix(y_true, y_pred):
    """Plot the confusion matrix of ``y_pred`` against ``y_true`` as a heatmap."""
    # Derive one label set and use it for both the matrix and the tick labels.
    # Using model.classes_ for the ticks would misalign them whenever a class
    # seen in training is absent from both the test targets and predictions.
    labels = unique_labels(y_true, y_pred)
    cm = confusion_matrix(y_true, y_pred, labels=labels)

    fig, ax = plt.subplots()
    try:
        # Draw on the explicit Axes rather than pyplot's implicit current one.
        sns.heatmap(
            cm,
            annot=True,
            fmt="d",
            cmap="Blues",
            xticklabels=labels,
            yticklabels=labels,
            ax=ax,
        )
        ax.set_xlabel("Predicted")
        ax.set_ylabel("Actual")
        st.pyplot(fig)
    finally:
        # pyplot keeps every figure it creates until it is explicitly closed.
        plt.close(fig)


def _train_and_report(data, features, target, test_size, model_choice, params):
    """Split ``data``, fit the chosen model, and render its evaluation.

    Args:
        data: DataFrame holding both the feature columns and the target column.
        features: Non-empty list of column names used as model inputs.
        target: Column name of the label to predict.
        test_size: Fraction of rows held out for evaluation.
        model_choice: One of the model names offered in the sidebar.
        params: Constructor keyword arguments from ``_collect_hyperparameters``.
    """
    X = data[features]
    y = data[target]
    model = _build_model(model_choice, params)

    try:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=42
        )
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
    except ValueError as exc:
        # scikit-learn reports unusable input (e.g. non-numeric or missing
        # values) as ValueError; show the message instead of a raw traceback.
        st.error(f"Model training failed: {exc}")
        return

    st.subheader("Model Evaluation")
    st.text("Classification Report:")
    st.text(classification_report(y_test, y_pred))

    st.text("Confusion Matrix:")
    _render_confusion_matrix(y_test, y_pred)

    # Of the three models, only the random forest is asked for importances.
    if model_choice == "Random Forest":
        st.subheader("Feature Importance")
        feature_importance = pd.DataFrame(
            {'Feature': features, 'Importance': model.feature_importances_}
        )
        feature_importance = feature_importance.sort_values(by='Importance', ascending=False)
        st.bar_chart(feature_importance.set_index('Feature'))


if uploaded_file is not None:
    data = load_data(uploaded_file)

    # Column selection: the chosen target is excluded from the feature options.
    target = st.sidebar.selectbox("Select the target variable", data.columns)
    features = st.sidebar.multiselect(
        "Select feature variables",
        [col for col in data.columns if col != target],
    )

    test_size = st.sidebar.slider("Test size ratio", 0.1, 0.5, 0.3)

    model_choice = st.sidebar.selectbox(
        "Select Model",
        ["Random Forest", "Support Vector Machine", "Logistic Regression"],
    )
    hyperparameters = _collect_hyperparameters(model_choice)

    if st.sidebar.button("Train Model"):
        if not features:
            st.warning("Please select at least one feature.")
        else:
            _train_and_report(
                data, features, target, test_size, model_choice, hyperparameters
            )
else:
    st.write("Please upload a CSV file to get started.")
|
|
|
|
|
# Static reference links appended to the sidebar.
st.sidebar.header("Additional Resources")

# The literal holds only the markdown list; the stray trailing pipes that had
# crept into it would have been rendered as literal "|" characters.
st.sidebar.markdown("""
- [Streamlit Documentation](https://docs.streamlit.io/)
- [Scikit-learn Documentation](https://scikit-learn.org/stable/user_guide.html)
- [Cybersecurity Datasets](https://www.kaggle.com/datasets?search=cybersecurity)
""")