Spaces:

pujanpaudel
/

cal-housing

Sleeping

App Files Files Community

cal-housing / src /model.py

pujanpaudel

Update src/model.py

d77c50a verified 23 days ago

raw

history blame contribute delete

2.97 kB

	import sys
	sys.path.append('..')

	import pandas as pd
	import os
	import random
	import joblib
	from time import perf_counter

	from preprocessing import outlier_detection,box_cox_transformation
	from sklearn.linear_model import LinearRegression
	from sklearn.svm import LinearSVR
	from sklearn.ensemble import RandomForestRegressor
	from sklearn.ensemble import GradientBoostingRegressor


	from sklearn.metrics import r2_score,mean_squared_error
	from sklearn.model_selection import train_test_split,KFold,cross_val_score
	from sklearn.preprocessing import StandardScaler

	# Location of the housing dataset. This is a relative path, so it is resolved
	# against the current working directory of the process.
	csv_path = "src/data.csv"

	# Load the raw table and collect every column label for the helpers below.
	df = pd.read_csv(csv_path)
	columns = list(df.columns)

	# Both helpers come from the project's `preprocessing` module. Judging by
	# their names they apply a Box-Cox transform and then handle outliers --
	# TODO confirm against preprocessing.py, including the role of the
	# `df["longitude"]` argument and the meaning of `lambda_values`.
	new_df, lambda_values = box_cox_transformation(df, df["longitude"], columns=columns)
	df_tuned = outlier_detection(new_df, columns)

	# print(new_df.head())
	def main():
		"""Train four regressors, print their test metrics, and save the forest.

		Reads the module-level ``df_tuned`` frame, uses ``medianHouseValue`` as
		the target and every other column as a feature, and holds out 20% of
		the rows for testing (``random_state=44``). Features are standardised
		with a ``StandardScaler`` fitted on the training rows only.

		Each model is fitted on the scaled training data and its mean squared
		error and R-squared on the held-out rows are printed. Finally the
		fitted random forest and the fitted scaler are written with ``joblib``
		to ``ranfor_model.joblib`` and ``scaler.joblib`` (relative paths, so
		they land in the current working directory).
		"""
		target_column = "medianHouseValue"
		features = df_tuned.drop(target_column, axis=1).values
		target = df_tuned[target_column].values

		features_train, features_test, target_train, target_test = train_test_split(
			features, target, test_size=0.2, random_state=44
		)

		# Fit the scaler on the training rows only, then reuse its statistics
		# for the held-out rows.
		scaler = StandardScaler()
		train_scaled = scaler.fit_transform(features_train)
		test_scaled = scaler.transform(features_test)

		linear = LinearRegression()
		support_vector = LinearSVR(max_iter=10000, C=11, random_state=42)
		forest = RandomForestRegressor(n_estimators=102, random_state=42)
		boosted = GradientBoostingRegressor(
			n_estimators=90, learning_rate=0.2, max_depth=3, random_state=42
		)

		# Evaluation order is fixed so the printed report is stable.
		for model in (linear, support_vector, forest, boosted):
			model.fit(train_scaled, target_train)
			predictions = model.predict(test_scaled)

			mse = mean_squared_error(target_test, predictions)
			r2 = r2_score(target_test, predictions)
			print(f"Performance of {model} :- ")
			print(f"Mean Squared Error: {mse}")
			print(f"R-squared: {r2}\n")

		# NOTE: a 5-fold cross-validation of the random forest (KFold with
		# shuffle=True, random_state=42, and cross_val_score scored on
		# 'neg_mean_squared_error' and 'r2' over the fully scaled data) was
		# prototyped here and is currently disabled.

		model_filename = "ranfor_model.joblib"
		scaler_filename = "scaler.joblib"

		# Persist the random forest together with the scaler it was trained with.
		joblib.dump(forest, model_filename)
		joblib.dump(scaler, scaler_filename)

		print(f"Random forest model saved to {model_filename}")
		print(f"Scaler saved to {scaler_filename}")

	# Run the training pipeline only when this file is executed as a script,
	# not when it is imported as a module.
	if __name__ == "__main__":
		main()