Spaces:
Sleeping
Sleeping
Upload 5 files
Browse files- src/model.py +99 -0
- src/predict.py +72 -0
- src/preprocessing.py +95 -0
- src/ranfor_model.joblib +3 -0
- src/scaler.joblib +3 -0
src/model.py
ADDED
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import sys
sys.path.append('..')

import pandas as pd
import os
import random
import joblib
from time import perf_counter

from src.preprocessing import outlier_detection,box_cox_transformation
from sklearn.linear_model import LinearRegression
from sklearn.svm import LinearSVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

# NOTE(review): random, perf_counter, KFold and cross_val_score are currently
# unused (left over from a cross-validation experiment); kept to avoid
# breaking anything else that may rely on this module's namespace.
from sklearn.metrics import r2_score,mean_squared_error
from sklearn.model_selection import train_test_split,KFold,cross_val_score
from sklearn.preprocessing import StandardScaler

# Resolve the data file relative to this script so the module works no
# matter what the current working directory is.
current_script_dir = os.path.dirname(__file__)
csv_path = os.path.join(current_script_dir, "../data/cal_housing.csv")

df = pd.read_csv(csv_path)
columns = list(df.columns)  # plain copy; a pass-through comprehension is unidiomatic

# NOTE: the statements below run at import time (predict.py imports
# `lambda_values` from this module), so importing this module loads and
# transforms the full dataset.  "longitude" is passed separately because it
# contains non-positive values and must be shifted before Box-Cox.
new_df, lambda_values = box_cox_transformation(df, df["longitude"], columns=columns)
df_tuned = outlier_detection(new_df, columns)
+
def main():
    """Train four regressors on the tuned housing data, print test-set
    MSE / R-squared for each, and persist the random-forest model and the
    fitted scaler next to this script.
    """
    X = df_tuned.drop("medianHouseValue", axis=1).values
    y = df_tuned["medianHouseValue"].values

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=44)

    # Fit the scaler on the training split only, to avoid test-set leakage.
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    LinReg = LinearRegression()
    svm = LinearSVR(max_iter=10000, C=11, random_state=42)
    ranfor = RandomForestRegressor(n_estimators=102, random_state=42)
    gradboost = GradientBoostingRegressor(n_estimators=90, learning_rate=0.2, max_depth=3, random_state=42)

    models = [LinReg, svm, ranfor, gradboost]

    for model in models:
        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)

        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        print(f"Performance of {model} :- ")
        print(f"Mean Squared Error: {mse}")
        print(f"R-squared: {r2}\n")

    # BUG FIX: save next to this script.  predict.py resolves these artifacts
    # via os.path.dirname(__file__), so saving into the bare CWD (the old
    # behaviour) broke prediction whenever training was launched from any
    # other directory.
    model_filename = os.path.join(current_script_dir, "ranfor_model.joblib")
    scaler_filename = os.path.join(current_script_dir, "scaler.joblib")

    # Persist the best model and the scaler object for use by predict.py.
    joblib.dump(ranfor, model_filename)
    joblib.dump(scaler, scaler_filename)

    print(f"Random forest model saved to {model_filename}")
    print(f"Scaler saved to {scaler_filename}")
|
97 |
+
|
98 |
+
# Train only when executed as a script: predict.py imports this module for
# its fitted `lambda_values` and must not trigger a full training run.
if __name__ == "__main__":
    main()
|
src/predict.py
ADDED
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
import sys
sys.path.append('..')

import joblib
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
from scipy.special import inv_boxcox

# BUG FIX: model.py lives in src/, not models/ — the old
# `from models.model import lambda_values` raised ModuleNotFoundError.
# This matches the `from src.preprocessing import ...` style used below.
from src.model import lambda_values
from src.preprocessing import boxcox

# Trained artifacts are stored next to this script (typo "scirpt" fixed).
current_script_dir = os.path.dirname(__file__)

model_path = os.path.join(current_script_dir, "ranfor_model.joblib")
scaler_path = os.path.join(current_script_dir, "scaler.joblib")
|
18 |
+
|
19 |
+
def load_model(path=model_path):
    """Load a persisted regressor from `path`.

    Defaults to the bundled random-forest model saved by model.py, so
    existing zero-argument callers are unaffected.
    """
    model = joblib.load(path)
    return model
|
22 |
+
|
23 |
+
def load_std_scaler(path=scaler_path):
    """Load the persisted StandardScaler from `path`.

    Defaults to the scaler fitted during training in model.py, so existing
    zero-argument callers are unaffected.
    """
    scaler = joblib.load(path)
    return scaler
|
26 |
+
|
27 |
+
|
28 |
+
# Same shift applied to "longitude" during training (see
# box_cox_transformation): |column minimum| plus a small positive constant.
constant = 1e-5
# NOTE(review): hard-coded minimum of the training data's longitude column —
# must stay in sync with data/cal_housing.csv.  TODO: persist this value
# alongside the model instead of duplicating it here.
min_value_longitude = -124.35

# Feature order expected by the model (target "medianHouseValue" excluded).
columns = ["longitude","latitude","housingMedianAge","totalRooms","totalBedrooms","population","households","medianIncome"]

# Per-column Box-Cox lambdas in training insertion order; the final entry is
# the target's lambda, used to invert predictions in make_prediction.
lamda_values = list(lambda_values.values())
|
34 |
+
|
35 |
+
|
36 |
+
def make_input(input_data:list,lamda_values:list,scaler) -> list:
|
37 |
+
|
38 |
+
input_data[0] += abs(min_value_longitude) + constant
|
39 |
+
transformed_data = []
|
40 |
+
|
41 |
+
#since input_data has 8 elements, the element of lamda_values(for medianHouseValue)
|
42 |
+
#will be ignored in for loop due to the zip function
|
43 |
+
|
44 |
+
for data,lamda_value in zip(input_data,lamda_values):
|
45 |
+
transformed_data.append(boxcox(data,lmbda=lamda_value))
|
46 |
+
|
47 |
+
transformed_data = np.array(transformed_data).reshape(1,-1)
|
48 |
+
transformed_data_scaled = scaler.transform(transformed_data)
|
49 |
+
|
50 |
+
return transformed_data_scaled
|
51 |
+
|
52 |
+
def make_prediction(input_data:list,model) -> list:
|
53 |
+
|
54 |
+
pred_value_transformed = model.predict(input_data)
|
55 |
+
pred_value = inv_boxcox(pred_value_transformed,lamda_values[-1])
|
56 |
+
return pred_value
|
57 |
+
|
58 |
+
|
59 |
+
|
60 |
+
if __name__ =="__main__":
|
61 |
+
actual_output = 52900
|
62 |
+
new_data = [-1.1906e+02, 3.6150e+01, 2.5000e+01, 2.4020e+03, 4.7800e+02,
|
63 |
+
1.5270e+03, 4.6100e+02, 2.3194e+00]
|
64 |
+
scaler = load_std_scaler()
|
65 |
+
|
66 |
+
tranformed_data = make_input(new_data,lamda_values=lamda_values,scaler=scaler)
|
67 |
+
|
68 |
+
model = load_model()
|
69 |
+
|
70 |
+
pred_value = make_prediction(tranformed_data,model=model)
|
71 |
+
print(pred_value)
|
72 |
+
print(actual_output)
|
src/preprocessing.py
ADDED
@@ -0,0 +1,95 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import matplotlib.pyplot as plt
|
3 |
+
import numpy as np
|
4 |
+
from scipy.stats import boxcox,zscore
|
5 |
+
from scipy.special import inv_boxcox
|
6 |
+
import os
|
7 |
+
|
8 |
+
#checking for null values
|
9 |
+
# df.isna().sum()
|
10 |
+
|
11 |
+
# df.info()
|
12 |
+
|
13 |
+
def histogram(df:pd.DataFrame,nrows:int,ncols:int,figsize:tuple,columns:list):
|
14 |
+
fig,axes = plt.subplots(nrows=nrows,ncols=ncols,figsize=figsize)
|
15 |
+
axes = axes.flatten()
|
16 |
+
for i,ax in enumerate(axes):
|
17 |
+
ax.hist(df[columns[i]],bins="auto")
|
18 |
+
ax.set_xlabel(f"{columns[i]}")
|
19 |
+
|
20 |
+
plt.subplots_adjust(hspace=0.2, wspace=0.6)
|
21 |
+
|
22 |
+
return plt.show()
|
23 |
+
|
24 |
+
def boxplot(df:pd.DataFrame,nrows:int,ncols:int,figsize:tuple,columns:list):
|
25 |
+
fig,axes = plt.subplots(nrows=nrows,ncols=ncols,figsize=figsize)
|
26 |
+
axes = axes.flatten()
|
27 |
+
for i,ax in enumerate(axes):
|
28 |
+
ax.boxplot(df[columns[i]])
|
29 |
+
ax.set_xlabel(f'{columns[i]}')
|
30 |
+
|
31 |
+
plt.subplots_adjust(hspace=0.2, wspace=0.6)
|
32 |
+
return plt.show()
|
33 |
+
|
34 |
+
|
35 |
+
|
36 |
+
def box_cox_transformation(df: pd.DataFrame,negative_or_zero_column,columns:list) -> pd.DataFrame:
|
37 |
+
min_value = negative_or_zero_column.min()
|
38 |
+
constant = 1e-5
|
39 |
+
# print(1+constant)
|
40 |
+
negative_or_zero_column = pd.Series(negative_or_zero_column + abs(min_value) + constant)
|
41 |
+
|
42 |
+
box_cox_data = {}
|
43 |
+
lambda_values = {}
|
44 |
+
|
45 |
+
box_cox_data["longitude"],lambda_values["longitude"] = \
|
46 |
+
boxcox(negative_or_zero_column)
|
47 |
+
|
48 |
+
for column in columns:
|
49 |
+
if column =="longitude":
|
50 |
+
continue
|
51 |
+
box_cox_data[column],lambda_values[column] = boxcox(df[column])
|
52 |
+
|
53 |
+
transformed_df = pd.DataFrame(box_cox_data)
|
54 |
+
return (transformed_df,lambda_values)
|
55 |
+
|
56 |
+
|
57 |
+
|
58 |
+
def inverse_box_cox(transformed_data,lambda_value):
|
59 |
+
|
60 |
+
return inv_boxcox(transformed_data,lambda_value)
|
61 |
+
|
62 |
+
|
63 |
+
def single_value_boxcox(datas:list,lambda_values:list) -> list:
|
64 |
+
transformed_values = list()
|
65 |
+
for data,lambda_value in zip(datas,lambda_values):
|
66 |
+
transformed_values.append(boxcox(data,lmbda=lambda_value))
|
67 |
+
return transformed_values
|
68 |
+
|
69 |
+
|
70 |
+
|
71 |
+
|
72 |
+
|
73 |
+
# z_scores = zscore(new_df["totalRooms"])
|
74 |
+
# outliers = (z_scores > 3) | (z_scores < -3)
|
75 |
+
# print(f"Total outliers for totalRooms : {outliers.sum()}")
|
76 |
+
|
77 |
+
# outlier_indices = outliers[outliers==True].index.values
|
78 |
+
|
79 |
+
def outlier_detection(df:pd.DataFrame,columns:list) -> pd.DataFrame:
|
80 |
+
outliers_dict = {}
|
81 |
+
for column in columns:
|
82 |
+
z_scores = zscore(df[column])
|
83 |
+
outliers = (z_scores > 3) | (z_scores < -3)
|
84 |
+
#print(f"Total outliers for {column} : {outliers.sum()}")
|
85 |
+
if not outliers.sum():
|
86 |
+
continue
|
87 |
+
outliers_dict[column] = list(outliers[outliers==True].index.values)
|
88 |
+
indices = np.array([value for sublist in outliers_dict.values() for value in sublist])
|
89 |
+
unique_elements, counts = np.unique(indices,return_counts=True)
|
90 |
+
new_df = df.drop(unique_elements)
|
91 |
+
return new_df
|
92 |
+
# print(df1["medianHouseValue"].mean(),df1["medianHouseValue"].std())
|
93 |
+
# print(df1.shape)
|
94 |
+
# df1.head()
|
95 |
+
# df1.to_csv("cal_housing_tuned.csv",index=False)
|
src/ranfor_model.joblib
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:fea821fcbcc1a15ca676ec3f687e389a09537f87b5d0d2890d63caab2170c965
|
3 |
+
size 144670081
|
src/scaler.joblib
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:bde7795e728ea3533e4db5e803a3b2c212f0f66b00814dfd1d901a8b4e81474f
|
3 |
+
size 807
|