pujanpaudel committed on
Commit 468c485 · verified · 1 Parent(s): 5bec606

Upload 5 files

src/model.py ADDED
@@ -0,0 +1,99 @@
+ import sys
+ sys.path.append('..')
+
+ import pandas as pd
+ import os
+ import random
+ import joblib
+ from time import perf_counter
+
+ from src.preprocessing import outlier_detection, box_cox_transformation
+ from sklearn.linear_model import LinearRegression
+ from sklearn.svm import LinearSVR
+ from sklearn.ensemble import RandomForestRegressor
+ from sklearn.ensemble import GradientBoostingRegressor
+
+ from sklearn.metrics import r2_score, mean_squared_error
+ from sklearn.model_selection import train_test_split, KFold, cross_val_score
+ from sklearn.preprocessing import StandardScaler
+
+ current_script_dir = os.path.dirname(__file__)
+ csv_path = os.path.join(current_script_dir, "../data/cal_housing.csv")
+
+ df = pd.read_csv(csv_path)
+ columns = list(df.columns)
+
+ new_df, lambda_values = box_cox_transformation(df, df["longitude"], columns=columns)
+ df_tuned = outlier_detection(new_df, columns)
+
+ # print(new_df.head())
+ def main():
+
+     X = df_tuned.drop("medianHouseValue", axis=1).values
+     y = df_tuned["medianHouseValue"].values
+
+     X_train, X_test, y_train, y_test = train_test_split(
+         X, y, test_size=0.2, random_state=44)
+
+     scaler = StandardScaler()
+     scaler.fit(X_train)
+     X_train_scaled = scaler.transform(X_train)
+     X_test_scaled = scaler.transform(X_test)
+
+     LinReg = LinearRegression()
+     svm = LinearSVR(max_iter=10000, C=11, random_state=42)
+     ranfor = RandomForestRegressor(n_estimators=102, random_state=42)
+     gradboost = GradientBoostingRegressor(n_estimators=90, learning_rate=0.2, max_depth=3, random_state=42)
+
+     models = [LinReg, svm, ranfor, gradboost]
+
+     for model in models:
+         model.fit(X_train_scaled, y_train)
+         y_pred = model.predict(X_test_scaled)
+
+         mse = mean_squared_error(y_test, y_pred)
+         r2 = r2_score(y_test, y_pred)
+         print(f"Performance of {model}:")
+         print(f"Mean Squared Error: {mse}")
+         print(f"R-squared: {r2}\n")
+
+     # scalers = StandardScaler()
+     # X_scaled = scalers.fit_transform(X)
+
+     # ran = RandomForestRegressor(n_estimators=102, random_state=42)
+
+     # kf = KFold(n_splits=5, shuffle=True, random_state=42)
+
+     # # Performing k-fold cross-validation
+     # mse_scores = cross_val_score(ran, X_scaled, y, cv=kf, scoring='neg_mean_squared_error')
+     # r2_scores = cross_val_score(ran, X_scaled, y, cv=kf, scoring='r2')
+
+     # mse_scores = -mse_scores
+
+     # for fold, (mse, r2) in enumerate(zip(mse_scores, r2_scores), 1):
+     #     print(f"Fold {fold}:")
+     #     print(f"Mean Squared Error: {mse}")
+     #     print(f"R-squared: {r2}\n")
+
+     # print("Average Performance Across Folds:")
+     # print(f"Mean Squared Error: {mse_scores.mean()}")
+     # print(f"R-squared: {r2_scores.mean()}")
+
+     model_filename = "ranfor_model.joblib"
+     scaler_filename = "scaler.joblib"
+
+     # saving the model and the scaler object
+     joblib.dump(ranfor, model_filename)
+     joblib.dump(scaler, scaler_filename)
+
+     print(f"Random forest model saved to {model_filename}")
+     print(f"Scaler saved to {scaler_filename}")
+
+ if __name__ == "__main__":
+     main()
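Note: src/predict.py below imports lambda_values from this module, so the Box-Cox preprocessing above is re-run at import time. A minimal alternative sketch, assuming the fitted lambdas may also be persisted with joblib (the lambda_values.joblib file name is illustrative and not part of this commit):

    # hypothetical addition to main(): persist the fitted Box-Cox lambdas next to the model
    lambda_filename = "lambda_values.joblib"  # assumed file name, not in this commit
    joblib.dump(lambda_values, lambda_filename)

    # downstream code (e.g. predict.py) could then load them instead of importing model.py:
    # lambda_values = joblib.load("lambda_values.joblib")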
src/predict.py ADDED
@@ -0,0 +1,72 @@
+ import os
+ import sys
+ sys.path.append('..')
+
+ import joblib
+ from sklearn.preprocessing import StandardScaler
+ import pandas as pd
+ import numpy as np
+ from scipy.special import inv_boxcox
+
+ from src.model import lambda_values
+ from src.preprocessing import boxcox
+
+ current_script_dir = os.path.dirname(__file__)
+
+ model_path = os.path.join(current_script_dir, "ranfor_model.joblib")
+ scaler_path = os.path.join(current_script_dir, "scaler.joblib")
+
+ def load_model():
+     model = joblib.load(model_path)
+     return model
+
+ def load_std_scaler():
+     scaler = joblib.load(scaler_path)
+     return scaler
+
+
+ constant = 1e-5
+ min_value_longitude = -124.35
+
+ columns = ["longitude", "latitude", "housingMedianAge", "totalRooms", "totalBedrooms", "population", "households", "medianIncome"]
+
+ lambda_value_list = [value for value in lambda_values.values()]
+
+
+ def make_input(input_data: list, lambda_value_list: list, scaler) -> np.ndarray:
+
+     input_data[0] += abs(min_value_longitude) + constant
+     transformed_data = []
+
+     # input_data has 8 elements, so the last element of lambda_value_list
+     # (the lambda for medianHouseValue) is ignored by zip in the loop below
+
+     for data, lambda_value in zip(input_data, lambda_value_list):
+         transformed_data.append(boxcox(data, lmbda=lambda_value))
+
+     transformed_data = np.array(transformed_data).reshape(1, -1)
+     transformed_data_scaled = scaler.transform(transformed_data)
+
+     return transformed_data_scaled
+
+ def make_prediction(input_data, model) -> np.ndarray:
+
+     pred_value_transformed = model.predict(input_data)
+     pred_value = inv_boxcox(pred_value_transformed, lambda_value_list[-1])
+     return pred_value
+
+
+ if __name__ == "__main__":
+     actual_output = 52900
+     new_data = [-1.1906e+02, 3.6150e+01, 2.5000e+01, 2.4020e+03, 4.7800e+02,
+                 1.5270e+03, 4.6100e+02, 2.3194e+00]
+     scaler = load_std_scaler()
+
+     transformed_data = make_input(new_data, lambda_value_list=lambda_value_list, scaler=scaler)
+
+     model = load_model()
+
+     pred_value = make_prediction(transformed_data, model=model)
+     print(pred_value)
+     print(actual_output)
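For reuse from other code, the loading and transformation steps in the __main__ block might be collapsed into one helper. A minimal sketch under the same assumptions as this script (the predict_house_value name is illustrative, not part of this commit):

    def predict_house_value(features: list) -> float:
        # features: the 8 raw inputs, in the same order as `columns` above
        scaler = load_std_scaler()
        model = load_model()
        # copy the list because make_input shifts the longitude value in place
        X = make_input(list(features), lambda_value_list=lambda_value_list, scaler=scaler)
        return float(make_prediction(X, model=model)[0])

    # example: predict_house_value([-119.06, 36.15, 25.0, 2402.0, 478.0, 1527.0, 461.0, 2.3194])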
src/preprocessing.py ADDED
@@ -0,0 +1,95 @@
+ import pandas as pd
+ import matplotlib.pyplot as plt
+ import numpy as np
+ from scipy.stats import boxcox, zscore
+ from scipy.special import inv_boxcox
+ import os
+
+ # checking for null values
+ # df.isna().sum()
+ # df.info()
+
+ def histogram(df: pd.DataFrame, nrows: int, ncols: int, figsize: tuple, columns: list):
+     fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=figsize)
+     axes = axes.flatten()
+     for i, ax in enumerate(axes):
+         ax.hist(df[columns[i]], bins="auto")
+         ax.set_xlabel(f"{columns[i]}")
+
+     plt.subplots_adjust(hspace=0.2, wspace=0.6)
+
+     return plt.show()
+
+ def boxplot(df: pd.DataFrame, nrows: int, ncols: int, figsize: tuple, columns: list):
+     fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=figsize)
+     axes = axes.flatten()
+     for i, ax in enumerate(axes):
+         ax.boxplot(df[columns[i]])
+         ax.set_xlabel(f'{columns[i]}')
+
+     plt.subplots_adjust(hspace=0.2, wspace=0.6)
+     return plt.show()
+
+
+ def box_cox_transformation(df: pd.DataFrame, negative_or_zero_column, columns: list) -> tuple:
+     min_value = negative_or_zero_column.min()
+     constant = 1e-5
+     # shift the column so every value is strictly positive before Box-Cox
+     negative_or_zero_column = pd.Series(negative_or_zero_column + abs(min_value) + constant)
+
+     box_cox_data = {}
+     lambda_values = {}
+
+     box_cox_data["longitude"], lambda_values["longitude"] = \
+         boxcox(negative_or_zero_column)
+
+     for column in columns:
+         if column == "longitude":
+             continue
+         box_cox_data[column], lambda_values[column] = boxcox(df[column])
+
+     transformed_df = pd.DataFrame(box_cox_data)
+     return (transformed_df, lambda_values)
+
+
+ def inverse_box_cox(transformed_data, lambda_value):
+     return inv_boxcox(transformed_data, lambda_value)
+
+
+ def single_value_boxcox(datas: list, lambda_values: list) -> list:
+     transformed_values = list()
+     for data, lambda_value in zip(datas, lambda_values):
+         transformed_values.append(boxcox(data, lmbda=lambda_value))
+     return transformed_values
+
+
+ # z_scores = zscore(new_df["totalRooms"])
+ # outliers = (z_scores > 3) | (z_scores < -3)
+ # print(f"Total outliers for totalRooms : {outliers.sum()}")
+
+ # outlier_indices = outliers[outliers==True].index.values
+
+ def outlier_detection(df: pd.DataFrame, columns: list) -> pd.DataFrame:
+     outliers_dict = {}
+     for column in columns:
+         z_scores = zscore(df[column])
+         outliers = (z_scores > 3) | (z_scores < -3)
+         # print(f"Total outliers for {column} : {outliers.sum()}")
+         if not outliers.sum():
+             continue
+         outliers_dict[column] = list(outliers[outliers==True].index.values)
+     indices = np.array([value for sublist in outliers_dict.values() for value in sublist])
+     unique_elements, counts = np.unique(indices, return_counts=True)
+     new_df = df.drop(unique_elements)
+     return new_df
+
+ # print(df1["medianHouseValue"].mean(), df1["medianHouseValue"].std())
+ # print(df1.shape)
+ # df1.head()
+ # df1.to_csv("cal_housing_tuned.csv", index=False)
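A minimal usage sketch of these helpers, mirroring what src/model.py does at import time (the dataset path is assumed to follow the repo layout):

    import pandas as pd
    from src.preprocessing import box_cox_transformation, outlier_detection

    df = pd.read_csv("data/cal_housing.csv")  # assumed dataset location
    columns = list(df.columns)

    # shift longitude (it is negative), Box-Cox every column, then drop z-score outliers
    transformed_df, lambda_values = box_cox_transformation(df, df["longitude"], columns=columns)
    cleaned_df = outlier_detection(transformed_df, columns)
    print(cleaned_df.shape)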
src/ranfor_model.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fea821fcbcc1a15ca676ec3f687e389a09537f87b5d0d2890d63caab2170c965
+ size 144670081
src/scaler.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bde7795e728ea3533e4db5e803a3b2c212f0f66b00814dfd1d901a8b4e81474f
+ size 807