Reyne committed on
Commit
bd52ea7
·
1 Parent(s): fcf9651

Upload log_regress.py

Browse files
Files changed (1) hide show
  1. log_regress.py +315 -0
log_regress.py ADDED
@@ -0,0 +1,315 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Creators: Diego Medeiros and Reyne Jasson
3
+ Creates a pipeline for building a logistic regression model
4
+ and studies how the coronavirus changed student success
5
+ in school.
6
+ """
7
+
8
+
9
+
10
+ from sklearn.model_selection import train_test_split
11
+
12
+ from sklearn.linear_model import LogisticRegression
13
+
14
+ from sklearn.preprocessing import LabelEncoder
15
+ from sklearn.preprocessing import OneHotEncoder
16
+ from sklearn.preprocessing import MinMaxScaler
17
+ from sklearn.preprocessing import StandardScaler
18
+
19
+ from sklearn.metrics import confusion_matrix
20
+
21
+ from sklearn.pipeline import Pipeline, FeatureUnion
22
+
23
+
24
+ from sklearn.neighbors import LocalOutlierFactor
25
+ from sklearn.base import BaseEstimator, TransformerMixin
26
+
27
+ import matplotlib.pyplot as plt
28
+ import seaborn as sns
29
+
30
+ from sklearn.impute import SimpleImputer
31
+ import pandas as pd
32
+ import numpy as np
33
+
34
+ from joblib import dump
35
+
36
+ import argparse
37
+
38
+ from clearml import Task
39
+
40
+
41
+ #Custom Transformer that extracts columns passed as argument to its constructor
42
class FeatureSelector(BaseEstimator, TransformerMixin):
    """Transformer that keeps only the columns named at construction time.

    Parameters
    ----------
    feature_names
        Key (typically a list of column labels) used to index the input
        in ``transform``.
    """

    def __init__(self, feature_names):
        self.feature_names = feature_names

    def fit(self, X, y=None):
        """Stateless: there is nothing to learn, so just return ``self``."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` indexed by the configured ``feature_names``."""
        selected = X[self.feature_names]
        return selected
54
+
55
+
56
class CategoricalTransformer(BaseEstimator, TransformerMixin):
    """Label-encode the ``grau_academico`` column of a categorical frame.

    Parameters
    ----------
    new_features : bool, default=True
        Stored on the instance; the visible code never reads it.
        NOTE(review): presumably meant to toggle the encoding - confirm.
    colnames : sequence of str, optional
        Column labels passed to ``pd.DataFrame`` when wrapping the input
        of ``transform``.
    """

    def __init__(self, new_features=True, colnames=None):
        self.new_features = new_features
        self.colnames = colnames

    def fit(self, X, y=None):
        """Stateless: nothing is learned from the data."""
        return self

    def get_feature_names(self):
        """Return the configured column names as a list (``None`` if unset).

        Accepts both array-likes exposing ``tolist`` (pandas Index, numpy
        array) and plain Python sequences such as lists.
        """
        if self.colnames is None:
            return None
        to_list = getattr(self.colnames, "tolist", None)
        return to_list() if to_list is not None else list(self.colnames)

    def transform(self, X, y=None):
        """Wrap ``X`` in a DataFrame and map ``grau_academico`` to codes.

        The degree labels are replaced by the string codes '3', '2', '1'
        and '0'; values not listed in the mapping are left untouched.
        """
        df = pd.DataFrame(X, columns=self.colnames)

        # Assign the result back instead of using ``inplace=True`` on a
        # column selection: the in-place form operates on a temporary and
        # is not guaranteed to update ``df`` under pandas Copy-on-Write.
        df['grau_academico'] = df['grau_academico'].replace(
            {'BACHARELADO': '3', 'LICENCIATURA': '2',
             'TECNOLÓGICO': '1', "OUTRO": "0"}
        )

        print(df.head())
        return df
83
+
84
class NumericalTransformer(BaseEstimator, TransformerMixin):
    """Coerce columns to numbers, derive ``idade`` and optionally scale.

    Parameters
    ----------
    model : int, default=0
        Scaling strategy: ``0`` uses ``MinMaxScaler``, ``1`` uses
        ``StandardScaler``, any other value disables scaling.
    colnames : sequence of str, optional
        Column labels passed to ``pd.DataFrame`` when wrapping the input.
        The resulting frame must contain an ``ano_nascimento`` column.
    reference_year : int, default=2020
        Year from which ``ano_nascimento`` is subtracted to build the
        ``idade`` column.

    Attributes
    ----------
    scaler_
        Scaler fitted in ``fit`` (``None`` when scaling is disabled).
    """

    def __init__(self, model=0, colnames=None, reference_year=2020):
        self.model = model
        self.colnames = colnames
        self.reference_year = reference_year

    def _make_scaler(self):
        """Return a fresh, unfitted scaler for ``self.model`` (or ``None``)."""
        if self.model == 0:
            return MinMaxScaler()
        if self.model == 1:
            return StandardScaler()
        return None

    def _prepare(self, X):
        """Build the numeric frame (coerced, NaN -> 0) plus the ``idade`` column."""
        df = pd.DataFrame(X, columns=self.colnames)
        for coluna in df.columns:
            # Non-numeric entries become NaN and are zero-filled below.
            df[coluna] = pd.to_numeric(df[coluna], errors='coerce')
        df = df.fillna(value=0)
        df['idade'] = self.reference_year - df['ano_nascimento'].astype(int)
        return df

    def fit(self, X, y=None):
        """Learn the scaling statistics from the training data.

        Fitting here (rather than inside ``transform``) ensures data seen
        at prediction time is scaled with the training statistics instead
        of its own.
        """
        scaler = self._make_scaler()
        if scaler is not None:
            scaler.fit(self._prepare(X))
        self.scaler_ = scaler
        return self

    def get_feature_names(self):
        """Return the output column names, including the derived ``idade``.

        Returns ``None`` when ``colnames`` was not provided.
        """
        if self.colnames is None:
            return None
        names = list(self.colnames)
        if 'idade' not in names:
            names.append('idade')
        return names

    def transform(self, X, y=None):
        """Return the prepared (and, if configured, scaled) data as an array."""
        df = self._prepare(X)

        if self.model not in (0, 1):
            return df.values

        scaler = getattr(self, "scaler_", None)
        if scaler is None:
            # Legacy path: ``transform`` called without a prior ``fit``
            # (or ``model`` changed after fitting); fit on the data at hand
            # as the previous implementation always did.
            scaler = self._make_scaler().fit(df)
        return scaler.transform(df)
127
+
128
def _make_one_hot_encoder():
    """Build a dense, drop-first OneHotEncoder across scikit-learn versions."""
    try:
        return OneHotEncoder(sparse_output=False, drop="first")
    except TypeError:
        # Older scikit-learn releases only know the ``sparse`` keyword.
        return OneHotEncoder(sparse=False, drop="first")


def process_args(ARGS: argparse.Namespace, task: Task):
    """Load the preprocessed dataset, train the pipeline and return it.

    Parameters
    ----------
    ARGS : argparse.Namespace
        Parsed arguments; reads ``task_id``, ``dataset_name`` and
        ``stratify`` (truthy value enables a stratified split).
    task : Task
        The current ClearML task. Kept for interface compatibility; it is
        not used by the training logic.

    Returns
    -------
    tuple
        ``(pipe, x_val, y_val)``: the fitted preprocessing + logistic
        regression pipeline, the validation features and the encoded
        validation target.
    """
    # Fetch a local copy of the dataset artifact from the producing task.
    preprocessed_data_task = Task.get_task(task_id=ARGS.task_id)
    local_csv = preprocessed_data_task.artifacts[ARGS.dataset_name].get_local_copy()

    # Everything is read as object; numeric columns are converted below.
    data = pd.read_csv(local_csv, encoding='utf-8', sep=',', dtype=object)

    # Column results are assigned back rather than modified with
    # ``inplace=True`` on a selection, which is not guaranteed to update
    # ``data`` under pandas Copy-on-Write.
    # Explicit int64 keeps the ``select_dtypes("int64")`` calls below
    # working on platforms where plain ``int`` maps to a 32-bit dtype.
    data['ano_nascimento'] = data['ano_nascimento'].fillna('2000').astype('int64')
    data['renda'] = data['renda'].astype('int64')

    print("Spliting data into train/val")

    # Boolean feature: does the student come from Rio Grande do Norte?
    data['local_ou_de_fora'] = data['estado_origem'] == 'Rio Grande do Norte'

    # Missing categories get an explicit "other / not informed" label.
    data['raca'] = data['raca'].fillna('Não Informado')
    data['area_conhecimento'] = data['area_conhecimento'].fillna('Outra')
    data['grau_academico'] = data['grau_academico'].fillna('OUTRO')

    data = data.drop(columns=['estado_origem', 'cidade_origem'])

    # Binary target: approved -> 1, failed -> 0. Labels outside the mapping
    # are coerced to NaN and counted as 0. The fill value is numeric so the
    # column keeps a single dtype (a string fill would leave mixed types
    # that LabelEncoder cannot handle).
    data['descricao'] = data['descricao'].replace(
        {'APROVADO': '1', "FALHOU": "0", "REPROVADO POR NOTA E FALTA": "0"}
    )
    data['descricao'] = (
        pd.to_numeric(data['descricao'], errors='coerce').fillna(0).astype('int64')
    )

    print(data.dtypes)

    # Split out train and validation sets.
    x_train, x_val, y_train, y_val = train_test_split(
        data.drop(columns=['descricao']),
        data['descricao'],
        test_size=0.2,
        random_state=2,
        shuffle=True,
        stratify=data['descricao'] if ARGS.stratify else None,
    )

    print("x train: {}".format(x_train.shape))
    print("y train: {}".format(y_train.shape))
    print("x val: {}".format(x_val.shape))
    print("y val: {}".format(y_val.shape))
    print("x train: {}".format(list(x_train.columns)))

    print("Removal Outliers")
    # Outliers are detected on the integer columns of the training set only.
    x = x_train.select_dtypes("int64").copy()
    lof = LocalOutlierFactor()
    outlier = lof.fit_predict(x)
    mask = (outlier != -1)

    print("x_train shape [original]: {}".format(x_train.shape))
    print("x_train shape [outlier removal]: {}".format(x_train.loc[mask, :].shape))

    x_train = x_train.loc[mask, :].copy()
    y_train = y_train[mask].copy()

    print("Encoding Target Variable")
    le = LabelEncoder()
    # Fit on the training target only, then reuse the mapping for validation.
    y_train = le.fit_transform(y_train)
    y_val = le.transform(y_val)
    print(y_train)
    # ``classes_`` equals ``inverse_transform([0, 1])`` for a binary target
    # but does not raise when only one class is present.
    print("Classes [0, 1]: {}".format(le.classes_))

    print("Pipeline generation")

    # Columns routed to the categorical and numerical sub-pipelines.
    categorical_features = x_train.select_dtypes(["object", 'bool']).columns.to_list()
    numerical_features = x_train.select_dtypes("int64").columns.to_list()

    categorical_pipeline = Pipeline(
        steps=[
            ('cat_selector', FeatureSelector(categorical_features)),
            ('imputer_cat', SimpleImputer(strategy="most_frequent")),
            ('cat_encoder', _make_one_hot_encoder()),
        ]
    )

    numerical_pipeline = Pipeline(
        steps=[
            ('num_selector', FeatureSelector(numerical_features)),
            # Step name kept as in the original so parameter paths do not change.
            ('imputer_cat', SimpleImputer(strategy="median")),
            ('num_transformer', NumericalTransformer(colnames=numerical_features)),
        ]
    )

    # Concatenate both feature sets horizontally.
    full_pipeline_preprocessing = FeatureUnion(
        transformer_list=[
            ('cat_pipeline', categorical_pipeline),
            ('num_pipeline', numerical_pipeline),
        ]
    )

    pipe = Pipeline(
        steps=[
            ('full_pipeline', full_pipeline_preprocessing),
            ("classifier", LogisticRegression()),
        ]
    )

    print("Training{}".format(list(x_train.dtypes)))
    pipe.fit(x_train, y_train)

    print("Infering")
    predict = pipe.predict(x_val)
    print(predict)

    return pipe, x_val, y_val
254
+
255
+
256
+
257
if __name__ == "__main__":
    # Command-line interface; arguments may also be read from a file
    # referenced as ``@file``.
    parser = argparse.ArgumentParser(
        description="The training script",
        fromfile_prefix_chars="@",
    )

    parser.add_argument(
        "--model_export",
        type=str,
        help="Fully-qualified artifact name for the exported model to clearML",
        default='regressao_logistica.joblib',
    )

    parser.add_argument(
        "--dataset_name",
        type=str,
        default='processed_data',
        help="The dataset name to generate model",
    )

    parser.add_argument(
        "--task_id",
        type=str,
        help="Task ID where the data was generated",
        default='71845909e9b643fca92e5902c32265a1',
    )

    # The value is only tested for truthiness by process_args, so the help
    # text describes a flag rather than a column name.
    parser.add_argument(
        "--stratify",
        type=int,
        help="Non-zero to stratify the train/validation split on the target column",
        default=None,
    )

    ARGS = parser.parse_args()

    task = Task.init(project_name="a ML example", task_name="logist training")

    # Train the pipeline and get the held-out validation split back.
    clf, x_val, y_val = process_args(ARGS, task)

    y_predict = clf.predict(x_val)

    # ClearML will automatically save anything reported to matplotlib!
    # Row-normalised confusion matrix on the validation split.
    cm = confusion_matrix(y_true=y_val, y_pred=y_predict, normalize='true')
    cmap = sns.diverging_palette(10, 240, as_cmap=True)
    sns.heatmap(cm, cmap=cmap, annot=True)
    plt.show()

    # Persist the fitted pipeline locally, then attach it to the task.
    print(f"Exporting model {ARGS.model_export}")
    dump(clf, ARGS.model_export)
    task.upload_artifact("log_regress_classifier", ARGS.model_export)
315
+