KaiquanMah committed on
Commit
6454725
·
verified ·
1 Parent(s): 64be1b7

Yair fixed categorical, ID, temporal cols and pre-processing [no more errors for XGBoost]

Browse files
Files changed (1) hide show
  1. data_loader.py +44 -11
data_loader.py CHANGED
@@ -11,14 +11,15 @@ from imblearn.over_sampling import SMOTE
11
  # CONFIGURATION
12
  # ===========================
13
 
14
- TRAIN_PATH = "~/Downloads/train_dataset_full - train_dataset_full (1).csv"
15
- TEST_PATH = "~/Downloads/X_test_1st (1).csv" # Replace with actual test dataset path
 
16
 
17
- CATEGORICAL_COLUMNS = ["gender", "product", "campaign_id", "webpage_id"]
 
18
  TARGET_COLUMN = "is_click"
19
-
20
  FEATURE_COLUMNS = [
21
- "age_level", "gender", "product", "campaign_id", "webpage_id",
22
  "product_category_1", "product_category_2", "user_group_id",
23
  "user_depth", "city_development_index", "var_1"
24
  ]
@@ -30,6 +31,7 @@ AGGREGATED_COLUMNS = [
30
  "unique_campaigns_city_age_prod", "unique_webpages_city_age_prod"
31
  ]
32
 
 
33
  # ===========================
34
  # LOAD DATASETS
35
  # ===========================
@@ -37,8 +39,37 @@ AGGREGATED_COLUMNS = [
37
  def load_data(train_path=TRAIN_PATH, test_path=TEST_PATH):
38
  """Load train & test datasets, handling missing values."""
39
  train_df = pd.read_csv(train_path)
 
 
 
 
40
  test_df = pd.read_csv(test_path)
41
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
  # Fill missing values
43
  train_df.fillna(-1, inplace=True)
44
  test_df.fillna(-1, inplace=True)
@@ -108,11 +139,12 @@ def preprocess_data(df, test_df, categorical_columns):
108
 
109
  numerical_columns = [col for col in FEATURE_COLUMNS + AGGREGATED_COLUMNS if col not in categorical_columns]
110
 
111
- scaler = StandardScaler()
112
- df[numerical_columns] = scaler.fit_transform(df[numerical_columns])
113
- test_df[numerical_columns] = scaler.transform(test_df[numerical_columns])
 
114
 
115
- return df, test_df, label_encoders, scaler
116
 
117
 
118
  # ===========================
@@ -122,7 +154,7 @@ def preprocess_data(df, test_df, categorical_columns):
122
  def split_and_balance_data(df, target_column):
123
  """Splits data into training and validation sets, applies SMOTE to balance classes."""
124
 
125
- X = df[FEATURE_COLUMNS + AGGREGATED_COLUMNS]
126
  y = df[target_column]
127
 
128
  # Handle class imbalance using SMOTE
@@ -172,7 +204,7 @@ def load_and_process_data():
172
 
173
  df, test_df = load_data()
174
  df, test_df = add_aggregated_features(df, test_df)
175
- df, test_df, label_encoders, scaler = preprocess_data(df, test_df, CATEGORICAL_COLUMNS)
176
  X_train, X_val, y_train, y_val = split_and_balance_data(df, TARGET_COLUMN)
177
 
178
  return X_train, X_val, y_train, y_val, test_df
@@ -182,3 +214,4 @@ if __name__ == "__main__":
182
  print("🔹 Loading and processing data...")
183
  X_train, X_val, y_train, y_val, test_df = load_and_process_data()
184
  print("✅ Data successfully loaded and processed!")
 
 
11
  # CONFIGURATION
12
  # ===========================
13
 
14
+ TRAIN_PATH = "data/train_dataset_full - train_dataset_full.csv"
15
+ # TRAIN_PATH = "data/train_dataset_full - train_dataset_partial_for_testing.csv"
16
+ TEST_PATH = "data/X_test_1st.csv" # Replace with actual test dataset path
17
 
18
+ CATEGORICAL_COLUMNS = ["gender", "product",]
19
+ IDS_COLUMNS = [ "user_id", "session_id", "campaign_id", "webpage_id"]
20
  TARGET_COLUMN = "is_click"
 
21
  FEATURE_COLUMNS = [
22
+ "age_level", "gender", "product",
23
  "product_category_1", "product_category_2", "user_group_id",
24
  "user_depth", "city_development_index", "var_1"
25
  ]
 
31
  "unique_campaigns_city_age_prod", "unique_webpages_city_age_prod"
32
  ]
33
 
34
+ TEMPORAL_COLUMNS = ["year", "month", "day", "hour", "minute", "weekday"]
35
  # ===========================
36
  # LOAD DATASETS
37
  # ===========================
 
39
  def load_data(train_path=TRAIN_PATH, test_path=TEST_PATH):
40
  """Load train & test datasets, handling missing values."""
41
  train_df = pd.read_csv(train_path)
42
+ y_train = train_df[TARGET_COLUMN]
43
+ train_df = train_df[~y_train.isnull()]
44
+
45
+
46
  test_df = pd.read_csv(test_path)
47
 
48
+ train_df["DateTime"] = pd.to_datetime(train_df["DateTime"])
49
+ test_df["DateTime"] = pd.to_datetime(test_df["DateTime"])
50
+ train_df["DateTime"].fillna(train_df["DateTime"].mode()[0], inplace=True)
51
+ test_df["DateTime"].fillna(test_df["DateTime"].mode()[0], inplace=True)
52
+
53
+ if "DateTime" in train_df.columns:
54
+ train_df["DateTime"] = pd.to_datetime(train_df["DateTime"])
55
+ train_df["year"] = train_df["DateTime"].dt.year
56
+ train_df["month"] = train_df["DateTime"].dt.month
57
+ train_df["day"] = train_df["DateTime"].dt.day
58
+ train_df["hour"] = train_df["DateTime"].dt.hour
59
+ train_df["minute"] = train_df["DateTime"].dt.minute
60
+ train_df["weekday"] = train_df["DateTime"].dt.weekday
61
+ train_df.drop("DateTime", axis=1, inplace=True)
62
+
63
+ if "DateTime" in test_df.columns:
64
+ test_df["DateTime"] = pd.to_datetime(test_df["DateTime"])
65
+ test_df["year"] = test_df["DateTime"].dt.year
66
+ test_df["month"] = test_df["DateTime"].dt.month
67
+ test_df["day"] = test_df["DateTime"].dt.day
68
+ test_df["hour"] = test_df["DateTime"].dt.hour
69
+ test_df["minute"] = test_df["DateTime"].dt.minute
70
+ test_df["weekday"] = test_df["DateTime"].dt.weekday
71
+ test_df.drop("DateTime", axis=1, inplace=True)
72
+
73
  # Fill missing values
74
  train_df.fillna(-1, inplace=True)
75
  test_df.fillna(-1, inplace=True)
 
139
 
140
  numerical_columns = [col for col in FEATURE_COLUMNS + AGGREGATED_COLUMNS if col not in categorical_columns]
141
 
142
+ # scaler = StandardScaler()
143
+ # df[numerical_columns] = scaler.fit_transform(df[numerical_columns])
144
+ # test_df[numerical_columns] = scaler.transform(test_df[numerical_columns])
145
+
146
 
147
+ return df, test_df, label_encoders,# scaler
148
 
149
 
150
  # ===========================
 
154
  def split_and_balance_data(df, target_column):
155
  """Splits data into training and validation sets, applies SMOTE to balance classes."""
156
 
157
+ X = df[IDS_COLUMNS + FEATURE_COLUMNS + AGGREGATED_COLUMNS + TEMPORAL_COLUMNS]
158
  y = df[target_column]
159
 
160
  # Handle class imbalance using SMOTE
 
204
 
205
  df, test_df = load_data()
206
  df, test_df = add_aggregated_features(df, test_df)
207
+ df, test_df, label_encoders = preprocess_data(df, test_df, CATEGORICAL_COLUMNS)
208
  X_train, X_val, y_train, y_val = split_and_balance_data(df, TARGET_COLUMN)
209
 
210
  return X_train, X_val, y_train, y_val, test_df
 
214
  print("🔹 Loading and processing data...")
215
  X_train, X_val, y_train, y_val, test_df = load_and_process_data()
216
  print("✅ Data successfully loaded and processed!")
217
+