Yair fixed categorical, ID, temporal cols and pre-processing [no more errors for XGBoost]
Browse files - data_loader.py (+44 -11)
data_loader.py
CHANGED
@@ -11,14 +11,15 @@ from imblearn.over_sampling import SMOTE
|
|
11 |
# CONFIGURATION
|
12 |
# ===========================
|
13 |
|
14 |
-
TRAIN_PATH = "
|
15 |
-
|
|
|
16 |
|
17 |
-
CATEGORICAL_COLUMNS = ["gender", "product",
|
|
|
18 |
TARGET_COLUMN = "is_click"
|
19 |
-
|
20 |
FEATURE_COLUMNS = [
|
21 |
-
"age_level", "gender", "product",
|
22 |
"product_category_1", "product_category_2", "user_group_id",
|
23 |
"user_depth", "city_development_index", "var_1"
|
24 |
]
|
@@ -30,6 +31,7 @@ AGGREGATED_COLUMNS = [
|
|
30 |
"unique_campaigns_city_age_prod", "unique_webpages_city_age_prod"
|
31 |
]
|
32 |
|
|
|
33 |
# ===========================
|
34 |
# LOAD DATASETS
|
35 |
# ===========================
|
@@ -37,8 +39,37 @@ AGGREGATED_COLUMNS = [
|
|
37 |
def load_data(train_path=TRAIN_PATH, test_path=TEST_PATH):
|
38 |
"""Load train & test datasets, handling missing values."""
|
39 |
train_df = pd.read_csv(train_path)
|
|
|
|
|
|
|
|
|
40 |
test_df = pd.read_csv(test_path)
|
41 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
42 |
# Fill missing values
|
43 |
train_df.fillna(-1, inplace=True)
|
44 |
test_df.fillna(-1, inplace=True)
|
@@ -108,11 +139,12 @@ def preprocess_data(df, test_df, categorical_columns):
|
|
108 |
|
109 |
numerical_columns = [col for col in FEATURE_COLUMNS + AGGREGATED_COLUMNS if col not in categorical_columns]
|
110 |
|
111 |
-
scaler = StandardScaler()
|
112 |
-
df[numerical_columns] = scaler.fit_transform(df[numerical_columns])
|
113 |
-
test_df[numerical_columns] = scaler.transform(test_df[numerical_columns])
|
|
|
114 |
|
115 |
-
return df, test_df, label_encoders
|
116 |
|
117 |
|
118 |
# ===========================
|
@@ -122,7 +154,7 @@ def preprocess_data(df, test_df, categorical_columns):
|
|
122 |
def split_and_balance_data(df, target_column):
|
123 |
"""Splits data into training and validation sets, applies SMOTE to balance classes."""
|
124 |
|
125 |
-
X = df[FEATURE_COLUMNS + AGGREGATED_COLUMNS]
|
126 |
y = df[target_column]
|
127 |
|
128 |
# Handle class imbalance using SMOTE
|
@@ -172,7 +204,7 @@ def load_and_process_data():
|
|
172 |
|
173 |
df, test_df = load_data()
|
174 |
df, test_df = add_aggregated_features(df, test_df)
|
175 |
-
df, test_df, label_encoders
|
176 |
X_train, X_val, y_train, y_val = split_and_balance_data(df, TARGET_COLUMN)
|
177 |
|
178 |
return X_train, X_val, y_train, y_val, test_df
|
@@ -182,3 +214,4 @@ if __name__ == "__main__":
|
|
182 |
print("🔹 Loading and processing data...")
|
183 |
X_train, X_val, y_train, y_val, test_df = load_and_process_data()
|
184 |
print("✅ Data successfully loaded and processed!")
|
|
|
|
11 |
# CONFIGURATION
|
12 |
# ===========================
|
13 |
|
14 |
+
TRAIN_PATH = "data/train_dataset_full - train_dataset_full.csv"
|
15 |
+
# TRAIN_PATH = "data/train_dataset_full - train_dataset_partial_for_testing.csv"
|
16 |
+
TEST_PATH = "data/X_test_1st.csv" # Replace with actual test dataset path
|
17 |
|
18 |
+
CATEGORICAL_COLUMNS = ["gender", "product",]
|
19 |
+
IDS_COLUMNS = [ "user_id", "session_id", "campaign_id", "webpage_id"]
|
20 |
TARGET_COLUMN = "is_click"
|
|
|
21 |
FEATURE_COLUMNS = [
|
22 |
+
"age_level", "gender", "product",
|
23 |
"product_category_1", "product_category_2", "user_group_id",
|
24 |
"user_depth", "city_development_index", "var_1"
|
25 |
]
|
|
|
31 |
"unique_campaigns_city_age_prod", "unique_webpages_city_age_prod"
|
32 |
]
|
33 |
|
34 |
+
TEMPORAL_COLUMNS = ["year", "month", "day", "hour", "minute", "weekday"]
|
35 |
# ===========================
|
36 |
# LOAD DATASETS
|
37 |
# ===========================
|
|
|
39 |
def load_data(train_path=TRAIN_PATH, test_path=TEST_PATH):
|
40 |
"""Load train & test datasets, handling missing values."""
|
41 |
train_df = pd.read_csv(train_path)
|
42 |
+
y_train = train_df[TARGET_COLUMN]
|
43 |
+
train_df = train_df[~y_train.isnull()]
|
44 |
+
|
45 |
+
|
46 |
test_df = pd.read_csv(test_path)
|
47 |
|
48 |
+
train_df["DateTime"] = pd.to_datetime(train_df["DateTime"])
|
49 |
+
test_df["DateTime"] = pd.to_datetime(test_df["DateTime"])
|
50 |
+
train_df["DateTime"].fillna(train_df["DateTime"].mode()[0], inplace=True)
|
51 |
+
test_df["DateTime"].fillna(test_df["DateTime"].mode()[0], inplace=True)
|
52 |
+
|
53 |
+
if "DateTime" in train_df.columns:
|
54 |
+
train_df["DateTime"] = pd.to_datetime(train_df["DateTime"])
|
55 |
+
train_df["year"] = train_df["DateTime"].dt.year
|
56 |
+
train_df["month"] = train_df["DateTime"].dt.month
|
57 |
+
train_df["day"] = train_df["DateTime"].dt.day
|
58 |
+
train_df["hour"] = train_df["DateTime"].dt.hour
|
59 |
+
train_df["minute"] = train_df["DateTime"].dt.minute
|
60 |
+
train_df["weekday"] = train_df["DateTime"].dt.weekday
|
61 |
+
train_df.drop("DateTime", axis=1, inplace=True)
|
62 |
+
|
63 |
+
if "DateTime" in test_df.columns:
|
64 |
+
test_df["DateTime"] = pd.to_datetime(test_df["DateTime"])
|
65 |
+
test_df["year"] = test_df["DateTime"].dt.year
|
66 |
+
test_df["month"] = test_df["DateTime"].dt.month
|
67 |
+
test_df["day"] = test_df["DateTime"].dt.day
|
68 |
+
test_df["hour"] = test_df["DateTime"].dt.hour
|
69 |
+
test_df["minute"] = test_df["DateTime"].dt.minute
|
70 |
+
test_df["weekday"] = test_df["DateTime"].dt.weekday
|
71 |
+
test_df.drop("DateTime", axis=1, inplace=True)
|
72 |
+
|
73 |
# Fill missing values
|
74 |
train_df.fillna(-1, inplace=True)
|
75 |
test_df.fillna(-1, inplace=True)
|
|
|
139 |
|
140 |
numerical_columns = [col for col in FEATURE_COLUMNS + AGGREGATED_COLUMNS if col not in categorical_columns]
|
141 |
|
142 |
+
# scaler = StandardScaler()
|
143 |
+
# df[numerical_columns] = scaler.fit_transform(df[numerical_columns])
|
144 |
+
# test_df[numerical_columns] = scaler.transform(test_df[numerical_columns])
|
145 |
+
|
146 |
|
147 |
+
return df, test_df, label_encoders,# scaler
|
148 |
|
149 |
|
150 |
# ===========================
|
|
|
154 |
def split_and_balance_data(df, target_column):
|
155 |
"""Splits data into training and validation sets, applies SMOTE to balance classes."""
|
156 |
|
157 |
+
X = df[IDS_COLUMNS + FEATURE_COLUMNS + AGGREGATED_COLUMNS + TEMPORAL_COLUMNS]
|
158 |
y = df[target_column]
|
159 |
|
160 |
# Handle class imbalance using SMOTE
|
|
|
204 |
|
205 |
df, test_df = load_data()
|
206 |
df, test_df = add_aggregated_features(df, test_df)
|
207 |
+
df, test_df, label_encoders = preprocess_data(df, test_df, CATEGORICAL_COLUMNS)
|
208 |
X_train, X_val, y_train, y_val = split_and_balance_data(df, TARGET_COLUMN)
|
209 |
|
210 |
return X_train, X_val, y_train, y_val, test_df
|
|
|
214 |
print("🔹 Loading and processing data...")
|
215 |
X_train, X_val, y_train, y_val, test_df = load_and_process_data()
|
216 |
print("✅ Data successfully loaded and processed!")
|
217 |
+
|