|
import pandas as pd |
|
|
|
from src.utils.data_utils import HEADERS |
|
from src.utils.path_utils import get_project_root |
|
|
|
|
|
PROJECT_ROOT = get_project_root() |
|
RAW_DIR = PROJECT_ROOT / "data/raw/factify" |
|
PROCESSED_DIR = PROJECT_ROOT / "data/preprocessed" |
|
IMAGES_DIR = RAW_DIR / "extracted/images" |
|
|
|
|
|
def ensure_directories(): |
|
"""Ensure that necessary directories exist.""" |
|
PROCESSED_DIR.mkdir(parents=True, exist_ok=True) |
|
|
|
|
|
def preprocess_csv(dataset: str): |
|
""" |
|
Preprocess the given dataset CSV (train or test). |
|
|
|
Args: |
|
dataset (str): The dataset name ('train' or 'test'). |
|
""" |
|
|
|
ensure_directories() |
|
|
|
csv_path = RAW_DIR / f"extracted/{dataset}.csv" |
|
processed_csv_path = PROCESSED_DIR / f"{dataset}.csv" |
|
images_folder = IMAGES_DIR / dataset |
|
|
|
if not csv_path.exists(): |
|
print(f"Dataset CSV not found: {csv_path}") |
|
return |
|
|
|
|
|
df = pd.read_csv(csv_path, names=HEADERS, header=None, sep="\t", skiprows=1) |
|
|
|
|
|
def update_image_path(row, column_name): |
|
"""Update the image path if it exists, else leave as None.""" |
|
image_file = row[column_name] |
|
file_id = row["id"] |
|
if column_name == "claim_image_original": |
|
file_path = images_folder / f"{file_id}_claim.jpg" |
|
elif column_name == "evidence_image_original": |
|
file_path = images_folder / f"{file_id}_evidence.jpg" |
|
else: |
|
return None |
|
|
|
|
|
if file_path.exists(): |
|
|
|
return str(file_path.relative_to(PROJECT_ROOT)) |
|
return None |
|
|
|
df.rename( |
|
columns={ |
|
"claim_image": "claim_image_original", |
|
"evidence_image": "evidence_image_original", |
|
}, |
|
inplace=True, |
|
) |
|
df["claim_image"] = df.apply( |
|
lambda row: update_image_path(row, "claim_image_original"), axis=1 |
|
) |
|
df["evidence_image"] = df.apply( |
|
lambda row: update_image_path(row, "evidence_image_original"), axis=1 |
|
) |
|
|
|
|
|
df.to_csv(processed_csv_path, index=False) |
|
print(f"Processed {dataset}.csv saved to {processed_csv_path}") |
|
|
|
|
|
def main(): |
|
for dataset in ["train", "test"]: |
|
preprocess_csv(dataset) |
|
|
|
|
|
if __name__ == "__main__": |
|
main() |
|
|