# misinfo/src/utils/data_utils.py (revision 54e8a79, "update")
import os
import pandas as pd
from PIL import Image
from typing import Dict, Any
from src.utils.path_utils import get_project_root
# ---------------------------------------------------------------------------
# Module-level constants
# ---------------------------------------------------------------------------

# Root of the project, as reported by the project's own path helper.
PROJECT_ROOT = get_project_root()

# Directory that holds the preprocessed "<dataset>.csv" files.
PREPROCESSED_DIR = PROJECT_ROOT / "data/preprocessed"

# Field names of one record, grouped by role (presumably the column order of
# the preprocessed CSVs; not referenced elsewhere in this part of the file).
HEADERS = [
    "id",
    "claim", "claim_image",
    "evidence", "evidence_image",
    "category",
    "claim_ocr", "evidence_ocr",
]
def get_preprocessed_data(dataset: str = "train") -> pd.DataFrame:
    """
    Read one preprocessed dataset split from disk.

    The file ``<dataset>.csv`` is looked up inside ``PREPROCESSED_DIR``.

    Args:
        dataset (str): Name of the split to read, e.g. 'train' or 'test'.
            Defaults to 'train'.

    Returns:
        pd.DataFrame: The contents of the CSV file as parsed by pandas.

    Raises:
        FileNotFoundError: If no CSV file exists for the requested split.
    """
    csv_path = PREPROCESSED_DIR / f"{dataset}.csv"

    if csv_path.exists():
        return pd.read_csv(csv_path)

    # Fail with the resolved path in the message so the caller can see
    # exactly which file was expected.
    raise FileNotFoundError(f"Preprocessed dataset CSV not found: {csv_path}")
def _load_rgb_image(path: Any, label: str) -> Any:
    """
    Open the image stored at ``path`` and return it converted to RGB.

    Args:
        path (Any): Candidate filesystem path taken from a data row.
        label (str): Short name of the image kind ('claim' or 'evidence'),
            used only in the failure message.

    Returns:
        Any: The RGB image, or None when ``path`` is not a usable path, the
        file does not exist, or the image cannot be decoded.
    """
    # Reject anything that is not a non-empty path-like value up front.
    # A missing cell may arrive as NaN (a float, and truthy) when the row
    # comes from a pandas-loaded CSV; handing that to os.path.exists()
    # raises TypeError instead of returning False.
    if not isinstance(path, (str, bytes, os.PathLike)) or not path:
        return None
    if not os.path.exists(path):
        return None

    try:
        # Image.open() is lazy and keeps the file open; the context manager
        # releases it once convert() has produced an independent copy.
        with Image.open(path) as source_image:
            return source_image.convert("RGB")
    except Exception as e:
        # Deliberately best-effort: one unreadable image must not abort the
        # processing of the remaining data, so report it and carry on.
        print(f"Failed to load {label} image from {path}: {e}")
        return None


def load_images_for_row(row: Dict[str, Any]) -> Dict[str, Any]:
    """
    Load the claim and evidence images for a given row of data.

    The 'claim_image' and 'evidence_image' entries of ``row`` are treated as
    file paths. Each one is replaced by the loaded RGB image, or by None when
    the entry is missing, empty, not a path (e.g. NaN), points to a file that
    does not exist, or cannot be decoded.

    Args:
        row (Dict[str, Any]): A dictionary representing a row of preprocessed data.

    Returns:
        Dict[str, Any]: A copy of the row with both image entries replaced as
        described above. The input row itself is left unmodified.
    """
    result = row.copy()  # Copy the original row to avoid modifying the input
    result["claim_image"] = _load_rgb_image(row.get("claim_image"), "claim")
    result["evidence_image"] = _load_rgb_image(row.get("evidence_image"), "evidence")
    return result