|
import os |
|
import pandas as pd |
|
from PIL import Image |
|
from typing import Dict, Any |
|
from src.utils.path_utils import get_project_root |
|
|
|
|
|
# Project root as reported by the shared path helper.
PROJECT_ROOT = get_project_root()

# Directory under the project root from which get_preprocessed_data() reads
# its "<dataset>.csv" files.
# NOTE(review): assumes get_project_root() returns a pathlib-style path object.
PREPROCESSED_DIR = PROJECT_ROOT.joinpath("data/preprocessed")
|
|
|
# Field names used for a data row, in order.
# NOTE(review): presumably the column layout of the preprocessed CSV files;
# HEADERS is not referenced in this part of the module, so confirm against
# the code that writes those files.
HEADERS = [
    "id",
    "claim", "claim_image",
    "evidence", "evidence_image",
    "category",
    "claim_ocr", "evidence_ocr",
]
|
|
|
|
|
def get_preprocessed_data(dataset: str = "train") -> pd.DataFrame:
    """
    Read one preprocessed dataset from disk into a DataFrame.

    Args:
        dataset (str): Name of the dataset, documented as either 'train' or 'test'.
            The file read is ``<dataset>.csv`` inside ``PREPROCESSED_DIR``.
            Defaults to 'train'.

    Returns:
        pd.DataFrame: The contents of the CSV file as parsed by ``pd.read_csv``.

    Raises:
        FileNotFoundError: If the CSV file for ``dataset`` does not exist.
    """
    file_name = f"{dataset}.csv"
    csv_path = PREPROCESSED_DIR / file_name

    # Success path first; a missing file is reported with the full path.
    if csv_path.exists():
        return pd.read_csv(csv_path)

    raise FileNotFoundError(f"Preprocessed dataset CSV not found: {csv_path}")
|
|
|
|
|
def _load_rgb_image(image_path: Any, label: str) -> Any:
    """
    Open the image stored at ``image_path`` and return it converted to RGB.

    Args:
        image_path (Any): Candidate path taken from a data row. Only non-empty
            ``str``, ``bytes`` or ``os.PathLike`` values are treated as paths.
        label (str): Short name of the image ("claim" or "evidence"); used only
            in the failure message.

    Returns:
        Any: The RGB image, or ``None`` when the value is not a usable path, the
        file does not exist, or the image cannot be opened/converted.
    """
    # Missing values are not always None: pandas represents an empty CSV cell as
    # float NaN, which is truthy and would make os.path.exists raise TypeError.
    # Anything that is not path-like is therefore treated as "no image".
    if not isinstance(image_path, (str, bytes, os.PathLike)) or not image_path:
        return None

    # A path that does not exist is silently skipped (no message printed).
    if not os.path.exists(image_path):
        return None

    try:
        # Image.open is lazy and keeps the file open; the context manager closes
        # it once convert() has produced an independent in-memory copy.
        with Image.open(image_path) as source_image:
            return source_image.convert("RGB")
    except Exception as e:
        # Deliberately broad: loading is best-effort, so any failure is reported
        # and the image is treated as missing instead of aborting the caller.
        print(f"Failed to load {label} image from {image_path}: {e}")
        return None


def load_images_for_row(row: Dict[str, Any]) -> Dict[str, Any]:
    """
    Load the claim and evidence images for a given row of data.

    The input row is not modified; a shallow copy is returned in which the
    ``"claim_image"`` and ``"evidence_image"`` entries are replaced by the loaded
    RGB images. An entry is set to ``None`` when its value is missing or not a
    path (e.g. ``None``, an empty string, NaN), when the file does not exist, or
    when the image cannot be read.

    Args:
        row (Dict[str, Any]): A dictionary representing a row of preprocessed data.
            The image paths are read from its ``"claim_image"`` and
            ``"evidence_image"`` keys; both keys are optional.

    Returns:
        Dict[str, Any]: A copy of the row with both image keys always present,
        each holding either a loaded image or ``None``.
    """
    result = row.copy()
    result["claim_image"] = _load_rgb_image(row.get("claim_image"), "claim")
    result["evidence_image"] = _load_rgb_image(row.get("evidence_image"), "evidence")
    return result
|
|