Spaces:

mgyigit
/

misinfo

Sleeping

App Files Files Community

misinfo/src/utils/data_utils.py

gyigit

update

54e8a79 3 months ago

raw

history blame

2.16 kB

	import os
	import pandas as pd
	from PIL import Image
	from typing import Dict, Any
	from src.utils.path_utils import get_project_root

	# Filesystem locations, resolved once when this module is imported.
	PROJECT_ROOT = get_project_root()
	PREPROCESSED_DIR = PROJECT_ROOT / "data/preprocessed"

	# Field names, in order. Presumably the columns of the preprocessed CSVs
	# (TODO confirm against the preprocessing step).
	HEADERS = [
	    "id",
	    "claim", "claim_image",
	    "evidence", "evidence_image",
	    "category",
	    "claim_ocr", "evidence_ocr",
	]


	def get_preprocessed_data(dataset: str = "train") -> pd.DataFrame:
	    """
	    Read one preprocessed dataset split from disk.

	    The file ``<dataset>.csv`` is looked up inside ``PREPROCESSED_DIR``.

	    Args:
	        dataset (str): Name of the split, e.g. 'train' or 'test'.
	            Defaults to 'train'.

	    Returns:
	        pd.DataFrame: The contents of the split's CSV file.

	    Raises:
	        FileNotFoundError: If no CSV file exists for the requested split.
	    """
	    source_csv = PREPROCESSED_DIR / f"{dataset}.csv"

	    # Happy path first; fall through to the error when the file is missing.
	    if source_csv.exists():
	        return pd.read_csv(source_csv)

	    raise FileNotFoundError(f"Preprocessed dataset CSV not found: {source_csv}")


	def _load_rgb_image(path: Any, label: str) -> Any:
	    """Open ``path`` as an RGB image, returning None when that is not possible.

	    Args:
	        path (Any): Candidate image path taken from a data row.
	        label (str): Short name ("claim" / "evidence") used in the failure message.

	    Returns:
	        Any: The loaded RGB image, or None if ``path`` is empty, not a path,
	        does not exist, or cannot be decoded.
	    """
	    # Empty CSV cells are read by pandas as NaN (a float). NaN is truthy and
	    # os.path.exists() raises TypeError on it, so anything that is not a
	    # real, non-empty path is treated as "no image".
	    if not isinstance(path, (str, bytes, os.PathLike)) or not path:
	        return None
	    if not os.path.exists(path):
	        return None

	    try:
	        # The context manager closes the underlying file handle; convert()
	        # returns an independent copy that stays valid afterwards.
	        with Image.open(path) as source:
	            return source.convert("RGB")
	    except Exception as e:  # deliberate best effort: one bad file must not abort the caller
	        print(f"Failed to load {label} image from {path}: {e}")
	        return None


	def load_images_for_row(row: Dict[str, Any]) -> Dict[str, Any]:
	    """
	    Load the claim and evidence images referenced by a row of data.

	    Args:
	        row (Dict[str, Any]): A mapping representing one row of preprocessed
	            data. The "claim_image" and "evidence_image" entries, when
	            present, are treated as image file paths.

	    Returns:
	        Dict[str, Any]: A shallow copy of ``row`` in which "claim_image" and
	        "evidence_image" hold the loaded RGB images, or None when the path is
	        missing, empty, not a path value (e.g. NaN), nonexistent, or the image
	        fails to load. The input ``row`` is not modified.
	    """
	    result = row.copy()  # Copy so the caller's row is left untouched

	    for key, label in (("claim_image", "claim"), ("evidence_image", "evidence")):
	        result[key] = _load_rgb_image(row.get(key), label)

	    return result