Spaces:

mgyigit
/

misinfo

Sleeping

App Files Files Community

misinfo / src /preprocess /preprocess.py

gyigit

update

54e8a79 3 months ago

raw

history blame

2.41 kB

	import pandas as pd

	from src.utils.data_utils import HEADERS
	from src.utils.path_utils import get_project_root

	# Constants
	PROJECT_ROOT = get_project_root()
	RAW_DIR = PROJECT_ROOT / "data/raw/factify"
	PROCESSED_DIR = PROJECT_ROOT / "data/preprocessed"
	IMAGES_DIR = RAW_DIR / "extracted/images"


	def ensure_directories():
	"""Ensure that necessary directories exist."""
	PROCESSED_DIR.mkdir(parents=True, exist_ok=True) # Create 'data/preprocessed'


	def preprocess_csv(dataset: str):
	"""
	Preprocess the given dataset CSV (train or test).

	Args:
	dataset (str): The dataset name ('train' or 'test').
	"""
	# Paths
	ensure_directories()

	csv_path = RAW_DIR / f"extracted/{dataset}.csv"
	processed_csv_path = PROCESSED_DIR / f"{dataset}.csv"
	images_folder = IMAGES_DIR / dataset

	if not csv_path.exists():
	print(f"Dataset CSV not found: {csv_path}")
	return

	# Load the CSV
	df = pd.read_csv(csv_path, names=HEADERS, header=None, sep="\t", skiprows=1)

	# Update file paths for images
	def update_image_path(row, column_name):
	"""Update the image path if it exists, else leave as None."""
	image_file = row[column_name]
	file_id = row["id"]
	if column_name == "claim_image_original":
	file_path = images_folder / f"{file_id}_claim.jpg"
	elif column_name == "evidence_image_original":
	file_path = images_folder / f"{file_id}_evidence.jpg"
	else:
	return None

	# Check if the file exists
	if file_path.exists():
	# Use the relative path starting from "/data/.."
	return str(file_path.relative_to(PROJECT_ROOT))
	return None

	df.rename(
	columns={
	"claim_image": "claim_image_original",
	"evidence_image": "evidence_image_original",
	},
	inplace=True,
	)
	df["claim_image"] = df.apply(
	lambda row: update_image_path(row, "claim_image_original"), axis=1
	)
	df["evidence_image"] = df.apply(
	lambda row: update_image_path(row, "evidence_image_original"), axis=1
	)

	# Save the processed CSV
	df.to_csv(processed_csv_path, index=False)
	print(f"Processed {dataset}.csv saved to {processed_csv_path}")


	def main():
	for dataset in ["train", "test"]:
	preprocess_csv(dataset)


	if __name__ == "__main__":
	main()