import os
import pickle
import shutil

from src.utils.path_utils import get_project_root


def separate_evidence_images(base_dir):
    """
    Separates evidence images from the train directory and copies them into a new 'evidence_corpus' folder.

    Args:
        base_dir (str): The base directory containing the 'train' folder.
    """
    # Define paths
    datasets = ["train", "test"]
    evidence_corpus_dir = os.path.join(base_dir, "evidence_corpus")

    # Create the evidence_corpus directory if it doesn't exist
    os.makedirs(evidence_corpus_dir, exist_ok=True)

    # Loop through each dataset directory and copy the evidence images
    for dataset in datasets:
        dataset_dir = os.path.join(base_dir, dataset)
        for filename in os.listdir(dataset_dir):
            # Evidence files have stems ending in "_evidence" (e.g. "0_evidence.jpg")
            if filename.split("_")[-1].split(".")[0] == "evidence":
                # Prefix with the split name so train/test filenames cannot collide
                new_filename = f"{dataset}_{filename}"
                source_path = os.path.join(dataset_dir, filename)
                target_path = os.path.join(evidence_corpus_dir, new_filename)

                shutil.copy(source_path, target_path)

    print("All evidence images in the train set have been copied.")


def update_pickle_keys(pickle_file_path, output_pickle_path=None):
    """
    Rewrites each key of the evidence-features pickle from a path to a file in
    'evidence_corpus' (e.g. 'test_0_evidence.jpg') to the repository-relative
    path of the original image
    (e.g. 'data/raw/factify/extracted/images/test/0_evidence.jpg').

    Args:
        pickle_file_path (str): Path to the existing features pickle.
        output_pickle_path (str, optional): Where to save the updated pickle.
            If omitted, the input file is overwritten in place.
    """
    # Open and load the existing pickle
    with open(pickle_file_path, "rb") as f:
        feature_dict = pickle.load(f)

    updated_dict = {}

    # Update each key
    for old_path, features in feature_dict.items():
        # Extract the filename (e.g., test_0_evidence.jpg)
        filename = os.path.basename(old_path)

        # Determine the dataset split ("test" or "train") from the filename prefix
        split_name, _, original_name = filename.partition("_")
        if split_name not in ("test", "train"):
            raise ValueError(f"Unexpected filename format: {filename}")

        # Rebuild the repository-relative path of the original image
        new_relative_path = os.path.join(
            "data",
            "raw",
            "factify",
            "extracted",
            "images",
            split_name,
            original_name,
        )

        # Add the updated key and its value to the new dictionary
        updated_dict[new_relative_path] = features

    # Save the updated dictionary back to a pickle file
    output_path = output_pickle_path if output_pickle_path else pickle_file_path
    with open(output_path, "wb") as f:
        pickle.dump(updated_dict, f)

    print(f"Updated pickle saved at: {output_path}")


# Example usage
if __name__ == "__main__":
    project_root = get_project_root()

    # Copy all evidence images into the evidence_corpus folder
    base_dir = os.path.join(
        project_root, "data", "raw", "factify", "extracted", "images"
    )
    separate_evidence_images(base_dir)

    # To rewrite the feature-pickle keys, point this at the actual pickle
    # location (the path below is illustrative):
    # pickle_file_path = os.path.join(project_root, "evidence_features.pkl")
    # update_pickle_keys(pickle_file_path)