Spaces:
Sleeping
Sleeping
from datasets import load_dataset, Dataset | |
import pandas as pd | |
from pdb import set_trace as st | |
def process_dataset_ultrafeedback(): | |
""" | |
Processes the 'train_prefs' and 'test_prefs' splits of the 'HuggingFaceH4/ultrafeedback_binarized' dataset | |
into a unified format for preference modeling. | |
Returns: | |
dict: A dictionary containing the unified 'train' and 'test' splits of the dataset in the KTO format. | |
Each split is a Hugging Face Dataset object. | |
""" | |
# Load the relevant splits of the dataset | |
dataset_name = "HuggingFaceH4/ultrafeedback_binarized" | |
train_prefs = load_dataset(dataset_name, split="train_prefs") | |
test_prefs = load_dataset(dataset_name, split="test_prefs") | |
# Function to transform a single example into the desired schema | |
def transform_data(example): | |
data_points = [] | |
# Chosen completion | |
chosen_completion = example["chosen"][1]["content"] | |
if chosen_completion.strip(): # Check for non-empty completions | |
data_points.append({ | |
"prompt": example["prompt"], | |
"completion": chosen_completion.strip(), | |
"label": True | |
}) | |
# Rejected completion | |
rejected_completion = example["rejected"][1]["content"] | |
if rejected_completion.strip(): # Check for non-empty completions | |
data_points.append({ | |
"prompt": example["prompt"], | |
"completion": rejected_completion.strip(), | |
"label": False | |
}) | |
return data_points | |
# Process train and test splits | |
train_data = [] | |
test_data = [] | |
for example in train_prefs: | |
train_data.extend(transform_data(example)) | |
for example in test_prefs: | |
test_data.extend(transform_data(example)) | |
# Convert unified data to DataFrames | |
train_df = pd.DataFrame(train_data) | |
test_df = pd.DataFrame(test_data) | |
# Convert to Hugging Face Dataset | |
unified_train = Dataset.from_pandas(train_df) | |
unified_test = Dataset.from_pandas(test_df) | |
return {"train": unified_train, "test": unified_test} | |
if __name__ == "__main__": | |
kto_dataset = process_dataset_ultrafeedback() | |
st() | |