# unit_1_quiz/data_to_parquet.py

import json
import os
import tempfile

import pyarrow as pa
import pyarrow.parquet as pq

# current schema (refer to https://huggingface.co/spaces/phxia/dataset-builder/blob/main/dataset_uploader.py#L153 for more info)
schema = {
    "username": {"_type": "Value", "dtype": "string"},
    "unit1": {"_type": "Value", "dtype": "float64"},
    "unit2": {"_type": "Value", "dtype": "float64"},
    "unit3": {"_type": "Value", "dtype": "float64"},
    "unit4": {"_type": "Value", "dtype": "float64"},
    "certified": {"_type": "Value", "dtype": "int64"},
}
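
# The nested {"_type": ..., "dtype": ...} entries above match the feature
# metadata the `datasets` library writes for its value types. As a rough
# sketch (not part of the original file), the equivalent declaration with
# `datasets` would be:
#
#     from datasets import Features, Value
#     features = Features({name: Value(spec["dtype"]) for name, spec in schema.items()})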


def to_parquet(
    api,
    repo: str,
    username: str = "",
    unit1: float = 0.0,
    unit2: float = 0.0,
    unit3: float = 0.0,
    unit4: float = 0.0,
    certified: int = 0,
):
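    """Build a one-row parquet file of a user's unit scores and upload it to `repo`."""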
    # Scale fractional unit scores to percentages (0.0 stays 0.0)
    data = {
        "username": username,
        "unit1": unit1 * 100 if unit1 != 0 else 0.0,
        "unit2": unit2 * 100 if unit2 != 0 else 0.0,
        "unit3": unit3 * 100 if unit3 != 0 else 0.0,
        "unit4": unit4 * 100 if unit4 != 0 else 0.0,
        "certified": certified,
    }
    # Export data to Arrow format
    table = pa.Table.from_pylist([data])
    # Add metadata (used by the datasets library to reconstruct typed features)
    table = table.replace_schema_metadata(
        {"huggingface": json.dumps({"info": {"features": schema}})}
    )
    # Write to a temporary parquet file (delete=False keeps the file on
    # disk after close so it can be uploaded below)
    archive_file = tempfile.NamedTemporaryFile(delete=False)
    pq.write_table(table, archive_file.name)
    archive_file.close()
    api.upload_file(
        repo_id=repo,  # manually created repo
        repo_type="dataset",
        path_in_repo=f"{username}.parquet",  # each user gets their own parquet file
        path_or_fileobj=archive_file.name,
    )
    os.unlink(archive_file.name)  # remove the temporary file once uploaded
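

# Usage sketch (not part of the original file; the repo id and scores are
# hypothetical, and HfApi picks up a token from the environment or from
# `huggingface-cli login`):
if __name__ == "__main__":
    from huggingface_hub import HfApi

    api = HfApi()
    to_parquet(
        api,
        repo="your-username/agents-course-scores",  # hypothetical dataset repo
        username="alice",
        unit1=0.85,  # stored as 85.0 after scaling
        certified=1,
    )
    # Reading the file back with `datasets` restores the typed features from
    # the embedded "huggingface" metadata, e.g.:
    #     from datasets import load_dataset
    #     ds = load_dataset("your-username/agents-course-scores", split="train")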