# unit_1_quiz/data_to_parquet.py

import json
import os
import tempfile

import pyarrow as pa
import pyarrow.parquet as pq

# current schema (refer to https://huggingface.co/spaces/phxia/dataset-builder/blob/main/dataset_uploader.py#L153 for more info)
schema = {
    "username": {"_type": "Value", "dtype": "string"},
    "unit1": {"_type": "Value", "dtype": "float64"},
    "unit2": {"_type": "Value", "dtype": "float64"},
    "unit3": {"_type": "Value", "dtype": "float64"},
    "unit4": {"_type": "Value", "dtype": "float64"},
    "certified": {"_type": "Value", "dtype": "int64"},
}
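
# The nested {"_type": ..., "dtype": ...} entries above match the feature
# metadata the `datasets` library writes for its value types. As a rough
# sketch (not part of the original file), the equivalent declaration with
# `datasets` would be:
#
#     from datasets import Features, Value
#     features = Features({name: Value(spec["dtype"]) for name, spec in schema.items()})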


def to_parquet(
    api,
    repo: str,
    username: str = "",
    unit1: float = 0.0,
    unit2: float = 0.0,
    unit3: float = 0.0,
    unit4: float = 0.0,
    certified: int = 0,
):
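    """Build a one-row parquet file of a user's unit scores and upload it to `repo`."""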
    # Scale fractional unit scores to percentages (0.0 stays 0.0)
    data = {
        "username": username,
        "unit1": unit1 * 100 if unit1 != 0 else 0.0,
        "unit2": unit2 * 100 if unit2 != 0 else 0.0,
        "unit3": unit3 * 100 if unit3 != 0 else 0.0,
        "unit4": unit4 * 100 if unit4 != 0 else 0.0,
        "certified": certified,
    }
    # Export data to Arrow format
    table = pa.Table.from_pylist([data])
    # Add metadata (used by the datasets library to reconstruct typed features)
    table = table.replace_schema_metadata(
        {"huggingface": json.dumps({"info": {"features": schema}})}
    )
    # Write to a temporary parquet file (delete=False keeps the file on
    # disk after close so it can be uploaded below)
    archive_file = tempfile.NamedTemporaryFile(delete=False)
    pq.write_table(table, archive_file.name)
    archive_file.close()
    api.upload_file(
        repo_id=repo,  # manually created repo
        repo_type="dataset",
        path_in_repo=f"{username}.parquet",  # each user gets their own parquet file
        path_or_fileobj=archive_file.name,
    )
    os.unlink(archive_file.name)  # remove the temporary file once uploaded
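

# Usage sketch (not part of the original file; the repo id and scores are
# hypothetical, and HfApi picks up a token from the environment or from
# `huggingface-cli login`):
if __name__ == "__main__":
    from huggingface_hub import HfApi

    api = HfApi()
    to_parquet(
        api,
        repo="your-username/agents-course-scores",  # hypothetical dataset repo
        username="alice",
        unit1=0.85,  # stored as 85.0 after scaling
        certified=1,
    )
    # Reading the file back with `datasets` restores the typed features from
    # the embedded "huggingface" metadata, e.g.:
    #     from datasets import load_dataset
    #     ds = load_dataset("your-username/agents-course-scores", split="train")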