# (extraction artifacts neutralized: "Spaces:", "Running", "Running")
# -*- coding: utf-8 -*-
# file: compress_datasets.py
# time: 19:13 2023/2/5
# author: yangheng <[email protected]>
# github: https://github.com/yangheng95
# huggingface: https://huggingface.co/yangheng
# google scholar: https://scholar.google.com/citations?user=NPq5a_0AAAAJ&hl=en
# Copyright (C) 2021. All Rights Reserved.
# -*- coding: utf-8 -*-
# file: zip_datasets.py
# time: 05/11/2022 17:10
# author: yangheng <[email protected]>
# github: https://github.com/yangheng95
# GScholar: https://scholar.google.com/citations?user=NPq5a_0AAAAJ&hl=en
# ResearchGate: https://www.researchgate.net/profile/Heng-Yang-17/research
# Copyright (C) 2022. All Rights Reserved.
import os | |
import shutil | |
import zipfile | |
from pathlib import Path | |
import findfile | |
from pyabsa.utils.pyabsa_utils import fprint | |
def cascade_zip_datasets():
    """Zip every dataset folder under ``integrated_datasets`` individually.

    For each task folder found by ``findfile`` (e.g.
    ``integrated_datasets/<task>``), every dataset sub-folder ``<dataset>``
    is compressed into ``integrated_datasets/<task>.<dataset>.zip`` (the
    archive file name is lower-cased). Member names inside each archive are
    also lower-cased; the files are read from their real on-disk paths.

    Side effects: creates/overwrites ``*.zip`` files under
    ``integrated_datasets/``; prints progress via ``fprint``.
    """
    # Task-level dataset folders, e.g. integrated_datasets/apc_datasets.
    datasets = findfile.find_dirs("integrated_datasets", "datasets", recursive=1)
    for dataset in datasets:
        # Skip the container itself (and a stale zip of it, if matched).
        if dataset in (
            "integrated_datasets",
            "integrated_datasets.zip",
        ):
            continue
        task_name = Path(dataset).name
        for d in findfile.find_dirs(dataset, ""):
            fprint(f"compressing dataset: {d}")
            dataset_name = Path(d).name
            archive_path = f"integrated_datasets/{task_name}.{dataset_name}.zip".lower()
            # Context manager guarantees the archive is closed (and its
            # central directory flushed) even if walking/writing raises.
            with zipfile.ZipFile(archive_path, "w", zipfile.ZIP_DEFLATED) as zip_file:
                for root, _dirs, files in os.walk(d):
                    for file in files:
                        src = os.path.join(root, file)
                        # BUGFIX: the original lower-cased the *source* path
                        # before reading it, which breaks on case-sensitive
                        # filesystems when a file name contains uppercase
                        # letters. Read the real path; lower-case only the
                        # name stored inside the archive (same archive
                        # layout as before).
                        zip_file.write(src, arcname=src.lower())
if __name__ == "__main__":
    # One-off maintenance entry point: re-zip whatever datasets are already
    # present under ./integrated_datasets. To rebuild from scratch, manually
    # delete the 'integrated_datasets' folder first and re-fetch it via
    # pyabsa.download_all_available_datasets() before running this script.
    cascade_zip_datasets()