| import sys | |
| import os | |
| from tqdm import tqdm | |
| sys.path.append('../../') | |
| if __name__ == '__main__': | |
| from data.fs_datasets import load_dataset | |
| dataset = load_dataset('wudao_180g', num_proc=100) | |
| print('dataset loaded', flush=True) | |
| shuffle_ds = dataset['train'].shuffle(seed=42, writer_batch_size=1000) | |
| print('dataset shuffled', flush=True) | |
| need_size = len(shuffle_ds) | |
| f = open('shuffle_corpus_{}.txt'.format(need_size), 'w', encoding='utf-8') | |
| for i in tqdm(range(0, need_size)): | |
| f.write(shuffle_ds[i]['text'] + os.linesep) | |
| f.close() | |