sujitvasanth committed
Commit 91dd7fc · verified · 1 Parent(s): 5fd42e9

Upload 2 files

quatisation_files/prepare_cal_data.py ADDED
@@ -0,0 +1,27 @@
+ # prepare_cal_data.py (Final version)
+ import os
+ # Set HF_HOME before importing datasets so the cache location takes effect
+ # (the Hugging Face libraries resolve it at import time).
+ os.environ['HF_HOME'] = os.path.join(os.path.expanduser('~'), '.cache', 'huggingface')
+
+ from datasets import load_dataset
+ import pandas as pd
+
+ print("Downloading and preparing a LARGER wikitext calibration dataset...")
+
+ # Stream 10,000 samples instead of 1,000 to ensure enough calibration tokens
+ dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="train", streaming=True).take(10000)
+
+ output_parquet_file = "wikitext_cal_data.parquet"
+
+ # Keep only non-empty text rows
+ data_list = []
+ for item in dataset:
+     text = item['text'].strip()
+     if text:
+         data_list.append(text)
+
+ df = pd.DataFrame(data_list, columns=['text'])
+ df.to_parquet(output_parquet_file, engine='fastparquet')  # requires the fastparquet package
+
+ print(f"'{output_parquet_file}' created successfully with more data.")
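
As a quick sanity check, the parquet written by prepare_cal_data.py can be read back with pandas before it is handed to a quantization tool. The sketch below is illustrative and not part of the commit; the filename and the 'text' column come from the script above, while check_cal_data.py is only a hypothetical name.

# check_cal_data.py - illustrative sanity check, not part of this commit
import pandas as pd

# Read the calibration file produced by prepare_cal_data.py
df = pd.read_parquet("wikitext_cal_data.parquet")
print(f"{len(df)} calibration samples loaded")
print(df['text'].iloc[0][:200])  # preview the first sample

# Most calibration loaders expect a plain list of strings
calibration_texts = df['text'].tolist()
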
quatisation_files/wikitext_cal_data.parquet ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b5e601fae97898f0473516b69910c4fac839d059cdcaf2e461cafe14d5c6f341
+ size 1692177