caug37 committed on
Commit
d3c92ef
·
verified ·
1 Parent(s): 0fba120

Upload 41 files

Browse files
Files changed (41) hide show
  1. fine_tune_joyce.py +47 -0
  2. finn_wake.csv +0 -0
  3. finn_wake.txt +0 -0
  4. finn_wake_dataset/cache-07333d095ed3feb5.arrow +3 -0
  5. finn_wake_dataset/cache-08eaee58c5a946a5.arrow +3 -0
  6. finn_wake_dataset/cache-108bb49fa40e19cc.arrow +3 -0
  7. finn_wake_dataset/cache-21a914d68d7decd3.arrow +3 -0
  8. finn_wake_dataset/cache-21efcd3d18f14eb8.arrow +3 -0
  9. finn_wake_dataset/cache-25107278423e0b27.arrow +3 -0
  10. finn_wake_dataset/cache-294de0f58e07010c.arrow +3 -0
  11. finn_wake_dataset/cache-3d5c944f43725d69.arrow +3 -0
  12. finn_wake_dataset/cache-40ed1615346dc08d.arrow +3 -0
  13. finn_wake_dataset/cache-4160687fb3073882.arrow +3 -0
  14. finn_wake_dataset/cache-4328f755c07fa7f2.arrow +3 -0
  15. finn_wake_dataset/cache-461df009004232f3.arrow +3 -0
  16. finn_wake_dataset/cache-4a71b21bdeaa2ec7.arrow +3 -0
  17. finn_wake_dataset/cache-54251f52f75d36f0.arrow +3 -0
  18. finn_wake_dataset/cache-5929bf28441deefd.arrow +3 -0
  19. finn_wake_dataset/cache-5be06efb6c2a7249.arrow +3 -0
  20. finn_wake_dataset/cache-84a914c4d2cc8d93.arrow +3 -0
  21. finn_wake_dataset/cache-8cc3936385189eb8.arrow +3 -0
  22. finn_wake_dataset/cache-8f8de0a505da9b57.arrow +3 -0
  23. finn_wake_dataset/cache-93f1332b83d2a02e.arrow +3 -0
  24. finn_wake_dataset/cache-a463b77c9c10fcdd.arrow +3 -0
  25. finn_wake_dataset/cache-b41c1b1ef780b910.arrow +3 -0
  26. finn_wake_dataset/cache-ba9518a20dfb972e.arrow +3 -0
  27. finn_wake_dataset/cache-dda16c8e4395755d.arrow +3 -0
  28. finn_wake_dataset/cache-e1c4ec6e99052dfc.arrow +3 -0
  29. finn_wake_dataset/cache-e6f91d075c0b5063.arrow +3 -0
  30. finn_wake_dataset/cache-e79c82020953bbef.arrow +3 -0
  31. finn_wake_dataset/cache-f135f63843848cc4.arrow +3 -0
  32. finn_wake_dataset/cache-f39a662b6194c6e4.arrow +3 -0
  33. finn_wake_dataset/cache-f7fe086af672c971.arrow +3 -0
  34. finn_wake_dataset/cache-fb842fe1846f33ac.arrow +3 -0
  35. finn_wake_dataset/cache-fda59bdb198b3ef4.arrow +3 -0
  36. finn_wake_dataset/cache-fefdfd5378240940.arrow +3 -0
  37. finn_wake_dataset/data-00000-of-00001.arrow +3 -0
  38. finn_wake_dataset/dataset_info.json +12 -0
  39. finn_wake_dataset/state.json +13 -0
  40. process_wake.py +23 -0
  41. text_gen.py +27 -0
fine_tune_joyce.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Fine-tune TinyLlama-1.1B-Chat on the Finnegans Wake dataset built by process_wake.py."""

from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from datasets import load_from_disk

# Dataset of ~100-word text chunks produced by process_wake.py (single 'text' column).
dataset = load_from_disk('finn_wake_dataset')

tokenizer = AutoTokenizer.from_pretrained("tinyllama/tinyllama-1.1b-chat-v1.0")
model = AutoModelForCausalLM.from_pretrained("tinyllama/tinyllama-1.1b-chat-v1.0")

# Causal-LM tokenizers often ship without a pad token, but padding="max_length"
# below requires one; fall back to the EOS token.
if tokenizer.pad_token is None:
    print("Tokenizer does not have a pad token set. Setting pad_token to eos_token.")
    tokenizer.pad_token = tokenizer.eos_token

# Save AFTER the pad token is configured (the original saved first, so the
# persisted tokenizer lacked the pad token) and fix the ".results" typo so the
# tokenizer lands next to the model checkpoints under ./results.
tokenizer.save_pretrained("./results/checkpoint-12000/")


def tokenize_function(examples):
    """Tokenize a batch of examples; labels mirror input_ids for causal-LM loss."""
    tokenized_inputs = tokenizer(examples['text'], padding="max_length", truncation=True, max_length=128)
    tokenized_inputs["labels"] = tokenized_inputs["input_ids"].copy()
    return tokenized_inputs


tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text"])
train_test_split = tokenized_dataset.train_test_split(test_size=0.1)
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=1,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_strategy="steps",
    save_steps=500,
    save_total_limit=2,  # cap checkpoints on disk — a prior run died from a full disk
    use_cpu=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

# Below has been modified because the initial run ran out of disk storage, so
# training resumes from the last good checkpoint with the adjusted save settings.
trainer.train(resume_from_checkpoint="./results/checkpoint-10000")
finn_wake.csv ADDED
The diff for this file is too large to render. See raw diff
 
finn_wake.txt ADDED
The diff for this file is too large to render. See raw diff
 
finn_wake_dataset/cache-07333d095ed3feb5.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7799f52711777e954892b9143657a98eb4ad4412c8eb9ec8d51cf9f2d29d0960
3
+ size 33738392
finn_wake_dataset/cache-08eaee58c5a946a5.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:57d59e9370cb6130a4315fa2e751c32aab502b2f9bf80f25cb52e243364f6915
3
+ size 23368
finn_wake_dataset/cache-108bb49fa40e19cc.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fee0801181eb75905ceec32c2100b5e957535fec236e46f6268fb204b9a87f2d
3
+ size 23368
finn_wake_dataset/cache-21a914d68d7decd3.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4db4f65eb28996e319333d5b010d0881b9a6095fc04f8fd9c6eba422037813d5
3
+ size 23368
finn_wake_dataset/cache-21efcd3d18f14eb8.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:751aada291665431650955366b4430502c33b9b2c2aaf7376eaf89abb08e9621
3
+ size 83745984
finn_wake_dataset/cache-25107278423e0b27.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9866ffc0fbbc4c8bc780314aa2fcc9dabf43df74f71d3944b67f37ab0e794158
3
+ size 2968
finn_wake_dataset/cache-294de0f58e07010c.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4af4228221d06dc079633bac9bf6f84af52752e3b569e99b5da1a4283e1c8ce4
3
+ size 33832
finn_wake_dataset/cache-3d5c944f43725d69.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6ac4e76ce5d8ba937fd8ab9db955eacb62c1d2705c5a5b9765253687549e3e33
3
+ size 2968
finn_wake_dataset/cache-40ed1615346dc08d.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8f10c74d395f429b9b9795271ac556ede1daf8b8c05f04986d7f54fb4aea0688
3
+ size 23368
finn_wake_dataset/cache-4160687fb3073882.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8ad2214bd7e550cec284ffaa766b916a076a0e75fe377ce855634dbfeea467ae
3
+ size 23368
finn_wake_dataset/cache-4328f755c07fa7f2.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:549c73a76cd629ad9e36435289c0354fbd8c44dc91edcd3a9e3a2c8efa604e67
3
+ size 33738392
finn_wake_dataset/cache-461df009004232f3.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d214cac2a17f55672d3c7a182a25beb09ac4a6232f91518449bf64b4c36e4aa6
3
+ size 33832
finn_wake_dataset/cache-4a71b21bdeaa2ec7.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9ec3d6cde0807e531ecf7b92decd38a27a3ace2c04f2190721abd9d4968d9255
3
+ size 2968
finn_wake_dataset/cache-54251f52f75d36f0.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ca4cd45ca17b5d83c8c97bea2fab7ceef28b99931062c92cb1e19bbf02c63415
3
+ size 4096
finn_wake_dataset/cache-5929bf28441deefd.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:801a239d436b573afd7561832db31eaf9cc40c878a4bc6aeccbdf9913df699e9
3
+ size 23368
finn_wake_dataset/cache-5be06efb6c2a7249.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9a992fa9985e1e353bb8ade640fbfff72e2e75db5a2a72d8bec3ba5a258ba99c
3
+ size 33832
finn_wake_dataset/cache-84a914c4d2cc8d93.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ad6c49e3bdb4f56e85c5f62077186a62d17646acc4d8fc1440cfd4317e1278d9
3
+ size 2968
finn_wake_dataset/cache-8cc3936385189eb8.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1b077a811daba8e024ca0e53fcc08565f1f2d676e586f39b25f88881b45adc60
3
+ size 4096
finn_wake_dataset/cache-8f8de0a505da9b57.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9ace69e62138b3125ae070c00efafd2193e23e6cf52199a49523852169bca17d
3
+ size 2968
finn_wake_dataset/cache-93f1332b83d2a02e.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1a90517adc17bac71edd4db47e7fe7780687ba2a70711111cd43fd69a6592fe9
3
+ size 4096
finn_wake_dataset/cache-a463b77c9c10fcdd.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b236cc0e50d9affe1746fc1530b3e99a9d572acce641e26d0c5ef4cb164af744
3
+ size 2968
finn_wake_dataset/cache-b41c1b1ef780b910.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0e26f42fdfc293b91e57c84890b675de30bbdb88238775d0addbbaf488e87335
3
+ size 2968
finn_wake_dataset/cache-ba9518a20dfb972e.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3967fa682ccf74313b48cab8a799cde75a4dd48f7324cdb0361feef6d42d12eb
3
+ size 23368
finn_wake_dataset/cache-dda16c8e4395755d.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fe7a51ecd1cb6d50858465c20d310f58a8ca6428dc041313d4c84bf7ba333c74
3
+ size 23368
finn_wake_dataset/cache-e1c4ec6e99052dfc.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:462b68fef59b237820edd0a402230fb1e3953f1e3572977cf15b2e0ad46400a3
3
+ size 5271744
finn_wake_dataset/cache-e6f91d075c0b5063.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cb7be6b6fbca4557816f0dd00b3ac3fcfbbe0a08a2c93b851455ae642b2e116a
3
+ size 7640328
finn_wake_dataset/cache-e79c82020953bbef.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:04731bd4add6cb26e32f9e697f5764f421f64e62e774b13371743ed48327f4c0
3
+ size 23368
finn_wake_dataset/cache-f135f63843848cc4.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e4a87a79fea2a4dac7a8607f5aab278894807282427da346899de8d52b132100
3
+ size 2968
finn_wake_dataset/cache-f39a662b6194c6e4.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8a591977c03ca3bde5d1916489df376e89bcce387d660394adfaf90448d5bc4a
3
+ size 2968
finn_wake_dataset/cache-f7fe086af672c971.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:36a242fdf45fcea8a806a8df21299049d99a2c6a3489a9a6ddfdab82626df1ef
3
+ size 23368
finn_wake_dataset/cache-fb842fe1846f33ac.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fb719489a9a6868c7f28d4e237e1d7ad516b74333fc2adff4dea6010d26e381b
3
+ size 23368
finn_wake_dataset/cache-fda59bdb198b3ef4.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d9bd80aa28367576aaa834a272d4a192d554d2fc3f70914d1b53db9851f04231
3
+ size 2968
finn_wake_dataset/cache-fefdfd5378240940.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:867fbcea23ec00b23b23e36e1522a1795d6aa179f27d098fae2973fe3b3ba38c
3
+ size 2968
finn_wake_dataset/data-00000-of-00001.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f3684e77cfcc0fdcc1a627db2f5caa5f306f61f70ee4d2434f205293fc491037
3
+ size 1321944
finn_wake_dataset/dataset_info.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "citation": "",
3
+ "description": "",
4
+ "features": {
5
+ "text": {
6
+ "dtype": "string",
7
+ "_type": "Value"
8
+ }
9
+ },
10
+ "homepage": "",
11
+ "license": ""
12
+ }
finn_wake_dataset/state.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_data_files": [
3
+ {
4
+ "filename": "data-00000-of-00001.arrow"
5
+ }
6
+ ],
7
+ "_fingerprint": "953e00a2598c0e70",
8
+ "_format_columns": null,
9
+ "_format_kwargs": {},
10
+ "_format_type": null,
11
+ "_output_all_columns": false,
12
+ "_split": null
13
+ }
process_wake.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Clean the Finnegans Wake plain text and save it as a Hugging Face dataset."""

import pandas as pd


def is_page_number(line):
    """Return True if *line* contains nothing but a bare page number."""
    return line.strip().isdigit()


def split_paragraph_into_smaller_parts(paragraph, max_length=100):
    """Split a paragraph into smaller parts with a maximum length in words."""
    words = paragraph.split()
    for i in range(0, len(words), max_length):
        yield ' '.join(words[i:i + max_length])


def main():
    """Read finn_wake.txt, strip page numbers, chunk paragraphs, write CSV + dataset."""
    # Imported lazily so the pure-text helpers above can be used without
    # the heavyweight `datasets` dependency installed.
    from datasets import Dataset

    with open("./finn_wake.txt", "r", encoding="utf-8") as file:
        lines = file.readlines()

    # Drop lines that are bare page numbers left over from the source scan.
    filtered_lines = [line for line in lines if not is_page_number(line)]
    text = ''.join(filtered_lines)

    paragraphs = text.split('\n')
    split_paragraphs = []
    for paragraph in paragraphs:
        if paragraph.strip() != "":
            split_paragraphs.extend(split_paragraph_into_smaller_parts(paragraph, max_length=100))

    df = pd.DataFrame(split_paragraphs, columns=['text'])
    dataset = Dataset.from_pandas(df)
    df.to_csv('finn_wake.csv', index=False)
    dataset.save_to_disk('finn_wake_dataset')


if __name__ == "__main__":
    main()
text_gen.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+
3
+
4
"""Generate text from the fine-tuned TinyLlama checkpoint."""

from transformers import AutoTokenizer, AutoModelForCausalLM

# Post-training: load fine-tuned weights from the local checkpoint but the
# tokenizer from the base model (the checkpoint initially lacked tokenizer files).
model_path = "./results/checkpoint-12000"
model = AutoModelForCausalLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained("tinyllama/tinyllama-1.1b-chat-v1.0")

input_text = "ae left to go to ireland and found a fairy"
# Encode once and reuse — the original encoded the same prompt twice.
input_ids = tokenizer.encode(input_text, return_tensors="pt")
output = model.generate(
    input_ids=input_ids,
    max_length=400,
    num_return_sequences=1,
    temperature=0.7,
    top_k=50,
    top_p=0.95,
    do_sample=True,
    num_beams=5,  # NOTE(review): beams + do_sample = beam-sample decoding — confirm this is intended
)

decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)
print(decoded_output)