caug37 committed
Commit c6bb4cd · verified · 1 Parent(s): 08caf02

Upload 25 files

.gitattributes CHANGED
@@ -34,3 +34,15 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 model.gguf filter=lfs diff=lfs merge=lfs -text
+ bentham_text.txt filter=lfs diff=lfs merge=lfs -text
+ BenthamVol1.LibertyFund.pdf filter=lfs diff=lfs merge=lfs -text
+ BenthamVol10.LibertyFund.pdf filter=lfs diff=lfs merge=lfs -text
+ BenthamVol11.LibertyFund.pdf filter=lfs diff=lfs merge=lfs -text
+ BenthamVol2.LibertyFund.pdf filter=lfs diff=lfs merge=lfs -text
+ BenthamVol3.LibertyFund.pdf filter=lfs diff=lfs merge=lfs -text
+ BenthamVol4.LibertyFund.pdf filter=lfs diff=lfs merge=lfs -text
+ BenthamVol5.LibertyFund.pdf filter=lfs diff=lfs merge=lfs -text
+ BenthamVol6.LibertyFund.pdf filter=lfs diff=lfs merge=lfs -text
+ BenthamVol7.LibertyFund.pdf filter=lfs diff=lfs merge=lfs -text
+ BenthamVol8.LibertyFund.pdf filter=lfs diff=lfs merge=lfs -text
+ BenthamVol9.LibertyFund.pdf filter=lfs diff=lfs merge=lfs -text
BenthamVol1.LibertyFund.pdf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7c6fe2e819284fbe21628faae4a7570515da7d2a7aa4a04421d0e3573fb68d82
+ size 4623992
BenthamVol10.LibertyFund.pdf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:401e0be610adc3c49feb971639263c1ab9fc2c79af5050cb4d76cb988cfbc86d
+ size 3421267
BenthamVol11.LibertyFund.pdf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c1c209df71f83dcef64804792358da23beef725366fb098e72f8ac9631f06d00
+ size 1045494
BenthamVol2.LibertyFund.pdf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4d661fde64870b59cc00e0ecc049585b612b2bc96d5845ef60120c401c87204b
+ size 3756476
BenthamVol3.LibertyFund.pdf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5b39d69c5e4848bbec153e74291c723d73e5be07e3ca9efdd1dd04285c1afeb9
+ size 4325352
BenthamVol4.LibertyFund.pdf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:be5974483aa1621f647578b3eac9140d7684818e94a258e7cd37961a340d4d90
+ size 4533915
BenthamVol5.LibertyFund.pdf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b99a29acca810e322c6186e5068ab2fc643293e7f4a784e2668cc6ae2206a1d5
+ size 4506297
BenthamVol6.LibertyFund.pdf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3c5d377a1ce60f89dbc47d27551baeacd1a0a402e4c9ad157e0fa8e5fa5ac101
+ size 3794279
BenthamVol7.LibertyFund.pdf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c6bac174be0b15b26587a39472ae63f84f638d0f974ce5dd76791f36fb6be4f2
+ size 4135943
BenthamVol8.LibertyFund.pdf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c5e5f862eed5960a4908b09692d617f7fedacb0e0da4b2bc84a884b76a9c4c0f
+ size 4983726
BenthamVol9.LibertyFund.pdf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d45fb184f6ed0678d923e876e15497bed5997f93aabe8686f05b70e57a9f506c
+ size 4814191
bentham_chunked/cache-5cbdbfe773332528.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b371ef80caf63f5aea26ab76e2ecd79453d826f65aa9bd42825bc87af742dd26
+ size 188607104
bentham_chunked/cache-6dab9c48ee95e985.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fa7b88d9c574aac8d5503bdc9863198ac910e2735d9c1333e8fbaa61f08e0c06
+ size 46344
bentham_chunked/cache-7a28cb91cfcdeb71.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b0704409c709d1499ee3670c941af0263ae5eb383b3fd6455468cf65e7e0d0b3
+ size 414168
bentham_chunked/cache-93db310462e229c3.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:25b8ba600c274704362f1a69ce9c709ec2803649a92c1b2ef128e19000db6294
+ size 94654336
bentham_chunked/cache-d9fcbe649ebfa850.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4557dfea9c2539d8b2102051c2f5bb6c0de473be66e27aea5ff627514a5abf13
+ size 414168
bentham_chunked/cache-dd243b4e82321451.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1d6acc5eb21c051c097de2d2d8aa627651b635ffb95ced31ca88f8212ee09a6a
+ size 46344
bentham_chunked/data-00000-of-00001.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b92bd87e39aded15d180303639f11fdf680bdbcd5ffa832e2bc9c5371e1e43e0
+ size 33486400
bentham_chunked/dataset_info.json ADDED
@@ -0,0 +1,12 @@
+ {
+   "citation": "",
+   "description": "",
+   "features": {
+     "text": {
+       "dtype": "string",
+       "_type": "Value"
+     }
+   },
+   "homepage": "",
+   "license": ""
+ }
bentham_chunked/state.json ADDED
@@ -0,0 +1,13 @@
+ {
+   "_data_files": [
+     {
+       "filename": "data-00000-of-00001.arrow"
+     }
+   ],
+   "_fingerprint": "cb9daa9a8124efb5",
+   "_format_columns": null,
+   "_format_kwargs": {},
+   "_format_type": null,
+   "_output_all_columns": false,
+   "_split": null
+ }
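dataset_info.json and state.json are the metadata that the datasets library writes next to the Arrow shard when a dataset is saved to disk; the schema above records a single string column named text. A minimal sketch of how files with this shape are produced (the demo directory name below is illustrative, not part of this commit):

# Sketch: reproduces the on-disk layout shown above (dataset_info.json, state.json,
# data-00000-of-00001.arrow). The output path is an assumption for illustration.
from datasets import Dataset, Features, Value

demo = Dataset.from_dict(
    {"text": ["an example 100-word chunk of Bentham prose"]},
    features=Features({"text": Value("string")}),
)
demo.save_to_disk("./bentham_chunked_demo")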
bentham_text.txt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8fd528752a5520fb4dee602649217bf832484158cedba9faae8a11fd52c6e744
+ size 35431921
bentham_text_sm.txt ADDED
The diff for this file is too large to render. See raw diff
 
finetune_bentham.py ADDED
@@ -0,0 +1,49 @@
+ from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
+ from datasets import load_from_disk
+
+ # Load the pre-chunked Bentham dataset produced by pdf_extract.py.
+ dataset = load_from_disk('bentham_chunked')
+
+ tokenizer = AutoTokenizer.from_pretrained("tinyllama/tinyllama-1.1b-chat-v1.0")
+ #tokenizer.save_pretrained("results/tokenizer")
+
+ model = AutoModelForCausalLM.from_pretrained("tinyllama/tinyllama-1.1b-chat-v1.0")
+
+ # TinyLlama's tokenizer ships without a pad token, so reuse the EOS token for padding.
+ if tokenizer.pad_token is None:
+     print("Tokenizer does not have a pad token set. Setting pad_token to eos_token.")
+     tokenizer.pad_token = tokenizer.eos_token
+
+ def tokenize_function(examples):
+     # Causal-LM fine-tuning: labels are a copy of the input ids (padding positions are not masked out here).
+     tokenized_inputs = tokenizer(examples['text'], padding="max_length", truncation=True, max_length=256)
+     tokenized_inputs["labels"] = tokenized_inputs["input_ids"].copy()
+     return tokenized_inputs
+
+ tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text"])
+ train_test_split = tokenized_dataset.train_test_split(test_size=0.1)
+
+ train_dataset = train_test_split['train']
+ eval_dataset = train_test_split['test']
+ training_args = TrainingArguments(
+     output_dir="./results",
+     num_train_epochs=3,
+     per_device_train_batch_size=1,
+     warmup_steps=500,
+     weight_decay=0.01,
+     logging_dir="./logs",
+     logging_steps=10,
+     save_strategy="steps",
+     save_steps=500,
+     save_total_limit=2,
+     use_cpu=True)
+ trainer = Trainer(
+     model=model,
+     args=training_args,
+     train_dataset=train_dataset,
+     eval_dataset=eval_dataset,
+ )
+ # The call below was adjusted after an earlier run exhausted disk space: training was resumed from a checkpoint and later restarted cleanly.
+ trainer.train()  # resume_from_checkpoint="./results/checkpoint-10000")
+
+ #tokenizer.save_pretrained("./results/")
pdf_extract.py ADDED
@@ -0,0 +1,90 @@
+ from pdfminer.high_level import extract_text
+ import glob
+ import re
+
+ bentham_texts = []
+ def extract_text_from_pdf(pdf_path):
+     text = extract_text(pdf_path)
+     return text
+
+ # Extract the raw text from every Liberty Fund PDF volume.
+ bentham_pdfs = glob.glob('./Bentham*.pdf')
+ for pdf in bentham_pdfs:
+     print(pdf)
+     with open(pdf, 'rb') as f:
+         text = extract_text_from_pdf(f)
+         bentham_texts.append(text)
+
+
+ bentham_text_string = ' '.join(bentham_texts)
+ with open('bentham_text.txt', 'w') as f:
+     f.write(bentham_text_string)
+
+
+
+
+ ## Reload the raw dump and clean it.
+ bentham_text_string = ''
+ with open('bentham_text.txt', 'r') as f:
+     bentham_text_strings = f.readlines()
+     bentham_text_string = ''.join(bentham_text_strings)
+
+ # Strip Liberty Fund boilerplate, URLs, and layout artefacts from the extracted text.
+
+ def clean_text(text):
+     cleaned_text = re.sub(r'§\s*\d+\.', '', text)
+
+     # Step 2: Remove the unwanted patterns
+     # Removing patterns like "PLL v6.0 (generated September, 2011)" and URLs
+     cleaned_text = re.sub(r'\n*PLL v[0-9.]+ \(generated.*?\)\n+.*?\n+http.*?\n.*?Online Library of Liberty:.*?\n', '', cleaned_text, flags=re.DOTALL)
+     cleaned_text = re.sub(r'\n*\s*PLL v[0-9.]+ \(generated.*?\)\s*\n', '', cleaned_text, flags=re.DOTALL)
+     cleaned_text = re.sub(r'https?://\S+', '', cleaned_text)
+
+     # Removing "Online Library of Liberty" lines that might not fit the exact previous pattern
+     cleaned_text = re.sub(r'Online Library of Liberty:.*?\n', '', cleaned_text, flags=re.DOTALL)
+     cleaned_text = re.sub(r'\n\nPLL v[0-9.]+ \(generated.*?\)\n\n.*?\n\nhttp.*?\n', '', cleaned_text, flags=re.DOTALL)
+     cleaned_text = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', cleaned_text)  # Removes non-printable characters
+     cleaned_text = re.sub(r'\\[ntr]', '', cleaned_text)  # Removes escaped special characters like \n, \t, \r
+     patterns_to_remove = [
+         r'^\s*$',  # Empty lines
+         r'^\s*\d+\s*$',  # Standalone numeric lines
+         r'\[Back to Table of Contents\]',  # Specific placeholders
+     ]
+     for pattern in patterns_to_remove:
+         cleaned_text = re.sub(pattern, '', cleaned_text, flags=re.MULTILINE)
+     return cleaned_text
+
+ cleaned_lines = []
+ for line in bentham_text_strings:
+     cleaned_line = clean_text(line)
+     if cleaned_line != '':
+         cleaned_lines.append(cleaned_line)
+
+ def split_into_chunks(text, chunk_size=100):
+     """
+     Split the text into chunks of approximately `chunk_size` words.
+
+     Args:
+         text (str): The input text to split.
+         chunk_size (int): The desired chunk size in words.
+
+     Returns:
+         list of str: A list of text chunks.
+     """
+     # Split the text into words
+     words = text.split()
+
+     # Create chunks of approximately `chunk_size` words
+     chunks = [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
+
+     return chunks
+
+ chunks = split_into_chunks(' '.join(cleaned_lines), 100)
+
+ from datasets import Dataset
+
+ # Wrap the chunks in a Hugging Face Dataset and save it for fine-tuning.
+ data = {'text': chunks}
+ new_dataset = Dataset.from_dict(data)
+ new_dataset.save_to_disk('./bentham_chunked')
+
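A quick way to sanity-check the chunking that pdf_extract.py performs is to reload the saved dataset and look at the chunk count and average length. A sketch only, not part of the commit:

# Sketch: confirm the saved dataset round-trips and that chunks are roughly 100 words.
from datasets import load_from_disk

ds = load_from_disk("./bentham_chunked")
lengths = [len(t.split()) for t in ds["text"]]
print(f"{len(ds)} chunks, avg {sum(lengths) / len(lengths):.1f} words per chunk")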
text_gen.py ADDED
@@ -0,0 +1,33 @@
+ # Generate text from the fine-tuned Bentham checkpoint.
+
+
+ from transformers import AutoTokenizer, AutoModelForCausalLM
+
+
+ # Post-training: load the fine-tuned model weights and the original base tokenizer.
+ model_path = "./results/checkpoint-152000"
+ model = AutoModelForCausalLM.from_pretrained(model_path)
+ tokenizer = AutoTokenizer.from_pretrained("tinyllama/tinyllama-1.1b-chat-v1.0")
+
+
+ input_text = """
+ H: After all that we have gone through, the truth is written literally and not literately. i have gazed navally and looked to the stars above.
+ How would you consider the case of man today amidst all this chaotica?
+
+ B:
+ """
+ input_ids = tokenizer.encode(input_text, return_tensors='pt')
+ output = model.generate(
+     input_ids=input_ids,
+     max_length=1000,
+     num_return_sequences=1,
+     no_repeat_ngram_size=5,
+     temperature=0.9,
+     top_k=50,
+     top_p=0.98,
+     do_sample=True,
+     num_beams=10
+ )
+
+ decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)
+ print(decoded_output)
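In text_gen.py, do_sample=True combined with num_beams=10 selects beam-search multinomial sampling, which keeps ten beams alive for up to 1000 tokens and is slow on CPU. If plain nucleus sampling is sufficient, a lighter call could look like the sketch below (same model and tokenizer; the parameter values are illustrative, not the committed settings):

# Sketch: single-sequence nucleus sampling instead of beam-sample decoding.
output = model.generate(
    input_ids=input_ids,
    max_new_tokens=400,     # bound generated tokens rather than total length
    do_sample=True,
    temperature=0.9,
    top_p=0.95,
    no_repeat_ngram_size=5,
)
print(tokenizer.decode(output[0], skip_special_tokens=True))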