Upload 25 files
- .gitattributes +12 -0
- BenthamVol1.LibertyFund.pdf +3 -0
- BenthamVol10.LibertyFund.pdf +3 -0
- BenthamVol11.LibertyFund.pdf +3 -0
- BenthamVol2.LibertyFund.pdf +3 -0
- BenthamVol3.LibertyFund.pdf +3 -0
- BenthamVol4.LibertyFund.pdf +3 -0
- BenthamVol5.LibertyFund.pdf +3 -0
- BenthamVol6.LibertyFund.pdf +3 -0
- BenthamVol7.LibertyFund.pdf +3 -0
- BenthamVol8.LibertyFund.pdf +3 -0
- BenthamVol9.LibertyFund.pdf +3 -0
- bentham_chunked/cache-5cbdbfe773332528.arrow +3 -0
- bentham_chunked/cache-6dab9c48ee95e985.arrow +3 -0
- bentham_chunked/cache-7a28cb91cfcdeb71.arrow +3 -0
- bentham_chunked/cache-93db310462e229c3.arrow +3 -0
- bentham_chunked/cache-d9fcbe649ebfa850.arrow +3 -0
- bentham_chunked/cache-dd243b4e82321451.arrow +3 -0
- bentham_chunked/data-00000-of-00001.arrow +3 -0
- bentham_chunked/dataset_info.json +12 -0
- bentham_chunked/state.json +13 -0
- bentham_text.txt +3 -0
- bentham_text_sm.txt +0 -0
- finetune_bentham.py +49 -0
- pdf_extract.py +90 -0
- text_gen.py +33 -0
.gitattributes
CHANGED
@@ -34,3 +34,15 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 model.gguf filter=lfs diff=lfs merge=lfs -text
+bentham_text.txt filter=lfs diff=lfs merge=lfs -text
+BenthamVol1.LibertyFund.pdf filter=lfs diff=lfs merge=lfs -text
+BenthamVol10.LibertyFund.pdf filter=lfs diff=lfs merge=lfs -text
+BenthamVol11.LibertyFund.pdf filter=lfs diff=lfs merge=lfs -text
+BenthamVol2.LibertyFund.pdf filter=lfs diff=lfs merge=lfs -text
+BenthamVol3.LibertyFund.pdf filter=lfs diff=lfs merge=lfs -text
+BenthamVol4.LibertyFund.pdf filter=lfs diff=lfs merge=lfs -text
+BenthamVol5.LibertyFund.pdf filter=lfs diff=lfs merge=lfs -text
+BenthamVol6.LibertyFund.pdf filter=lfs diff=lfs merge=lfs -text
+BenthamVol7.LibertyFund.pdf filter=lfs diff=lfs merge=lfs -text
+BenthamVol8.LibertyFund.pdf filter=lfs diff=lfs merge=lfs -text
+BenthamVol9.LibertyFund.pdf filter=lfs diff=lfs merge=lfs -text
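Each pattern added above tells Git LFS to store the matching file in the repository as a small three-line pointer (version, sha256 oid, size), which is exactly what the ADDED diffs below contain for the PDFs and Arrow files. As an illustration only, a minimal Python sketch for reading such a pointer back into a dict; the helper name and example path are hypothetical, not part of this commit:

# Illustrative sketch: parse the three-line Git LFS pointer format shown below.
def read_lfs_pointer(path):
    fields = {}
    with open(path, "r") as f:
        for line in f:
            key, _, value = line.strip().partition(" ")
            fields[key] = value
    return fields

# Hypothetical usage on one of the committed pointer files:
# read_lfs_pointer("BenthamVol1.LibertyFund.pdf")
# -> {'version': 'https://git-lfs.github.com/spec/v1', 'oid': 'sha256:7c6f...', 'size': '4623992'}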
BenthamVol1.LibertyFund.pdf
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7c6fe2e819284fbe21628faae4a7570515da7d2a7aa4a04421d0e3573fb68d82
+size 4623992
BenthamVol10.LibertyFund.pdf
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:401e0be610adc3c49feb971639263c1ab9fc2c79af5050cb4d76cb988cfbc86d
+size 3421267
BenthamVol11.LibertyFund.pdf
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c1c209df71f83dcef64804792358da23beef725366fb098e72f8ac9631f06d00
+size 1045494
BenthamVol2.LibertyFund.pdf
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4d661fde64870b59cc00e0ecc049585b612b2bc96d5845ef60120c401c87204b
+size 3756476
BenthamVol3.LibertyFund.pdf
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5b39d69c5e4848bbec153e74291c723d73e5be07e3ca9efdd1dd04285c1afeb9
+size 4325352
BenthamVol4.LibertyFund.pdf
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be5974483aa1621f647578b3eac9140d7684818e94a258e7cd37961a340d4d90
+size 4533915
BenthamVol5.LibertyFund.pdf
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b99a29acca810e322c6186e5068ab2fc643293e7f4a784e2668cc6ae2206a1d5
+size 4506297
BenthamVol6.LibertyFund.pdf
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3c5d377a1ce60f89dbc47d27551baeacd1a0a402e4c9ad157e0fa8e5fa5ac101
+size 3794279
BenthamVol7.LibertyFund.pdf
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c6bac174be0b15b26587a39472ae63f84f638d0f974ce5dd76791f36fb6be4f2
+size 4135943
BenthamVol8.LibertyFund.pdf
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c5e5f862eed5960a4908b09692d617f7fedacb0e0da4b2bc84a884b76a9c4c0f
+size 4983726
BenthamVol9.LibertyFund.pdf
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d45fb184f6ed0678d923e876e15497bed5997f93aabe8686f05b70e57a9f506c
+size 4814191
bentham_chunked/cache-5cbdbfe773332528.arrow
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b371ef80caf63f5aea26ab76e2ecd79453d826f65aa9bd42825bc87af742dd26
+size 188607104
bentham_chunked/cache-6dab9c48ee95e985.arrow
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fa7b88d9c574aac8d5503bdc9863198ac910e2735d9c1333e8fbaa61f08e0c06
+size 46344
bentham_chunked/cache-7a28cb91cfcdeb71.arrow
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b0704409c709d1499ee3670c941af0263ae5eb383b3fd6455468cf65e7e0d0b3
+size 414168
bentham_chunked/cache-93db310462e229c3.arrow
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:25b8ba600c274704362f1a69ce9c709ec2803649a92c1b2ef128e19000db6294
+size 94654336
bentham_chunked/cache-d9fcbe649ebfa850.arrow
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4557dfea9c2539d8b2102051c2f5bb6c0de473be66e27aea5ff627514a5abf13
+size 414168
bentham_chunked/cache-dd243b4e82321451.arrow
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1d6acc5eb21c051c097de2d2d8aa627651b635ffb95ced31ca88f8212ee09a6a
+size 46344
bentham_chunked/data-00000-of-00001.arrow
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b92bd87e39aded15d180303639f11fdf680bdbcd5ffa832e2bc9c5371e1e43e0
+size 33486400
bentham_chunked/dataset_info.json
ADDED
@@ -0,0 +1,12 @@
+{
+  "citation": "",
+  "description": "",
+  "features": {
+    "text": {
+      "dtype": "string",
+      "_type": "Value"
+    }
+  },
+  "homepage": "",
+  "license": ""
+}
bentham_chunked/state.json
ADDED
@@ -0,0 +1,13 @@
+{
+  "_data_files": [
+    {
+      "filename": "data-00000-of-00001.arrow"
+    }
+  ],
+  "_fingerprint": "cb9daa9a8124efb5",
+  "_format_columns": null,
+  "_format_kwargs": {},
+  "_format_type": null,
+  "_output_all_columns": false,
+  "_split": null
+}
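dataset_info.json and state.json above are the metadata that `Dataset.save_to_disk` writes next to the Arrow data file. A minimal sketch of that round trip, assuming the same `datasets` library used by the scripts below (the demo contents and output path are illustrative):

from datasets import Dataset, load_from_disk

# Illustrative sketch: saving a Dataset produces data-*.arrow, dataset_info.json and state.json,
# the same layout as the bentham_chunked/ files added in this commit.
demo = Dataset.from_dict({"text": ["first chunk of text", "second chunk of text"]})
demo.save_to_disk("./demo_chunked")

reloaded = load_from_disk("./demo_chunked")
print(reloaded)              # features and row count
print(reloaded[0]["text"])   # "first chunk of text"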
bentham_text.txt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8fd528752a5520fb4dee602649217bf832484158cedba9faae8a11fd52c6e744
+size 35431921
bentham_text_sm.txt
ADDED
The diff for this file is too large to render.
finetune_bentham.py
ADDED
@@ -0,0 +1,49 @@
+from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
+from datasets import load_dataset, load_from_disk
+
+dataset = load_from_disk('bentham_chunked')
+
+tokenizer = AutoTokenizer.from_pretrained("tinyllama/tinyllama-1.1b-chat-v1.0")
+#tokenizer.save_pretrained("results/tokenizer")
+
+
+model = AutoModelForCausalLM.from_pretrained("tinyllama/tinyllama-1.1b-chat-v1.0")
+
+if tokenizer.pad_token is None:
+    print("Tokenizer does not have a pad token set. Setting pad_token to eos_token.")
+    tokenizer.pad_token = tokenizer.eos_token
+
+def tokenize_function(examples):
+
+    tokenized_inputs = tokenizer(examples['text'], padding="max_length", truncation=True, max_length=256)
+    tokenized_inputs["labels"] = tokenized_inputs["input_ids"].copy()
+
+    return tokenized_inputs
+
+tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text"])
+train_test_split = tokenized_dataset.train_test_split(test_size=0.1)
+
+train_dataset = train_test_split['train']
+eval_dataset = train_test_split['test']
+training_args = TrainingArguments(
+    output_dir="./results",
+    num_train_epochs=3,
+    per_device_train_batch_size=1,
+    warmup_steps=500,
+    weight_decay=0.01,
+    logging_dir="./logs",
+    logging_steps=10,
+    save_strategy="steps",
+    save_steps=500,
+    save_total_limit=2,
+    use_cpu=True)
+trainer = Trainer(
+    model=model,
+    args=training_args,
+    train_dataset=train_dataset,
+    eval_dataset=eval_dataset,
+)
+#below has been modified because i ran out of disk storage initially so had to resume and adjust the
+trainer.train()#resume_from_checkpoint="./results/checkpoint-10000")
+
+#tokenizer.save_pretrained(".results/")
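The trailing comments in finetune_bentham.py note that training was interrupted (disk full) and later resumed. A hedged sketch of that resume pattern with the Trainer API, assuming `trainer` is built exactly as above; the explicit checkpoint path is illustrative, while `resume_from_checkpoint=True` simply picks the latest checkpoint under output_dir:

# Sketch only: resuming an interrupted Trainer run.
trainer.train(resume_from_checkpoint=True)                          # latest checkpoint in ./results
# or, with an explicit (illustrative) checkpoint directory:
# trainer.train(resume_from_checkpoint="./results/checkpoint-10000")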
pdf_extract.py
ADDED
@@ -0,0 +1,90 @@
+from pdfminer.high_level import extract_text
+
+import re
+bentham_texts = []
+import glob
+def extract_text_from_pdf(pdf_path):
+    text = extract_text(pdf_path)
+    return text
+
+bentham_pdfs = glob.glob('./Bentham*.pdf')
+for pdf in bentham_pdfs:
+    print(pdf)
+    with open(pdf, 'rb') as f:
+        text = extract_text_from_pdf(f)
+        bentham_texts.append(text)
+
+
+
+bentham_text_string = ' '.join(bentham_texts)
+with open('bentham_text.txt', 'w') as f:
+    f.write(bentham_text_string)
+
+
+
+
+##
+bentham_text_string =''
+with open('bentham_text.txt', 'r') as f:
+    bentham_text_strings = f.readlines()
+bentham_text_string = ''.join(bentham_text_strings)
+
+import re
+
+def clean_text(text):
+    cleaned_text = re.sub(r'§\s*\d+\.', '', text)
+
+    # Step 2: Remove the unwanted patterns
+    # Removing patterns like "PLL v6.0 (generated September, 2011)" and URLs
+    cleaned_text = re.sub(r'\n*PLL v[0-9.]+ \(generated.*?\)\n+.*?\n+http.*?\n.*?Online Library of Liberty:.*?\n', '', cleaned_text, flags=re.DOTALL)
+    cleaned_text = re.sub(r'\n*\s*PLL v[0-9.]+ \(generated.*?\)\s*\n', '', cleaned_text, flags=re.DOTALL)
+    cleaned_text = re.sub(r'https?://\S+', '', cleaned_text)
+
+    # Removing "Online Library of Liberty" lines that might not fit the exact previous pattern
+    cleaned_text = re.sub(r'Online Library of Liberty:.*?\n', '', cleaned_text, flags=re.DOTALL)
+    cleaned_text = re.sub(r'\n\nPLL v[0-9.]+ \(generated.*?\)\n\n.*?\n\nhttp.*?\n', '', cleaned_text, flags=re.DOTALL)
+    cleaned_text = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', cleaned_text)  # Removes non-printable characters
+    cleaned_text = re.sub(r'\\[ntr]', '', cleaned_text)  # Removes escaped special characters like \n, \t, \r
+    patterns_to_remove = [
+        r'^\s*$',  # Empty lines
+        r'^\s*\d+\s*$',  # Standalone numeric lines
+        r'\[Back to Table of Contents\]',  # Specific placeholders
+    ]
+    for pattern in patterns_to_remove:
+        cleaned_text = re.sub(pattern, '', cleaned_text, flags=re.MULTILINE)
+    return cleaned_text
+cleaned_lines = []
+for line in bentham_text_strings:
+    cleaned_line = clean_text(line)
+    if cleaned_line != '':
+        cleaned_lines.append(cleaned_line)
+
+
+def split_into_chunks(text, chunk_size=100):
+    """
+    Split the text into chunks of approximately `chunk_size` words.
+
+    Args:
+        text (str): The input text to split.
+        chunk_size (int): The desired chunk size in words.
+
+    Returns:
+        list of str: A list of text chunks.
+    """
+    # Split the text into words
+    words = text.split()
+
+    # Create chunks of approximately `chunk_size` words
+    chunks = [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
+
+    return chunks
+
+chunks = split_into_chunks((' ').join(cleaned_lines), 100)
+
+from datasets import Dataset
+
+# Assuming `chunks` is the list of text chunks you created earlier
+data = {'text': chunks}
+new_dataset = Dataset.from_dict(data)
+new_dataset.save_to_disk('./bentham_chunked')
+
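Before fine-tuning, the chunked dataset that pdf_extract.py writes can be spot-checked. A small sketch, assuming the same ./bentham_chunked path; the checks themselves are illustrative:

from datasets import load_from_disk

# Illustrative sanity check of the dataset saved by pdf_extract.py.
ds = load_from_disk("./bentham_chunked")
print(ds)                                   # features and row count
print(len(ds[0]["text"].split()), "words")  # roughly 100 words per chunk by construction
print(ds[0]["text"][:200])                  # preview of the first chunk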
text_gen.py
ADDED
@@ -0,0 +1,33 @@
+
+
+
+from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
+from datasets import load_dataset, load_from_disk
+
+#post training
+model_path = "./results/checkpoint-152000"
+model = AutoModelForCausalLM.from_pretrained(model_path)
+tokenizer = AutoTokenizer.from_pretrained("tinyllama/tinyllama-1.1b-chat-v1.0")
+
+
+input_text = """
+H: After all that we have gone through, the truth is written literally and not literately. i have gazed navally and looked to the stars above.
+How would you consider the case of man today amidst all this chaotica?
+
+B:
+"""
+input_ids = tokenizer.encode(input_text, return_tensors='pt')
+output = model.generate(
+    input_ids=tokenizer.encode(input_text, return_tensors="pt"),
+    max_length=1000,
+    num_return_sequences=1,
+    no_repeat_ngram_size=5,
+    temperature=0.9,
+    top_k=50,
+    top_p=0.98,
+    do_sample=True,
+    num_beams=10
+)
+
+decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)
+print(decoded_output)
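Note that text_gen.py combines sampling (do_sample=True, temperature, top_k, top_p) with num_beams=10, i.e. beam-search multinomial sampling, and encodes the prompt twice. A hedged variant that encodes once, passes the attention mask, sets pad_token_id, and uses plain nucleus sampling; the parameter values here are illustrative rather than the author's settings:

# Illustrative sketch: same model, tokenizer and input_text as text_gen.py above.
inputs = tokenizer(input_text, return_tensors="pt")
output = model.generate(
    input_ids=inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    max_length=1000,
    do_sample=True,
    temperature=0.9,
    top_p=0.95,
    no_repeat_ngram_size=5,
    pad_token_id=tokenizer.eos_token_id,
)
print(tokenizer.decode(output[0], skip_special_tokens=True))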