caug37 committed
Commit c6bb4cd · verified · 1 Parent(s): 08caf02

Upload 25 files

.gitattributes CHANGED
@@ -34,3 +34,15 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 model.gguf filter=lfs diff=lfs merge=lfs -text
+ bentham_text.txt filter=lfs diff=lfs merge=lfs -text
+ BenthamVol1.LibertyFund.pdf filter=lfs diff=lfs merge=lfs -text
+ BenthamVol10.LibertyFund.pdf filter=lfs diff=lfs merge=lfs -text
+ BenthamVol11.LibertyFund.pdf filter=lfs diff=lfs merge=lfs -text
+ BenthamVol2.LibertyFund.pdf filter=lfs diff=lfs merge=lfs -text
+ BenthamVol3.LibertyFund.pdf filter=lfs diff=lfs merge=lfs -text
+ BenthamVol4.LibertyFund.pdf filter=lfs diff=lfs merge=lfs -text
+ BenthamVol5.LibertyFund.pdf filter=lfs diff=lfs merge=lfs -text
+ BenthamVol6.LibertyFund.pdf filter=lfs diff=lfs merge=lfs -text
+ BenthamVol7.LibertyFund.pdf filter=lfs diff=lfs merge=lfs -text
+ BenthamVol8.LibertyFund.pdf filter=lfs diff=lfs merge=lfs -text
+ BenthamVol9.LibertyFund.pdf filter=lfs diff=lfs merge=lfs -text
BenthamVol1.LibertyFund.pdf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7c6fe2e819284fbe21628faae4a7570515da7d2a7aa4a04421d0e3573fb68d82
+ size 4623992
BenthamVol10.LibertyFund.pdf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:401e0be610adc3c49feb971639263c1ab9fc2c79af5050cb4d76cb988cfbc86d
+ size 3421267
BenthamVol11.LibertyFund.pdf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c1c209df71f83dcef64804792358da23beef725366fb098e72f8ac9631f06d00
+ size 1045494
BenthamVol2.LibertyFund.pdf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4d661fde64870b59cc00e0ecc049585b612b2bc96d5845ef60120c401c87204b
+ size 3756476
BenthamVol3.LibertyFund.pdf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5b39d69c5e4848bbec153e74291c723d73e5be07e3ca9efdd1dd04285c1afeb9
+ size 4325352
BenthamVol4.LibertyFund.pdf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:be5974483aa1621f647578b3eac9140d7684818e94a258e7cd37961a340d4d90
+ size 4533915
BenthamVol5.LibertyFund.pdf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b99a29acca810e322c6186e5068ab2fc643293e7f4a784e2668cc6ae2206a1d5
+ size 4506297
BenthamVol6.LibertyFund.pdf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3c5d377a1ce60f89dbc47d27551baeacd1a0a402e4c9ad157e0fa8e5fa5ac101
+ size 3794279
BenthamVol7.LibertyFund.pdf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c6bac174be0b15b26587a39472ae63f84f638d0f974ce5dd76791f36fb6be4f2
+ size 4135943
BenthamVol8.LibertyFund.pdf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c5e5f862eed5960a4908b09692d617f7fedacb0e0da4b2bc84a884b76a9c4c0f
+ size 4983726
BenthamVol9.LibertyFund.pdf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d45fb184f6ed0678d923e876e15497bed5997f93aabe8686f05b70e57a9f506c
+ size 4814191
bentham_chunked/cache-5cbdbfe773332528.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b371ef80caf63f5aea26ab76e2ecd79453d826f65aa9bd42825bc87af742dd26
+ size 188607104
bentham_chunked/cache-6dab9c48ee95e985.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fa7b88d9c574aac8d5503bdc9863198ac910e2735d9c1333e8fbaa61f08e0c06
+ size 46344
bentham_chunked/cache-7a28cb91cfcdeb71.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b0704409c709d1499ee3670c941af0263ae5eb383b3fd6455468cf65e7e0d0b3
+ size 414168
bentham_chunked/cache-93db310462e229c3.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:25b8ba600c274704362f1a69ce9c709ec2803649a92c1b2ef128e19000db6294
+ size 94654336
bentham_chunked/cache-d9fcbe649ebfa850.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4557dfea9c2539d8b2102051c2f5bb6c0de473be66e27aea5ff627514a5abf13
+ size 414168
bentham_chunked/cache-dd243b4e82321451.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1d6acc5eb21c051c097de2d2d8aa627651b635ffb95ced31ca88f8212ee09a6a
+ size 46344
bentham_chunked/data-00000-of-00001.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b92bd87e39aded15d180303639f11fdf680bdbcd5ffa832e2bc9c5371e1e43e0
+ size 33486400
bentham_chunked/dataset_info.json ADDED
@@ -0,0 +1,12 @@
+ {
+   "citation": "",
+   "description": "",
+   "features": {
+     "text": {
+       "dtype": "string",
+       "_type": "Value"
+     }
+   },
+   "homepage": "",
+   "license": ""
+ }
bentham_chunked/state.json ADDED
@@ -0,0 +1,13 @@
+ {
+   "_data_files": [
+     {
+       "filename": "data-00000-of-00001.arrow"
+     }
+   ],
+   "_fingerprint": "cb9daa9a8124efb5",
+   "_format_columns": null,
+   "_format_kwargs": {},
+   "_format_type": null,
+   "_output_all_columns": false,
+   "_split": null
+ }
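dataset_info.json and state.json are the metadata that the datasets library writes next to the Arrow shard when a dataset is saved to disk; the schema above records a single string column named text. A minimal sketch of how files with this shape are produced (the demo directory name below is illustrative, not part of this commit):

# Sketch: reproduces the on-disk layout shown above (dataset_info.json, state.json,
# data-00000-of-00001.arrow). The output path is an assumption for illustration.
from datasets import Dataset, Features, Value

demo = Dataset.from_dict(
    {"text": ["an example 100-word chunk of Bentham prose"]},
    features=Features({"text": Value("string")}),
)
demo.save_to_disk("./bentham_chunked_demo")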
bentham_text.txt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8fd528752a5520fb4dee602649217bf832484158cedba9faae8a11fd52c6e744
+ size 35431921
bentham_text_sm.txt ADDED
The diff for this file is too large to render. See raw diff
 
finetune_bentham.py ADDED
@@ -0,0 +1,49 @@
+ from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
+ from datasets import load_from_disk
+
+ # Load the pre-chunked Bentham dataset produced by pdf_extract.py.
+ dataset = load_from_disk('bentham_chunked')
+
+ tokenizer = AutoTokenizer.from_pretrained("tinyllama/tinyllama-1.1b-chat-v1.0")
+ #tokenizer.save_pretrained("results/tokenizer")
+
+ model = AutoModelForCausalLM.from_pretrained("tinyllama/tinyllama-1.1b-chat-v1.0")
+
+ # TinyLlama's tokenizer ships without a pad token, so reuse the EOS token for padding.
+ if tokenizer.pad_token is None:
+     print("Tokenizer does not have a pad token set. Setting pad_token to eos_token.")
+     tokenizer.pad_token = tokenizer.eos_token
+
+ def tokenize_function(examples):
+     # Causal-LM fine-tuning: labels are a copy of the input ids (padding positions are not masked out here).
+     tokenized_inputs = tokenizer(examples['text'], padding="max_length", truncation=True, max_length=256)
+     tokenized_inputs["labels"] = tokenized_inputs["input_ids"].copy()
+     return tokenized_inputs
+
+ tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text"])
+ train_test_split = tokenized_dataset.train_test_split(test_size=0.1)
+
+ train_dataset = train_test_split['train']
+ eval_dataset = train_test_split['test']
+ training_args = TrainingArguments(
+     output_dir="./results",
+     num_train_epochs=3,
+     per_device_train_batch_size=1,
+     warmup_steps=500,
+     weight_decay=0.01,
+     logging_dir="./logs",
+     logging_steps=10,
+     save_strategy="steps",
+     save_steps=500,
+     save_total_limit=2,
+     use_cpu=True)
+ trainer = Trainer(
+     model=model,
+     args=training_args,
+     train_dataset=train_dataset,
+     eval_dataset=eval_dataset,
+ )
+ # The call below was adjusted after an earlier run exhausted disk space: training was resumed from a checkpoint and later restarted cleanly.
+ trainer.train()  # resume_from_checkpoint="./results/checkpoint-10000")
+
+ #tokenizer.save_pretrained("./results/")
pdf_extract.py ADDED
@@ -0,0 +1,90 @@
+ from pdfminer.high_level import extract_text
+ import glob
+ import re
+
+ bentham_texts = []
+ def extract_text_from_pdf(pdf_path):
+     text = extract_text(pdf_path)
+     return text
+
+ # Extract the raw text from every Liberty Fund PDF volume.
+ bentham_pdfs = glob.glob('./Bentham*.pdf')
+ for pdf in bentham_pdfs:
+     print(pdf)
+     with open(pdf, 'rb') as f:
+         text = extract_text_from_pdf(f)
+         bentham_texts.append(text)
+
+
+ bentham_text_string = ' '.join(bentham_texts)
+ with open('bentham_text.txt', 'w') as f:
+     f.write(bentham_text_string)
+
+
+
+
+ ## Reload the raw dump and clean it.
+ bentham_text_string = ''
+ with open('bentham_text.txt', 'r') as f:
+     bentham_text_strings = f.readlines()
+     bentham_text_string = ''.join(bentham_text_strings)
+
+ # Strip Liberty Fund boilerplate, URLs, and layout artefacts from the extracted text.
+
+ def clean_text(text):
+     cleaned_text = re.sub(r'§\s*\d+\.', '', text)
+
+     # Step 2: Remove the unwanted patterns
+     # Removing patterns like "PLL v6.0 (generated September, 2011)" and URLs
+     cleaned_text = re.sub(r'\n*PLL v[0-9.]+ \(generated.*?\)\n+.*?\n+http.*?\n.*?Online Library of Liberty:.*?\n', '', cleaned_text, flags=re.DOTALL)
+     cleaned_text = re.sub(r'\n*\s*PLL v[0-9.]+ \(generated.*?\)\s*\n', '', cleaned_text, flags=re.DOTALL)
+     cleaned_text = re.sub(r'https?://\S+', '', cleaned_text)
+
+     # Removing "Online Library of Liberty" lines that might not fit the exact previous pattern
+     cleaned_text = re.sub(r'Online Library of Liberty:.*?\n', '', cleaned_text, flags=re.DOTALL)
+     cleaned_text = re.sub(r'\n\nPLL v[0-9.]+ \(generated.*?\)\n\n.*?\n\nhttp.*?\n', '', cleaned_text, flags=re.DOTALL)
+     cleaned_text = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', cleaned_text)  # Removes non-printable characters
+     cleaned_text = re.sub(r'\\[ntr]', '', cleaned_text)  # Removes escaped special characters like \n, \t, \r
+     patterns_to_remove = [
+         r'^\s*$',  # Empty lines
+         r'^\s*\d+\s*$',  # Standalone numeric lines
+         r'\[Back to Table of Contents\]',  # Specific placeholders
+     ]
+     for pattern in patterns_to_remove:
+         cleaned_text = re.sub(pattern, '', cleaned_text, flags=re.MULTILINE)
+     return cleaned_text
+
+ cleaned_lines = []
+ for line in bentham_text_strings:
+     cleaned_line = clean_text(line)
+     if cleaned_line != '':
+         cleaned_lines.append(cleaned_line)
+
+ def split_into_chunks(text, chunk_size=100):
+     """
+     Split the text into chunks of approximately `chunk_size` words.
+
+     Args:
+         text (str): The input text to split.
+         chunk_size (int): The desired chunk size in words.
+
+     Returns:
+         list of str: A list of text chunks.
+     """
+     # Split the text into words
+     words = text.split()
+
+     # Create chunks of approximately `chunk_size` words
+     chunks = [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
+
+     return chunks
+
+ chunks = split_into_chunks(' '.join(cleaned_lines), 100)
+
+ from datasets import Dataset
+
+ # Wrap the chunks in a Hugging Face Dataset and save it for fine-tuning.
+ data = {'text': chunks}
+ new_dataset = Dataset.from_dict(data)
+ new_dataset.save_to_disk('./bentham_chunked')
+
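A quick way to sanity-check the chunking that pdf_extract.py performs is to reload the saved dataset and look at the chunk count and average length. A sketch only, not part of the commit:

# Sketch: confirm the saved dataset round-trips and that chunks are roughly 100 words.
from datasets import load_from_disk

ds = load_from_disk("./bentham_chunked")
lengths = [len(t.split()) for t in ds["text"]]
print(f"{len(ds)} chunks, avg {sum(lengths) / len(lengths):.1f} words per chunk")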
text_gen.py ADDED
@@ -0,0 +1,33 @@
+ # Generate text from the fine-tuned Bentham checkpoint.
+
+
+ from transformers import AutoTokenizer, AutoModelForCausalLM
+
+
+ # Post-training: load the fine-tuned model weights and the original base tokenizer.
+ model_path = "./results/checkpoint-152000"
+ model = AutoModelForCausalLM.from_pretrained(model_path)
+ tokenizer = AutoTokenizer.from_pretrained("tinyllama/tinyllama-1.1b-chat-v1.0")
+
+
+ input_text = """
+ H: After all that we have gone through, the truth is written literally and not literately. i have gazed navally and looked to the stars above.
+ How would you consider the case of man today amidst all this chaotica?
+
+ B:
+ """
+ input_ids = tokenizer.encode(input_text, return_tensors='pt')
+ output = model.generate(
+     input_ids=input_ids,
+     max_length=1000,
+     num_return_sequences=1,
+     no_repeat_ngram_size=5,
+     temperature=0.9,
+     top_k=50,
+     top_p=0.98,
+     do_sample=True,
+     num_beams=10
+ )
+
+ decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)
+ print(decoded_output)
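In text_gen.py, do_sample=True combined with num_beams=10 selects beam-search multinomial sampling, which keeps ten beams alive for up to 1000 tokens and is slow on CPU. If plain nucleus sampling is sufficient, a lighter call could look like the sketch below (same model and tokenizer; the parameter values are illustrative, not the committed settings):

# Sketch: single-sequence nucleus sampling instead of beam-sample decoding.
output = model.generate(
    input_ids=input_ids,
    max_new_tokens=400,     # bound generated tokens rather than total length
    do_sample=True,
    temperature=0.9,
    top_p=0.95,
    no_repeat_ngram_size=5,
)
print(tokenizer.decode(output[0], skip_special_tokens=True))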