Spaces:

hexml
/

chat-with-pdf

Sleeping

App Files Files Community

JaiSurya committed on May 16, 2024

Commit

9f493b6

1 Parent(s): b06b29e

Initial app setup

Browse files

Files changed (11) hide show

.ipynb_checkpoints/README-checkpoint.md +12 -0
.ipynb_checkpoints/app-checkpoint.py +32 -0
.ipynb_checkpoints/core-checkpoint.py +14 -0
.ipynb_checkpoints/embeddings-checkpoint.py +120 -0
.ipynb_checkpoints/rag-checkpoint.py +91 -0
.ipynb_checkpoints/requirements-checkpoint.txt +9 -0
app.py +30 -61
core.py +14 -0
embeddings.py +120 -0
rag.py +91 -0
requirements.txt +9 -1

.ipynb_checkpoints/README-checkpoint.md ADDED Viewed

	@@ -0,0 +1,12 @@

+---
+title: Chat With Pdf
+emoji: 💬
+colorFrom: yellow
+colorTo: purple
+sdk: gradio
+app_file: app.py
+pinned: false
+license: mit
+---
+An example chatbot using [Gradio](https://gradio.app), [`huggingface_hub`](https://huggingface.co/docs/huggingface_hub/v0.22.2/en/index), and the [Hugging Face Inference API](https://huggingface.co/docs/api-inference/index).

.ipynb_checkpoints/app-checkpoint.py ADDED Viewed

	@@ -0,0 +1,32 @@

+import gradio as gr
+import core
+def process_pdf_and_text(pdf_file_path, user_text):
+    """Gradio callback: index the uploaded PDF on the first call, then answer the query.
+
+    Returns whatever core.process_query returns for user_text.
+    """
+    print(f"[INFO] The pdf file is in the {pdf_file_path}")
+    # The "_called" attribute stored on the function object acts as a run-once flag.
+    # NOTE(review): the flag is never reset, so only the first PDF passed in is ever
+    # processed; later calls with a different file still query the first document.
+    if not hasattr(process_pdf_and_text,"_called"):
+        core.process_pdf(pdf_file_path)
+        process_pdf_and_text._called = True
+    result = core.process_query(user_text)
+    return result
+def main():
+    """Build the Gradio interface (PDF upload + query box -> text answer) and launch it."""
+    # input components
+    pdf_input = gr.File(label="Upload PDF File")
+    text_input = gr.TextArea(label="Enter the query")
+    # output component
+    output_text = gr.TextArea()
+    # app interface: both inputs are passed positionally to process_pdf_and_text
+    demo = gr.Interface(
+        fn=process_pdf_and_text,
+        inputs=[pdf_input, text_input],
+        outputs=output_text,
+        title="Chat With PDF",
+        description="RAG based Chat with pdf"
+    )
+    demo.launch()
+# Start the app only when this file is run as a script.
+if __name__ == "__main__":
+    main()

.ipynb_checkpoints/core-checkpoint.py ADDED Viewed

	@@ -0,0 +1,14 @@

+from embeddings import Embeddings
+from rag import RAG
+# Module-level RAG instance shared by process_pdf and process_query;
+# it stays None until process_pdf has run.
+rag_ = None
+def process_pdf(file:str):
+    """Embed the PDF at `file`, save the embeddings, and (re)create the module-level RAG."""
+    emb = Embeddings(file)
+    # Called without arguments, so the default file name "embeddings.csv" is used;
+    # RAG() reads its embeddings from a file with that same name.
+    emb.save_the_embeddings()
+    global rag_
+    rag_ = RAG()
+def process_query(user_text:str):
+    """Answer `user_text` with the module-level RAG instance and return its output."""
+    # NOTE(review): rag_ is still None if process_pdf has not been called first,
+    # in which case the attribute access below fails.
+    global rag_
+    return rag_.query(user_text)

.ipynb_checkpoints/embeddings-checkpoint.py ADDED Viewed

	@@ -0,0 +1,120 @@

+# This file contains all the functionalities from the pdf extraction to the embeddings
+import os
+import re
+from tqdm import tqdm
+from spacy.lang.en import English
+import fitz
+import pandas as pd
+import torch
+from sentence_transformers import SentenceTransformer
+class Embeddings:
+    """Turn a PDF file into sentence chunks, embed each chunk, and save the result as CSV.
+
+    Call order: open_pdf -> convert_to_chunk -> convert_to_embedds -> save_the_embeddings.
+    """
+    def __init__(self,pdf_file_path : str):
+        """Remember the PDF path, the embedding model name and the torch device."""
+        self.pdf_file_path = pdf_file_path
+        self.embedding_model_name = "all-mpnet-base-v2"
+        self.device = self.get_device()
+    def get_device(self) -> str:
+        """Return 'cuda' when torch reports CUDA as available, otherwise 'cpu'."""
+        device = 'cuda' if torch.cuda.is_available() else 'cpu'
+        return device
+    def text_formatter(self,text : str) -> str:
+        """Replace newline characters with spaces and strip surrounding whitespace."""
+        formatted_text = text.replace('\n',' ').strip()
+        return formatted_text
+    def count_and_split_sentence(self,text : str) -> (int,list[str]):
+        """Split `text` into sentences; return (number of sentences, list of sentence strings)."""
+        # NOTE(review): a new spaCy pipeline is built on every call, and open_pdf
+        # calls this once per page; it could be created once and reused.
+        nlp = English()
+        nlp.add_pipe("sentencizer")
+        list_of_sentences = list(nlp(text).sents)
+        # spaCy yields span objects; convert them to plain strings
+        list_of_sentences = [str(sentence) for sentence in list_of_sentences]
+        return len(list_of_sentences),list_of_sentences
+    def open_pdf(self):
+        """Read the PDF page by page and return a list with one stats/text dict per page."""
+        # NOTE(review): the document is opened but never explicitly closed in this method.
+        doc = fitz.open(self.pdf_file_path)
+        data = []
+        print("[INFO] Converting the pdf into dict dtype")
+        # page_number is the zero-based index produced by enumerate
+        for page_number,page in tqdm(enumerate(doc)):
+            text = page.get_text()
+            text = self.text_formatter(text = text)
+            sentence_count,sentences = self.count_and_split_sentence(text)
+            data.append(
+                {
+                    "page_number" : page_number,
+                    "char_count" : len(text),
+                    "word_count" : len(text.split(" ")),
+                    "sentence_count" : sentence_count,
+                    "token_count" : len(text) / 4,
+                    "sentence" : sentences,
+                    "text" : text
+                }
+            )
+        return data
+    def split_the_array(self,array_list : list,
+                    chunk_length : int) -> list[list[str]]:
+        """Slice `array_list` into consecutive pieces of `chunk_length` items (the last may be shorter)."""
+        return [array_list[i:i+chunk_length] for i in range(0,len(array_list),chunk_length)]
+    def convert_to_chunk(self,chunk_size : int = 10) -> list[dict]:
+        """Group each page's sentences into chunks of `chunk_size` and return one dict per kept chunk."""
+        pages_and_texts = self.open_pdf()
+        pages_and_chunks = []
+        # group each page's sentences into lists of at most chunk_size sentences
+        print("[INFO] Splitting the sentences ")
+        for item in tqdm(pages_and_texts):
+            item["sentence_chunks"] = self.split_the_array(item["sentence"],chunk_size)
+            item["chunk_count"] = len(item["sentence_chunks"])
+        # join every sentence group into one text chunk and keep only the longer ones
+        print("[INFO] Splitting into chunks ")
+        for item in tqdm(pages_and_texts):
+            for chunks in item["sentence_chunks"]:
+                d = {}
+                # sentences are concatenated without a separator; double spaces are collapsed
+                joined_sentence = "".join(chunks).replace("  "," ").strip()
+                joined_sentence = re.sub(r'\.([A-Z])', r'. \1',joined_sentence) # .A -> . A : restore the space after a sentence-ending period
+                # keep the chunk only when its estimated size (chars / 4) exceeds 30, i.e. more than 120 characters
+                if len(joined_sentence) / 4 > 30:
+                    d["page_number"] = item["page_number"]
+                    d["sentence_chunk"] = joined_sentence
+                    # stats
+                    d["char_count"] = len(joined_sentence)
+                    d["word_count"] = len(list(joined_sentence.split(" ")))
+                    d["token_count"] = len(joined_sentence) / 4 # estimate: character count divided by 4
+                    pages_and_chunks.append(d)
+        return pages_and_chunks
+    def convert_to_embedds(self,chunk_size = 10) -> list[dict] :
+        """Chunk the PDF and add an "embeddings" tensor to every chunk dict; return the list."""
+        data = self.convert_to_chunk(chunk_size)
+        embedding_model = SentenceTransformer(model_name_or_path = self.embedding_model_name,device = self.device)
+        print("[INFO] Converting into embeddings ")
+        # chunks are encoded one at a time
+        for item in tqdm(data):
+            item["embeddings"] = embedding_model.encode(item["sentence_chunk"], convert_to_tensor = True)
+        return data
+    def save_the_embeddings(self,filename : str = "embeddings.csv",data : list[dict] = None):
+        """Write `data` (computed via convert_to_embedds when None) to `filename` as CSV."""
+        embedd_file = filename
+        if data is None:
+            data = self.convert_to_embedds()
+        # NOTE(review): the "embeddings" values are tensors, so presumably they are
+        # stored in the CSV as their text form; RAG.get_embeddings parses them
+        # back from a string.
+        dataframe = pd.DataFrame(data)
+        dataframe.to_csv(embedd_file,index = False)

.ipynb_checkpoints/rag-checkpoint.py ADDED Viewed

	@@ -0,0 +1,91 @@

+# this python file contains all steps from the retrieval to generation code
+import torch
+import numpy as np
+import pandas as pd
+from sentence_transformers import SentenceTransformer,util
+from transformers import AutoTokenizer , AutoModelForCausalLM
+class RAG:
+    """Retrieve the most similar PDF chunks for a query and generate an answer with an LLM."""
+    def __init__(self):
+        """Load the saved chunk embeddings, the embedding model, the tokenizer and the LLM."""
+        self.model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
+        self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        self.embedding_model_name = "all-mpnet-base-v2"
+        self.embeddings_filename = "embeddings.csv"
+        # NOTE(review): the same CSV is read twice (once as a DataFrame, once as records).
+        self.data_pd = pd.read_csv(self.embeddings_filename)
+        self.data_dict = pd.read_csv(self.embeddings_filename).to_dict(orient='records')
+        self.data_embeddings = self.get_embeddings()
+        self.embedding_model = SentenceTransformer(model_name_or_path = self.embedding_model_name,device = self.device)
+        # Tokenizer
+        self.tokenizer = AutoTokenizer.from_pretrained(self.model_id)
+        # LLM
+        # NOTE(review): float16 weights are requested even when device is "cpu" — confirm
+        # that generation works in that configuration.
+        self.llm_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=self.model_id,
+                                                         torch_dtype=torch.float16).to(self.device)
+    def get_embeddings(self) -> list:
+        """Parse the "embeddings" column of the CSV and return all rows stacked on self.device.
+
+        NOTE(review): the annotation says `list`, but the value returned is the
+        result of torch.stack(...).to(...).
+        """
+        data_embeddings = []
+        for tensor_str in self.data_pd["embeddings"]:
+            # take the text between the first "[" and the next "]" and read it as comma-separated floats
+            values_str = tensor_str.split("[")[1].split("]")[0]
+            values_list = [float(val) for val in values_str.split(",")]
+            tensor_result = torch.tensor(values_list)
+            data_embeddings.append(tensor_result)
+        data_embeddings = torch.stack(data_embeddings).to(self.device)
+        return data_embeddings
+    def retrieve_relevant_resource(self,user_query : str , k = 5):
+        """Embed `user_query` and return (scores, indices) of the k highest dot-product chunks."""
+        query_embedding = self.embedding_model.encode(user_query, convert_to_tensor = True).to(self.device)
+        dot_score = util.dot_score( a = query_embedding, b = self.data_embeddings)[0]
+        # NOTE(review): presumably this fails when fewer than k chunks were saved — confirm.
+        score , idx = torch.topk(dot_score,k=k)
+        return score,idx
+    def prompt_formatter(self,query: str, context_items: list[dict]) -> str:
+        """
+        Build the LLM prompt: the "sentence_chunk" text of every context item as a
+        "- " bulleted list, followed by the user query, wrapped in the tokenizer's chat template.
+        """
+        # Join context items into one bulleted list
+        context = "- " + "\n- ".join([item["sentence_chunk"] for item in context_items])
+        base_prompt = """Based on the following context items, please answer the query.
+    \nNow use the following context items to answer the user query:
+    {context}
+    \nRelevant passages: <extract relevant passages from the context here>
+    User query: {query}
+    Answer:"""
+        # Update base prompt with context items and query
+        base_prompt = base_prompt.format(context=context, query=query)
+        # Create prompt template for instruction-tuned model
+        dialogue_template = [
+            {"role": "user",
+            "content": base_prompt}
+        ]
+        # Apply the chat template (returned as text, not token ids)
+        prompt = self.tokenizer.apply_chat_template(conversation=dialogue_template,
+                                              tokenize=False,
+                                              add_generation_prompt=True)
+        return prompt
+    def query(self,user_text : str):
+        """Retrieve context for `user_text`, generate up to 256 new tokens and return the answer text."""
+        scores, indices = self.retrieve_relevant_resource(user_text)
+        # look up the chunk records that belong to the retrieved indices
+        context_items = [self.data_dict[i] for i in indices]
+        prompt = self.prompt_formatter(query=user_text,context_items=context_items)
+        input_ids = self.tokenizer(prompt, return_tensors="pt").to(self.device)
+        outputs = self.llm_model.generate(**input_ids,max_new_tokens=256)
+        output_text = self.tokenizer.decode(outputs[0])
+        # keep only the text after the "<|assistant|>" marker and before the first "</s>"
+        output_text = output_text.split("<|assistant|>")
+        output_text = output_text[1].split("</s>")[0]
+        return output_text

.ipynb_checkpoints/requirements-checkpoint.txt ADDED Viewed

	@@ -0,0 +1,9 @@

+numpy
+pandas
+spacy
+tqdm
+PyMuPDF
+torch
+sentence_transformers
+transformers
+gradio

app.py CHANGED Viewed

@@ -1,63 +1,32 @@
 import gradio as gr
-from huggingface_hub import InferenceClient
-"""
-For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
-"""
-client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
-def respond(
-    message,
-    history: list[tuple[str, str]],
-    system_message,
-    max_tokens,
-    temperature,
-    top_p,
-):
-    messages = [{"role": "system", "content": system_message}]
-    for val in history:
-        if val[0]:
-            messages.append({"role": "user", "content": val[0]})
-        if val[1]:
-            messages.append({"role": "assistant", "content": val[1]})
-    messages.append({"role": "user", "content": message})
-    response = ""
-    for message in client.chat_completion(
-        messages,
-        max_tokens=max_tokens,
-        stream=True,
-        temperature=temperature,
-        top_p=top_p,
-    ):
-        token = message.choices[0].delta.content
-        response += token
-        yield response
-"""
-For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
-"""
-demo = gr.ChatInterface(
-    respond,
-    additional_inputs=[
-        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
-        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-        gr.Slider(
-            minimum=0.1,
-            maximum=1.0,
-            value=0.95,
-            step=0.05,
-            label="Top-p (nucleus sampling)",
-        ),
-    ],
-)
 if __name__ == "__main__":
-    demo.launch()

 import gradio as gr
+import core
+def process_pdf_and_text(pdf_file_path, user_text):
+    """Gradio callback: index the uploaded PDF on the first call, then answer the query.
+
+    Returns whatever core.process_query returns for user_text.
+    """
+    print(f"[INFO] The pdf file is in the {pdf_file_path}")
+    # The "_called" attribute stored on the function object acts as a run-once flag.
+    # NOTE(review): the flag is never reset, so only the first PDF passed in is ever
+    # processed; later calls with a different file still query the first document.
+    if not hasattr(process_pdf_and_text,"_called"):
+        core.process_pdf(pdf_file_path)
+        process_pdf_and_text._called = True
+    result = core.process_query(user_text)
+    return result
+def main():
+    """Build the Gradio interface (PDF upload + query box -> text answer) and launch it."""
+    # input components
+    pdf_input = gr.File(label="Upload PDF File")
+    text_input = gr.TextArea(label="Enter the query")
+    # output component
+    output_text = gr.TextArea()
+    # app interface: both inputs are passed positionally to process_pdf_and_text
+    demo = gr.Interface(
+        fn=process_pdf_and_text,
+        inputs=[pdf_input, text_input],
+        outputs=output_text,
+        title="Chat With PDF",
+        description="RAG based Chat with pdf"
+    )
+    demo.launch()
+# Start the app only when this file is run as a script.
 if __name__ == "__main__":
+    main()

core.py ADDED Viewed

	@@ -0,0 +1,14 @@

+from embeddings import Embeddings
+from rag import RAG
+# Module-level RAG instance shared by process_pdf and process_query;
+# it stays None until process_pdf has run.
+rag_ = None
+def process_pdf(file:str):
+    """Embed the PDF at `file`, save the embeddings, and (re)create the module-level RAG."""
+    emb = Embeddings(file)
+    # Called without arguments, so the default file name "embeddings.csv" is used;
+    # RAG() reads its embeddings from a file with that same name.
+    emb.save_the_embeddings()
+    global rag_
+    rag_ = RAG()
+def process_query(user_text:str):
+    """Answer `user_text` with the module-level RAG instance and return its output."""
+    # NOTE(review): rag_ is still None if process_pdf has not been called first,
+    # in which case the attribute access below fails.
+    global rag_
+    return rag_.query(user_text)

embeddings.py ADDED Viewed

	@@ -0,0 +1,120 @@

+# This file contains all the functionalities from the pdf extraction to the embeddings
+import os
+import re
+from tqdm import tqdm
+from spacy.lang.en import English
+import fitz
+import pandas as pd
+import torch
+from sentence_transformers import SentenceTransformer
+class Embeddings:
+    """Turn a PDF file into sentence chunks, embed each chunk, and save the result as CSV.
+
+    Call order: open_pdf -> convert_to_chunk -> convert_to_embedds -> save_the_embeddings.
+    """
+    def __init__(self,pdf_file_path : str):
+        """Remember the PDF path, the embedding model name and the torch device."""
+        self.pdf_file_path = pdf_file_path
+        self.embedding_model_name = "all-mpnet-base-v2"
+        self.device = self.get_device()
+    def get_device(self) -> str:
+        """Return 'cuda' when torch reports CUDA as available, otherwise 'cpu'."""
+        device = 'cuda' if torch.cuda.is_available() else 'cpu'
+        return device
+    def text_formatter(self,text : str) -> str:
+        """Replace newline characters with spaces and strip surrounding whitespace."""
+        formatted_text = text.replace('\n',' ').strip()
+        return formatted_text
+    def count_and_split_sentence(self,text : str) -> (int,list[str]):
+        """Split `text` into sentences; return (number of sentences, list of sentence strings)."""
+        # NOTE(review): a new spaCy pipeline is built on every call, and open_pdf
+        # calls this once per page; it could be created once and reused.
+        nlp = English()
+        nlp.add_pipe("sentencizer")
+        list_of_sentences = list(nlp(text).sents)
+        # spaCy yields span objects; convert them to plain strings
+        list_of_sentences = [str(sentence) for sentence in list_of_sentences]
+        return len(list_of_sentences),list_of_sentences
+    def open_pdf(self):
+        """Read the PDF page by page and return a list with one stats/text dict per page."""
+        # NOTE(review): the document is opened but never explicitly closed in this method.
+        doc = fitz.open(self.pdf_file_path)
+        data = []
+        print("[INFO] Converting the pdf into dict dtype")
+        # page_number is the zero-based index produced by enumerate
+        for page_number,page in tqdm(enumerate(doc)):
+            text = page.get_text()
+            text = self.text_formatter(text = text)
+            sentence_count,sentences = self.count_and_split_sentence(text)
+            data.append(
+                {
+                    "page_number" : page_number,
+                    "char_count" : len(text),
+                    "word_count" : len(text.split(" ")),
+                    "sentence_count" : sentence_count,
+                    "token_count" : len(text) / 4,
+                    "sentence" : sentences,
+                    "text" : text
+                }
+            )
+        return data
+    def split_the_array(self,array_list : list,
+                    chunk_length : int) -> list[list[str]]:
+        """Slice `array_list` into consecutive pieces of `chunk_length` items (the last may be shorter)."""
+        return [array_list[i:i+chunk_length] for i in range(0,len(array_list),chunk_length)]
+    def convert_to_chunk(self,chunk_size : int = 10) -> list[dict]:
+        """Group each page's sentences into chunks of `chunk_size` and return one dict per kept chunk."""
+        pages_and_texts = self.open_pdf()
+        pages_and_chunks = []
+        # group each page's sentences into lists of at most chunk_size sentences
+        print("[INFO] Splitting the sentences ")
+        for item in tqdm(pages_and_texts):
+            item["sentence_chunks"] = self.split_the_array(item["sentence"],chunk_size)
+            item["chunk_count"] = len(item["sentence_chunks"])
+        # join every sentence group into one text chunk and keep only the longer ones
+        print("[INFO] Splitting into chunks ")
+        for item in tqdm(pages_and_texts):
+            for chunks in item["sentence_chunks"]:
+                d = {}
+                # sentences are concatenated without a separator; double spaces are collapsed
+                joined_sentence = "".join(chunks).replace("  "," ").strip()
+                joined_sentence = re.sub(r'\.([A-Z])', r'. \1',joined_sentence) # .A -> . A : restore the space after a sentence-ending period
+                # keep the chunk only when its estimated size (chars / 4) exceeds 30, i.e. more than 120 characters
+                if len(joined_sentence) / 4 > 30:
+                    d["page_number"] = item["page_number"]
+                    d["sentence_chunk"] = joined_sentence
+                    # stats
+                    d["char_count"] = len(joined_sentence)
+                    d["word_count"] = len(list(joined_sentence.split(" ")))
+                    d["token_count"] = len(joined_sentence) / 4 # estimate: character count divided by 4
+                    pages_and_chunks.append(d)
+        return pages_and_chunks
+    def convert_to_embedds(self,chunk_size = 10) -> list[dict] :
+        """Chunk the PDF and add an "embeddings" tensor to every chunk dict; return the list."""
+        data = self.convert_to_chunk(chunk_size)
+        embedding_model = SentenceTransformer(model_name_or_path = self.embedding_model_name,device = self.device)
+        print("[INFO] Converting into embeddings ")
+        # chunks are encoded one at a time
+        for item in tqdm(data):
+            item["embeddings"] = embedding_model.encode(item["sentence_chunk"], convert_to_tensor = True)
+        return data
+    def save_the_embeddings(self,filename : str = "embeddings.csv",data : list[dict] = None):
+        """Write `data` (computed via convert_to_embedds when None) to `filename` as CSV."""
+        embedd_file = filename
+        if data is None:
+            data = self.convert_to_embedds()
+        # NOTE(review): the "embeddings" values are tensors, so presumably they are
+        # stored in the CSV as their text form; RAG.get_embeddings parses them
+        # back from a string.
+        dataframe = pd.DataFrame(data)
+        dataframe.to_csv(embedd_file,index = False)

rag.py ADDED Viewed

	@@ -0,0 +1,91 @@

+# this python file contains all steps from the retrieval to generation code
+import torch
+import numpy as np
+import pandas as pd
+from sentence_transformers import SentenceTransformer,util
+from transformers import AutoTokenizer , AutoModelForCausalLM
+class RAG:
+    """Retrieve the most similar PDF chunks for a query and generate an answer with an LLM."""
+    def __init__(self):
+        """Load the saved chunk embeddings, the embedding model, the tokenizer and the LLM."""
+        self.model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
+        self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        self.embedding_model_name = "all-mpnet-base-v2"
+        self.embeddings_filename = "embeddings.csv"
+        # NOTE(review): the same CSV is read twice (once as a DataFrame, once as records).
+        self.data_pd = pd.read_csv(self.embeddings_filename)
+        self.data_dict = pd.read_csv(self.embeddings_filename).to_dict(orient='records')
+        self.data_embeddings = self.get_embeddings()
+        self.embedding_model = SentenceTransformer(model_name_or_path = self.embedding_model_name,device = self.device)
+        # Tokenizer
+        self.tokenizer = AutoTokenizer.from_pretrained(self.model_id)
+        # LLM
+        # NOTE(review): float16 weights are requested even when device is "cpu" — confirm
+        # that generation works in that configuration.
+        self.llm_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=self.model_id,
+                                                         torch_dtype=torch.float16).to(self.device)
+    def get_embeddings(self) -> list:
+        """Parse the "embeddings" column of the CSV and return all rows stacked on self.device.
+
+        NOTE(review): the annotation says `list`, but the value returned is the
+        result of torch.stack(...).to(...).
+        """
+        data_embeddings = []
+        for tensor_str in self.data_pd["embeddings"]:
+            # take the text between the first "[" and the next "]" and read it as comma-separated floats
+            values_str = tensor_str.split("[")[1].split("]")[0]
+            values_list = [float(val) for val in values_str.split(",")]
+            tensor_result = torch.tensor(values_list)
+            data_embeddings.append(tensor_result)
+        data_embeddings = torch.stack(data_embeddings).to(self.device)
+        return data_embeddings
+    def retrieve_relevant_resource(self,user_query : str , k = 5):
+        """Embed `user_query` and return (scores, indices) of the k highest dot-product chunks."""
+        query_embedding = self.embedding_model.encode(user_query, convert_to_tensor = True).to(self.device)
+        dot_score = util.dot_score( a = query_embedding, b = self.data_embeddings)[0]
+        # NOTE(review): presumably this fails when fewer than k chunks were saved — confirm.
+        score , idx = torch.topk(dot_score,k=k)
+        return score,idx
+    def prompt_formatter(self,query: str, context_items: list[dict]) -> str:
+        """
+        Build the LLM prompt: the "sentence_chunk" text of every context item as a
+        "- " bulleted list, followed by the user query, wrapped in the tokenizer's chat template.
+        """
+        # Join context items into one bulleted list
+        context = "- " + "\n- ".join([item["sentence_chunk"] for item in context_items])
+        base_prompt = """Based on the following context items, please answer the query.
+    \nNow use the following context items to answer the user query:
+    {context}
+    \nRelevant passages: <extract relevant passages from the context here>
+    User query: {query}
+    Answer:"""
+        # Update base prompt with context items and query
+        base_prompt = base_prompt.format(context=context, query=query)
+        # Create prompt template for instruction-tuned model
+        dialogue_template = [
+            {"role": "user",
+            "content": base_prompt}
+        ]
+        # Apply the chat template (returned as text, not token ids)
+        prompt = self.tokenizer.apply_chat_template(conversation=dialogue_template,
+                                              tokenize=False,
+                                              add_generation_prompt=True)
+        return prompt
+    def query(self,user_text : str):
+        """Retrieve context for `user_text`, generate up to 256 new tokens and return the answer text."""
+        scores, indices = self.retrieve_relevant_resource(user_text)
+        # look up the chunk records that belong to the retrieved indices
+        context_items = [self.data_dict[i] for i in indices]
+        prompt = self.prompt_formatter(query=user_text,context_items=context_items)
+        input_ids = self.tokenizer(prompt, return_tensors="pt").to(self.device)
+        outputs = self.llm_model.generate(**input_ids,max_new_tokens=256)
+        output_text = self.tokenizer.decode(outputs[0])
+        # keep only the text after the "<|assistant|>" marker and before the first "</s>"
+        output_text = output_text.split("<|assistant|>")
+        output_text = output_text[1].split("</s>")[0]
+        return output_text

requirements.txt CHANGED Viewed

	@@ -1 +1,9 @@
1	- ~~huggingface_hub==0.22.2~~

+numpy
+pandas
+spacy
+tqdm
+PyMuPDF
+torch
+sentence_transformers
+transformers
+gradio