SimaFarazi committed
Commit dfe66cd · 1 Parent(s): 605fc97

add data indexing file and required modifications in the other files
.gitignore CHANGED
@@ -1,3 +1,5 @@
 .*
 !/.gitignore
-*.db
+*.db
+code_data
+sources.txt
app_stream_rag/app/chains.py CHANGED
@@ -4,13 +4,22 @@ from prompts import (
     raw_prompt,
     tokenizer,
     raw_prompt_formatted,
-    history_prompt_formatted
+    history_prompt_formatted,
+    standalone_prompt_formatted,
+    rag_prompt_formatted,
+    format_context
 )
 import schemas
 
 from dotenv import load_dotenv
 load_dotenv()
 
+from langchain_core.runnables import RunnablePassthrough
+
+from data_indexing import DataIndexer
+
+data_indexer = DataIndexer()
+
 # Instantiate HuggingFace endpoint with Llama model
 llm = HuggingFaceEndpoint(
     repo_id="meta-llama/Meta-Llama-3-8B-Instruct",
@@ -30,3 +39,22 @@ formatted_chain = (raw_prompt_formatted | llm).with_types(input_type=schemas.Use
 # Create history_chain by piping raw_prompt_formatted and the LLM endpoint.
 history_chain = (history_prompt_formatted | llm).with_types(input_type=schemas.HistoryInput)
 
+# Construct the standalone_chain by piping standalone_prompt_formatted with the LLM
+standalone_chain = (standalone_prompt_formatted | llm).with_types(input_type=schemas.HistoryInput)
+
+
+input_1 = RunnablePassthrough.assign(new_question=standalone_chain)
+input_2 = {
+    'context': lambda x: format_context(data_indexer.search(x['new_question'])),
+    'standalone_question': lambda x: x['new_question']
+}
+input_to_rag_chain = input_1 | input_2
+
+# Use input_to_rag_chain, rag_prompt_formatted,
+# HistoryInput and the LLM to build the rag_chain.
+rag_chain = (input_to_rag_chain | rag_prompt_formatted | llm).with_types(input_type=schemas.HistoryInput)
+
+# TODO: Implement the filtered_rag_chain. It should be the
+# same as the rag_chain but with hybrid_search = True.
+filtered_rag_chain = None
+
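Note on the TODO above: a minimal sketch of what filtered_rag_chain could look like, assuming only that DataIndexer.search keeps the hybrid_search keyword defined in data_indexing.py below. It mirrors rag_chain with the retrieval step switched to hybrid search; it is not part of this commit.

# Sketch only: same wiring as rag_chain, but the retrieval step asks the
# indexer for a hybrid (source-filtered) search.
filtered_input = RunnablePassthrough.assign(new_question=standalone_chain) | {
    'context': lambda x: format_context(
        data_indexer.search(x['new_question'], hybrid_search=True)),
    'standalone_question': lambda x: x['new_question']
}
filtered_rag_chain = (filtered_input | rag_prompt_formatted | llm).with_types(
    input_type=schemas.HistoryInput)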
app_stream_rag/app/data_indexing.py ADDED
@@ -0,0 +1,173 @@
+import os
+from dotenv import load_dotenv, find_dotenv
+load_dotenv(find_dotenv())
+
+import uuid
+from pathlib import Path
+from pinecone.grpc import PineconeGRPC as Pinecone
+from pinecone import ServerlessSpec
+from langchain_openai import OpenAIEmbeddings
+from langchain_community.vectorstores import Chroma
+
+
+current_dir = Path(__file__).resolve().parent
+
+
+class DataIndexer:
+
+    source_file = os.path.join(current_dir, 'sources.txt')
+
+    def __init__(self, index_name='langchain-repo') -> None:
+
+        # Choose your embedding model
+        # Option 1: HuggingFace
+        # self.embedding_client = InferenceClient(
+        #     "dunzhang/stella_en_1.5B_v5",
+        #     token=os.environ['HF_TOKEN'],
+        # )
+        # Option 2: OpenAI
+        self.embedding_client = OpenAIEmbeddings(api_key=os.environ.get('OPENAI_API_KEY'))
+        self.index_name = index_name
+        self.pinecone_client = Pinecone(api_key=os.environ.get('PINECONE_API_KEY'))
+
+
+        # Create your index if it doesn't exist. Use the create_index function.
+        # Make sure to choose the dimension that corresponds to your embedding model.
+        if index_name not in self.pinecone_client.list_indexes().names():
+            self.pinecone_client.create_index(
+                name=index_name,
+                dimension=1536,  # length of the vectors produced by the OpenAI embedding model
+                metric="cosine",  # similarity search in the index is based on cosine similarity
+                spec=ServerlessSpec(
+                    cloud="aws",
+                    region="us-east-1"
+                )
+            )
+        # Instantiate the index attribute of the class
+        self.index = self.pinecone_client.Index(self.index_name)
+        # Instantiate the source index (a Chroma vector store of the indexed file names)
+        self.source_index = self.get_source_index()
+
+    def get_source_index(self):
+        if not os.path.isfile(self.source_file):
+            print('No source file')
+            return None
+
+        print('create source index')
+
+        with open(self.source_file, 'r') as file:
+            sources = file.readlines()
+
+        sources = [s.rstrip('\n') for s in sources]
+        vectorstore = Chroma.from_texts(
+            sources, embedding=self.embedding_client
+        )
+        return vectorstore
+
+    def index_data(self, docs, batch_size=32):
+
+        with open(self.source_file, 'a') as file:
+            for doc in docs:
+                file.writelines(doc.metadata['source'] + '\n')
+
+        for i in range(0, len(docs), batch_size):
+            batch = docs[i: i + batch_size]
+
+            # Create a list of the vector representations of each text data in the batch
+
+            # Choose your embedding model
+            values = self.embedding_client.embed_documents([
+                doc.page_content for doc in batch
+            ])
+
+            # values = self.embedding_client.feature_extraction([
+            #     doc.page_content for doc in batch
+            # ])
+
+            # Create a list of unique identifiers for each element in the batch with the uuid package.
+            vector_ids = [str(uuid.uuid4()) for doc in batch]
+
+            # Create a list of dictionaries representing the metadata. Capture the text data
+            # with the "text" key, and make sure to capture the rest of the doc.metadata.
+            metadatas = [{
+                # Add document content to metadata
+                "text": doc.page_content,
+                **doc.metadata
+            } for doc in batch]
+
+            # Create a list of dictionaries with keys "id" (the unique identifiers), "values"
+            # (the vector representation), and "metadata" (the metadata).
+            vectors = [{
+                'id': vector_id,
+                'values': value,
+                'metadata': metadata
+            } for vector_id, value, metadata in zip(vector_ids, values, metadatas)]
+
+            try:
+                # Use the upsert function to upload the data to the database.
+                upsert_response = self.index.upsert(vectors=vectors)
+                print(upsert_response)
+            except Exception as e:
+                print(e)
+
+    def search(self, text_query, top_k=5, hybrid_search=False):
+
+        filter = None
+        if hybrid_search and self.source_index:
+            # The filtering step pulls the 50 file names most relevant
+            # to the question. Adjust this number as you see fit.
+            source_docs = self.source_index.similarity_search(text_query, 50)
+            filter = {"source": {"$in": [doc.page_content for doc in source_docs]}}
+
+        # Embed the text_query by using the embedding model
+        # Choose your embedding model
+        # vector = self.embedding_client.feature_extraction(text_query)
+        vector = self.embedding_client.embed_query(text_query)
+
+        # Use the vector representation of the text_query to
+        # search the database by using the query function.
+        result = self.index.query(
+            vector=vector,
+            filter=filter,
+            top_k=top_k,
+            include_metadata=True)
+
+        docs = []
+        for res in result["matches"]:
+            # From the result's metadata, extract the "text" element.
+            docs.append(res.metadata['text'])
+
+        return docs
+
+
+if __name__ == '__main__':
+
+    from langchain_community.document_loaders import GitLoader
+    from langchain_text_splitters import (
+        Language,
+        RecursiveCharacterTextSplitter,
+    )
+
+    loader = GitLoader(
+        clone_url="https://github.com/langchain-ai/langchain",
+        repo_path="./code_data/langchain_repo/",
+        branch="master",
+    )
+
+    python_splitter = RecursiveCharacterTextSplitter.from_language(
+        language=Language.PYTHON, chunk_size=10000, chunk_overlap=100
+    )
+
+    docs = loader.load()
+    docs = [doc for doc in docs if doc.metadata['file_type'] in ['.py', '.md']]
+    docs = [doc for doc in docs if len(doc.page_content) < 50000]
+    docs = python_splitter.split_documents(docs)
+    for doc in docs:
+        doc.page_content = '# {}\n\n'.format(doc.metadata['source']) + doc.page_content
+
+    indexer = DataIndexer()
+    # with open('/app/sources.txt', 'a') as file:
+    with open('./sources.txt', 'a') as file:
+        for doc in docs:
+            file.writelines(doc.metadata['source'] + '\n')
+    indexer.index_data(docs)
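For orientation, a minimal usage sketch of the search API added above (the query strings are invented; PINECONE_API_KEY and OPENAI_API_KEY must be set, and sources.txt must exist for the hybrid branch to activate):

# Sketch: querying the 'langchain-repo' index built by data_indexing.py.
from data_indexing import DataIndexer

indexer = DataIndexer(index_name='langchain-repo')

# Plain vector search over the whole index.
chunks = indexer.search("How do I write a custom retriever?", top_k=5)

# Hybrid search: restrict matches to the source files most similar to the query.
filtered_chunks = indexer.search("How do I write a custom retriever?",
                                 top_k=5, hybrid_search=True)
print(len(chunks), len(filtered_chunks))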
app_stream_rag/app/main.py CHANGED
@@ -13,7 +13,8 @@ import schemas
 from chains import (
     simple_chain,
     formatted_chain,
-    history_chain
+    history_chain,
+    rag_chain
 )
 
 import models
@@ -115,6 +116,37 @@ async def history_stream(request: Request, db: Session = Depends(get_db)):
         callbacks=[LogResponseCallback(user_request=user_request, db=db)]
     ))
 
+@app.post("/rag/stream")
+async def rag_stream(request: Request, db: Session = Depends(get_db)):
+    # Receive the request that hit the endpoint
+    data = await request.json()
+    # Parse the request into a user request
+    user_request = schemas.UserRequest(**data['input'])
+    username = user_request.username
+    question = user_request.question
+
+    # Pull the chat history of the user based on the user request
+    user_messages = crud.get_user_chat_history(db, username)
+
+    # Use add_message to add the current question to the user's history
+    message = schemas.MessageBase(
+        message=question,
+        type="user",
+        timestamp=datetime.now()
+    )
+
+    crud.add_message(db, message, username)
+
+    # Create an instance of HistoryInput by using format_chat_history
+    user_chat_history = prompts.format_chat_history(user_messages)
+    history_input = schemas.HistoryInput(question=question, chat_history=user_chat_history)
+
+    # Use the history input within the rag chain
+    return EventSourceResponse(generate_stream(
+        history_input,
+        rag_chain,
+        callbacks=[LogResponseCallback(user_request=user_request, db=db)]
+    ))
 
 if __name__ == "__main__":
     import uvicorn
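A client-side sketch of the new endpoint, following the RemoteRunnable pattern already used in test_components.ipynb; the Space URL is the one from that notebook and the question is invented:

# Sketch: LangServe's RemoteRunnable maps chain.stream() to POST /rag/stream.
from langserve import RemoteRunnable

chain = RemoteRunnable("https://simafarazi-backend-c.hf.space/rag")
stream = chain.stream(input={"question": "Which LangChain class should I subclass for a custom retriever?",
                             "username": "Sima"})
for chunk in stream:
    print(chunk, end="", flush=True)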
app_stream_rag/app/prompts.py CHANGED
@@ -50,6 +50,15 @@ def format_chat_history(messages: List[models.Message]):
         ) for message in ordered_messages
     ])
 
+
+def format_context(docs: List[str]):
+    # The output of DataIndexer.search is a list of texts,
+    # so we need to concatenate that list into one text that can fit into
+    # the rag_prompt_formatted. format_context takes a
+    # list of strings and returns the context as one string.
+    return '\n\n'.join(docs)
+
+
 # Create the history_prompt prompt that will capture the question and the conversation history.
 # The history_prompt needs a {chat_history} placeholder and a {question} placeholder.
 history_prompt: str = """
@@ -63,6 +72,34 @@ helpful answer:
 # Apply format_prompt to create history_prompt_formatted
 history_prompt_formatted: PromptTemplate = format_prompt(history_prompt)
 
+# Create the standalone_prompt that will capture the question and the chat history
+# to generate a standalone question. It needs a {chat_history} placeholder and a {question} placeholder.
+standalone_prompt: str = """
+Given the following conversation and a follow up question, rephrase the
+follow up question to be a standalone question, in its original language.
+
+Chat History:
+{chat_history}
+
+Follow Up Input: {question}
+
+Standalone question:
+"""
+
+# Use format_prompt to create standalone_prompt_formatted
+standalone_prompt_formatted: PromptTemplate = format_prompt(standalone_prompt)
+
+
+# Create the rag_prompt that will capture the context and the standalone question to generate
+# a final answer to the question.
+rag_prompt: str = """
+Answer the question based only on the following context:
+{context}
+Question: {standalone_question}
+"""
+
+# Use format_prompt to create rag_prompt_formatted
+rag_prompt_formatted: PromptTemplate = format_prompt(rag_prompt)
 
 
 
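To make the expected inputs concrete, a small sketch of how the dictionary produced by input_to_rag_chain in chains.py fills the placeholders of rag_prompt (the sample values are invented):

# Sketch: rag_prompt expects exactly the keys produced upstream in chains.py.
sample_input = {
    "context": "# libs/core/README.md\n\nRetrievers return relevant documents for a query...",
    "standalone_question": "How do I write a custom retriever in LangChain?",
}
print(rag_prompt.format(**sample_input))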
 
app_stream_rag/app/test_components.ipynb CHANGED
@@ -441,6 +441,104 @@
     "results.all()"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[]\n"
+     ]
+    }
+   ],
+   "source": [
+    "import requests\n",
+    "url = \"https://simafarazi-backend-c.hf.space/users\"\n",
+    "response = requests.get(url)\n",
+    "print(response.json())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Based on various job market analytics and industry trends, Machine Learning Engineers have a slightly higher demand than Data Engineers. According to Glassdoor, the job search platform, Machine Learning Engineer job postings have increased by 45% in the past two years, while Data Engineer job postings have increased by 30% during the same period.\n",
+      "\n",
+      "Additionally, job search platforms like Indeed and LinkedIn also report a higher demand for Machine Learning Engineers. According to Indeed, Machine Learning Engineer job postings have increased by 50% in the past year, while Data Engineer job postings have increased by 25% during the same period.\n",
+      "\n",
+      "There are several reasons contributing to the higher demand for Machine Learning Engineers, including:\n",
+      "\n",
+      "1. The growing use of AI and machine learning technologies across various industries, leading to an increased need for skilled professionals to develop and deploy these models.\n",
+      "2. The increasing amount of data being generated, which requires more advanced data processing and analysis capabilities, making Data Engineers in high demand.\n",
+      "3. The need for companies to stay competitive and innovative, driving the demand for Machine Learning Engineers who can help them develop cutting-edge solutions.\n",
+      "\n",
+      "However, it's essential to note that both Machine Learning Engineers and Data Engineers are in high demand, and the demand for these roles is expected to continue growing in the coming years."
+     ]
+    }
+   ],
+   "source": [
+    "from langserve import RemoteRunnable\n",
+    "# Hit our endpoint with the specified route\n",
+    "# If we put /simple/stream, it complains, because chain.stream will hit the /simple/stream endpoint\n",
+    "url = \"https://simafarazi-backend-c.hf.space/history\"\n",
+    "chain = RemoteRunnable(url)  # Client for interacting with LangChain runnables that are hosted as LangServe endpoints\n",
+    "stream = chain.stream(input={\"question\": \"Among these 2 jobs which one has higher demand?\",\n",
+    "                             \"username\": \"Sima\"})  # .stream() and .invoke() are standard methods to interact with hosted runnables\n",
+    "\n",
+    "\n",
+    "for chunk in stream:  # Each chunk corresponds to a token/word\n",
+    "    # end=\"\": prints words one after another, not on separate lines\n",
+    "    # flush=True: prints the word to the screen immediately, without any buffering\n",
+    "    print(chunk, end=\"\", flush=True)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "IndentationError",
+     "evalue": "unexpected indent (4236902034.py, line 2)",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;36m Cell \u001b[0;32mIn[1], line 2\u001b[0;36m\u001b[0m\n\u001b[0;31m from langchain_text_splitters import (\u001b[0m\n\u001b[0m ^\u001b[0m\n\u001b[0;31mIndentationError\u001b[0m\u001b[0;31m:\u001b[0m unexpected indent\n"
+     ]
+    }
+   ],
+   "source": [
+    "from langchain_community.document_loaders import GitLoader\n",
+    "from langchain_text_splitters import (\n",
+    "    Language,\n",
+    "    RecursiveCharacterTextSplitter,\n",
+    ")\n",
+    "\n",
+    "loader = GitLoader(\n",
+    "    clone_url=\"https://github.com/langchain-ai/langchain\",\n",
+    "    repo_path=\"./code_data/langchain_repo/\",\n",
+    "    branch=\"master\",\n",
+    ")\n",
+    "\n",
+    "python_splitter = RecursiveCharacterTextSplitter.from_language(\n",
+    "    language=Language.PYTHON, chunk_size=10000, chunk_overlap=100\n",
+    ")\n",
+    "\n",
+    "docs = loader.load()\n",
+    "docs = [doc for doc in docs if doc.metadata['file_type'] in ['.py', '.md']]\n",
+    "docs = [doc for doc in docs if len(doc.page_content) < 50000]\n",
+    "docs = python_splitter.split_documents(docs)\n",
+    "for doc in docs:\n",
+    "    doc.page_content = '# {}\\n\\n'.format(doc.metadata['source']) + doc.page_content"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,