SimaFarazi committed
Commit dfe66cd · 1 Parent(s): 605fc97

add data indexing file and required modifications in the other files
.gitignore CHANGED
@@ -1,3 +1,5 @@
 .*
 !/.gitignore
-*.db
+*.db
+code_data
+sources.txt
app_stream_rag/app/chains.py CHANGED
@@ -4,13 +4,22 @@ from prompts import (
     raw_prompt,
     tokenizer,
     raw_prompt_formatted,
-    history_prompt_formatted
+    history_prompt_formatted,
+    standalone_prompt_formatted,
+    rag_prompt_formatted,
+    format_context
 )
 import schemas
 
 from dotenv import load_dotenv
 load_dotenv()
 
+from langchain_core.runnables import RunnablePassthrough
+
+from data_indexing import DataIndexer
+
+data_indexer = DataIndexer()
+
 # Instantiate HuggingFace endpoint with Llama model
 llm = HuggingFaceEndpoint(
     repo_id="meta-llama/Meta-Llama-3-8B-Instruct",
@@ -30,3 +39,22 @@ formatted_chain = (raw_prompt_formatted | llm).with_types(input_type=schemas.Use
 # Create history_chain by piping raw_prompt_formatted and the LLM endpoint.
 history_chain = (history_prompt_formatted | llm).with_types(input_type=schemas.HistoryInput)
 
+# Construct the standalone_chain by piping standalone_prompt_formatted with the LLM
+standalone_chain = (standalone_prompt_formatted | llm).with_types(input_type=schemas.HistoryInput)
+
+
+input_1 = RunnablePassthrough.assign(new_question=standalone_chain)
+input_2 = {
+    'context': lambda x: format_context(data_indexer.search(x['new_question'])),
+    'standalone_question': lambda x: x['new_question']
+}
+input_to_rag_chain = input_1 | input_2
+
+# Use input_to_rag_chain, rag_prompt_formatted,
+# HistoryInput and the LLM to build the rag_chain.
+rag_chain = (input_to_rag_chain | rag_prompt_formatted | llm).with_types(input_type=schemas.HistoryInput)
+
+# TODO: Implement the filtered_rag_chain. It should be the
+# same as the rag_chain but with hybrid_search = True.
+filtered_rag_chain = None
+
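Note on the TODO above: a minimal sketch of what filtered_rag_chain could look like, assuming only that DataIndexer.search keeps the hybrid_search keyword defined in data_indexing.py below. It mirrors rag_chain with the retrieval step switched to hybrid search; it is not part of this commit.

# Sketch only: same wiring as rag_chain, but the retrieval step asks the
# indexer for a hybrid (source-filtered) search.
filtered_input = RunnablePassthrough.assign(new_question=standalone_chain) | {
    'context': lambda x: format_context(
        data_indexer.search(x['new_question'], hybrid_search=True)),
    'standalone_question': lambda x: x['new_question']
}
filtered_rag_chain = (filtered_input | rag_prompt_formatted | llm).with_types(
    input_type=schemas.HistoryInput)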
app_stream_rag/app/data_indexing.py ADDED
@@ -0,0 +1,173 @@
+import os
+from dotenv import load_dotenv, find_dotenv
+load_dotenv(find_dotenv())
+
+import uuid
+from pathlib import Path
+from pinecone.grpc import PineconeGRPC as Pinecone
+from pinecone import ServerlessSpec
+from langchain_openai import OpenAIEmbeddings
+from langchain_community.vectorstores import Chroma
+
+
+current_dir = Path(__file__).resolve().parent
+
+
+class DataIndexer:
+
+    source_file = os.path.join(current_dir, 'sources.txt')
+
+    def __init__(self, index_name='langchain-repo') -> None:
+
+        # Choose your embedding model
+        # Option 1: HuggingFace
+        # self.embedding_client = InferenceClient(
+        #     "dunzhang/stella_en_1.5B_v5",
+        #     token=os.environ['HF_TOKEN'],
+        # )
+        # Option 2: OpenAI
+        self.embedding_client = OpenAIEmbeddings(api_key=os.environ.get('OPENAI_API_KEY'))
+        self.index_name = index_name
+        self.pinecone_client = Pinecone(api_key=os.environ.get('PINECONE_API_KEY'))
+
+
+        # Create your index if it doesn't exist. Use the create_index function.
+        # Make sure to choose the dimension that corresponds to your embedding model.
+        if index_name not in self.pinecone_client.list_indexes().names():
+            self.pinecone_client.create_index(
+                name=index_name,
+                dimension=1536,  # length of the vectors produced by the OpenAI embedding model
+                metric="cosine",  # similarity search in the index is based on cosine similarity
+                spec=ServerlessSpec(
+                    cloud="aws",
+                    region="us-east-1"
+                )
+            )
+        # Instantiate the index attribute of the class
+        self.index = self.pinecone_client.Index(self.index_name)
+        # Instantiate the source index (a Chroma vector store of the indexed file names)
+        self.source_index = self.get_source_index()
+
+    def get_source_index(self):
+        if not os.path.isfile(self.source_file):
+            print('No source file')
+            return None
+
+        print('create source index')
+
+        with open(self.source_file, 'r') as file:
+            sources = file.readlines()
+
+        sources = [s.rstrip('\n') for s in sources]
+        vectorstore = Chroma.from_texts(
+            sources, embedding=self.embedding_client
+        )
+        return vectorstore
+
+    def index_data(self, docs, batch_size=32):
+
+        with open(self.source_file, 'a') as file:
+            for doc in docs:
+                file.writelines(doc.metadata['source'] + '\n')
+
+        for i in range(0, len(docs), batch_size):
+            batch = docs[i: i + batch_size]
+
+            # Create a list of the vector representations of each text data in the batch
+
+            # Choose your embedding model
+            values = self.embedding_client.embed_documents([
+                doc.page_content for doc in batch
+            ])
+
+            # values = self.embedding_client.feature_extraction([
+            #     doc.page_content for doc in batch
+            # ])
+
+            # Create a list of unique identifiers for each element in the batch with the uuid package.
+            vector_ids = [str(uuid.uuid4()) for doc in batch]
+
+            # Create a list of dictionaries representing the metadata. Capture the text data
+            # with the "text" key, and make sure to capture the rest of the doc.metadata.
+            metadatas = [{
+                # Add document content to metadata
+                "text": doc.page_content,
+                **doc.metadata
+            } for doc in batch]
+
+            # Create a list of dictionaries with keys "id" (the unique identifiers), "values"
+            # (the vector representation), and "metadata" (the metadata).
+            vectors = [{
+                'id': vector_id,
+                'values': value,
+                'metadata': metadata
+            } for vector_id, value, metadata in zip(vector_ids, values, metadatas)]
+
+            try:
+                # Use the upsert function to upload the data to the database.
+                upsert_response = self.index.upsert(vectors=vectors)
+                print(upsert_response)
+            except Exception as e:
+                print(e)
+
+    def search(self, text_query, top_k=5, hybrid_search=False):
+
+        filter = None
+        if hybrid_search and self.source_index:
+            # The filtering step pulls the 50 file names most relevant
+            # to the question. Adjust this number as you see fit.
+            source_docs = self.source_index.similarity_search(text_query, 50)
+            filter = {"source": {"$in": [doc.page_content for doc in source_docs]}}
+
+        # Embed the text_query by using the embedding model
+        # Choose your embedding model
+        # vector = self.embedding_client.feature_extraction(text_query)
+        vector = self.embedding_client.embed_query(text_query)
+
+        # Use the vector representation of the text_query to
+        # search the database by using the query function.
+        result = self.index.query(
+            vector=vector,
+            filter=filter,
+            top_k=top_k,
+            include_metadata=True)
+
+        docs = []
+        for res in result["matches"]:
+            # From the result's metadata, extract the "text" element.
+            docs.append(res.metadata['text'])
+
+        return docs
+
+
+if __name__ == '__main__':
+
+    from langchain_community.document_loaders import GitLoader
+    from langchain_text_splitters import (
+        Language,
+        RecursiveCharacterTextSplitter,
+    )
+
+    loader = GitLoader(
+        clone_url="https://github.com/langchain-ai/langchain",
+        repo_path="./code_data/langchain_repo/",
+        branch="master",
+    )
+
+    python_splitter = RecursiveCharacterTextSplitter.from_language(
+        language=Language.PYTHON, chunk_size=10000, chunk_overlap=100
+    )
+
+    docs = loader.load()
+    docs = [doc for doc in docs if doc.metadata['file_type'] in ['.py', '.md']]
+    docs = [doc for doc in docs if len(doc.page_content) < 50000]
+    docs = python_splitter.split_documents(docs)
+    for doc in docs:
+        doc.page_content = '# {}\n\n'.format(doc.metadata['source']) + doc.page_content
+
+    indexer = DataIndexer()
+    # with open('/app/sources.txt', 'a') as file:
+    with open('./sources.txt', 'a') as file:
+        for doc in docs:
+            file.writelines(doc.metadata['source'] + '\n')
+    indexer.index_data(docs)
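For orientation, a minimal usage sketch of the search API added above (the query strings are invented; PINECONE_API_KEY and OPENAI_API_KEY must be set, and sources.txt must exist for the hybrid branch to activate):

# Sketch: querying the 'langchain-repo' index built by data_indexing.py.
from data_indexing import DataIndexer

indexer = DataIndexer(index_name='langchain-repo')

# Plain vector search over the whole index.
chunks = indexer.search("How do I write a custom retriever?", top_k=5)

# Hybrid search: restrict matches to the source files most similar to the query.
filtered_chunks = indexer.search("How do I write a custom retriever?",
                                 top_k=5, hybrid_search=True)
print(len(chunks), len(filtered_chunks))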
app_stream_rag/app/main.py CHANGED
@@ -13,7 +13,8 @@ import schemas
 from chains import (
     simple_chain,
     formatted_chain,
-    history_chain
+    history_chain,
+    rag_chain
 )
 
 import models
@@ -115,6 +116,37 @@ async def history_stream(request: Request, db: Session = Depends(get_db)):
         callbacks=[LogResponseCallback(user_request=user_request, db=db)]
     ))
 
+@app.post("/rag/stream")
+async def rag_stream(request: Request, db: Session = Depends(get_db)):
+    # Receive the request that hit the endpoint
+    data = await request.json()
+    # Parse the request into a user request
+    user_request = schemas.UserRequest(**data['input'])
+    username = user_request.username
+    question = user_request.question
+
+    # Pull the chat history of the user based on the user request
+    user_messages = crud.get_user_chat_history(db, username)
+
+    # Use add_message to add the current question to the user's history
+    message = schemas.MessageBase(
+        message=question,
+        type="user",
+        timestamp=datetime.now()
+    )
+
+    crud.add_message(db, message, username)
+
+    # Create an instance of HistoryInput by using format_chat_history
+    user_chat_history = prompts.format_chat_history(user_messages)
+    history_input = schemas.HistoryInput(question=question, chat_history=user_chat_history)
+
+    # Use the history input within the rag chain
+    return EventSourceResponse(generate_stream(
+        history_input,
+        rag_chain,
+        callbacks=[LogResponseCallback(user_request=user_request, db=db)]
+    ))
 
 if __name__ == "__main__":
     import uvicorn
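A client-side sketch of the new endpoint, following the RemoteRunnable pattern already used in test_components.ipynb; the Space URL is the one from that notebook and the question is invented:

# Sketch: LangServe's RemoteRunnable maps chain.stream() to POST /rag/stream.
from langserve import RemoteRunnable

chain = RemoteRunnable("https://simafarazi-backend-c.hf.space/rag")
stream = chain.stream(input={"question": "Which LangChain class should I subclass for a custom retriever?",
                             "username": "Sima"})
for chunk in stream:
    print(chunk, end="", flush=True)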
app_stream_rag/app/prompts.py CHANGED
@@ -50,6 +50,15 @@ def format_chat_history(messages: List[models.Message]):
         ) for message in ordered_messages
     ])
 
+
+def format_context(docs: List[str]):
+    # The output of DataIndexer.search is a list of texts,
+    # so we need to concatenate that list into one text that can fit into
+    # the rag_prompt_formatted. format_context takes a
+    # list of strings and returns the context as one string.
+    return '\n\n'.join(docs)
+
+
 # Create the history_prompt prompt that will capture the question and the conversation history.
 # The history_prompt needs a {chat_history} placeholder and a {question} placeholder.
 history_prompt: str = """
@@ -63,6 +72,34 @@ helpful answer:
 # Apply format_prompt to create history_prompt_formatted
 history_prompt_formatted: PromptTemplate = format_prompt(history_prompt)
 
+# Create the standalone_prompt that will capture the question and the chat history
+# to generate a standalone question. It needs a {chat_history} placeholder and a {question} placeholder.
+standalone_prompt: str = """
+Given the following conversation and a follow up question, rephrase the
+follow up question to be a standalone question, in its original language.
+
+Chat History:
+{chat_history}
+
+Follow Up Input: {question}
+
+Standalone question:
+"""
+
+# Use format_prompt to create standalone_prompt_formatted
+standalone_prompt_formatted: PromptTemplate = format_prompt(standalone_prompt)
+
+
+# Create the rag_prompt that will capture the context and the standalone question to generate
+# a final answer to the question.
+rag_prompt: str = """
+Answer the question based only on the following context:
+{context}
+Question: {standalone_question}
+"""
+
+# Use format_prompt to create rag_prompt_formatted
+rag_prompt_formatted: PromptTemplate = format_prompt(rag_prompt)
 
 
 
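To make the expected inputs concrete, a small sketch of how the dictionary produced by input_to_rag_chain in chains.py fills the placeholders of rag_prompt (the sample values are invented):

# Sketch: rag_prompt expects exactly the keys produced upstream in chains.py.
sample_input = {
    "context": "# libs/core/README.md\n\nRetrievers return relevant documents for a query...",
    "standalone_question": "How do I write a custom retriever in LangChain?",
}
print(rag_prompt.format(**sample_input))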
 
app_stream_rag/app/test_components.ipynb CHANGED
@@ -441,6 +441,104 @@
     "results.all()"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[]\n"
+     ]
+    }
+   ],
+   "source": [
+    "import requests\n",
+    "url = \"https://simafarazi-backend-c.hf.space/users\"\n",
+    "response = requests.get(url)\n",
+    "print(response.json())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Based on various job market analytics and industry trends, Machine Learning Engineers have a slightly higher demand than Data Engineers. According to Glassdoor, the job search platform, Machine Learning Engineer job postings have increased by 45% in the past two years, while Data Engineer job postings have increased by 30% during the same period.\n",
+      "\n",
+      "Additionally, job search platforms like Indeed and LinkedIn also report a higher demand for Machine Learning Engineers. According to Indeed, Machine Learning Engineer job postings have increased by 50% in the past year, while Data Engineer job postings have increased by 25% during the same period.\n",
+      "\n",
+      "There are several reasons contributing to the higher demand for Machine Learning Engineers, including:\n",
+      "\n",
+      "1. The growing use of AI and machine learning technologies across various industries, leading to an increased need for skilled professionals to develop and deploy these models.\n",
+      "2. The increasing amount of data being generated, which requires more advanced data processing and analysis capabilities, making Data Engineers in high demand.\n",
+      "3. The need for companies to stay competitive and innovative, driving the demand for Machine Learning Engineers who can help them develop cutting-edge solutions.\n",
+      "\n",
+      "However, it's essential to note that both Machine Learning Engineers and Data Engineers are in high demand, and the demand for these roles is expected to continue growing in the coming years."
+     ]
+    }
+   ],
+   "source": [
+    "from langserve import RemoteRunnable\n",
+    "# Hit our endpoint with the specified route\n",
+    "# If we put /simple/stream, it complains, because chain.stream will hit the /simple/stream endpoint\n",
+    "url = \"https://simafarazi-backend-c.hf.space/history\"\n",
+    "chain = RemoteRunnable(url)  # Client for interacting with LangChain runnables that are hosted as LangServe endpoints\n",
+    "stream = chain.stream(input={\"question\": \"Among these 2 jobs which one has higher demand?\",\n",
+    "                             \"username\": \"Sima\"})  # .stream() and .invoke() are standard methods to interact with hosted runnables\n",
+    "\n",
+    "\n",
+    "for chunk in stream:  # Each chunk corresponds to a token/word\n",
+    "    # end=\"\": prints words one after another, not on separate lines\n",
+    "    # flush=True: prints the word to the screen immediately, without any buffering\n",
+    "    print(chunk, end=\"\", flush=True)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "IndentationError",
+     "evalue": "unexpected indent (4236902034.py, line 2)",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;36m Cell \u001b[0;32mIn[1], line 2\u001b[0;36m\u001b[0m\n\u001b[0;31m from langchain_text_splitters import (\u001b[0m\n\u001b[0m ^\u001b[0m\n\u001b[0;31mIndentationError\u001b[0m\u001b[0;31m:\u001b[0m unexpected indent\n"
+     ]
+    }
+   ],
+   "source": [
+    "from langchain_community.document_loaders import GitLoader\n",
+    "from langchain_text_splitters import (\n",
+    "    Language,\n",
+    "    RecursiveCharacterTextSplitter,\n",
+    ")\n",
+    "\n",
+    "loader = GitLoader(\n",
+    "    clone_url=\"https://github.com/langchain-ai/langchain\",\n",
+    "    repo_path=\"./code_data/langchain_repo/\",\n",
+    "    branch=\"master\",\n",
+    ")\n",
+    "\n",
+    "python_splitter = RecursiveCharacterTextSplitter.from_language(\n",
+    "    language=Language.PYTHON, chunk_size=10000, chunk_overlap=100\n",
+    ")\n",
+    "\n",
+    "docs = loader.load()\n",
+    "docs = [doc for doc in docs if doc.metadata['file_type'] in ['.py', '.md']]\n",
+    "docs = [doc for doc in docs if len(doc.page_content) < 50000]\n",
+    "docs = python_splitter.split_documents(docs)\n",
+    "for doc in docs:\n",
+    "    doc.page_content = '# {}\\n\\n'.format(doc.metadata['source']) + doc.page_content"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,