Commit dfe66cd (parent: 605fc97)

add data indexing file and required modifications in the other files

Files changed:
- .gitignore                                 +3   -1
- app_stream_rag/app/chains.py               +29  -1
- app_stream_rag/app/data_indexing.py        +173 -0
- app_stream_rag/app/main.py                 +33  -1
- app_stream_rag/app/prompts.py              +37  -0
- app_stream_rag/app/test_components.ipynb   +98  -0
.gitignore
CHANGED
@@ -1,3 +1,5 @@
 .*
 !/.gitignore
-*.db
+*.db
+code_data
+sources.txt
app_stream_rag/app/chains.py
CHANGED
@@ -4,13 +4,22 @@ from prompts import (
     raw_prompt,
     tokenizer,
     raw_prompt_formatted,
-    history_prompt_formatted
+    history_prompt_formatted,
+    standalone_prompt_formatted,
+    rag_prompt_formatted,
+    format_context
 )
 import schemas
 
 from dotenv import load_dotenv
 load_dotenv()
 
+from langchain_core.runnables import RunnablePassthrough
+
+from data_indexing import DataIndexer
+
+data_indexer = DataIndexer()
+
 # Instantiate HuggingFace endpoint with Llama model
 llm = HuggingFaceEndpoint(
     repo_id="meta-llama/Meta-Llama-3-8B-Instruct",
@@ -30,3 +39,22 @@ formatted_chain = (raw_prompt_formatted | llm).with_types(input_type=schemas.Use
 # Create history_chain by piping raw_prompt_formatted and the LLM endpoint.
 history_chain = (history_prompt_formatted | llm).with_types(input_type=schemas.HistoryInput)
 
+# Construct the standalone_chain by piping standalone_prompt_formatted with the LLM
+standalone_chain = (standalone_prompt_formatted | llm).with_types(input_type=schemas.HistoryInput)
+
+
+input_1 = RunnablePassthrough.assign(new_question=standalone_chain)
+input_2 = {
+    'context': lambda x: format_context(data_indexer.search(x['new_question'])),
+    'standalone_question': lambda x: x['new_question']
+}
+input_to_rag_chain = input_1 | input_2
+
+# Use input_to_rag_chain, rag_prompt_formatted,
+# HistoryInput and the LLM to build the rag_chain.
+rag_chain = (input_to_rag_chain | rag_prompt_formatted | llm).with_types(input_type=schemas.HistoryInput)
+
+# TODO: Implement the filtered_rag_chain. It should be the
+# same as the rag_chain but with hybrid_search = True.
+filtered_rag_chain = None
+
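Note on the TODO above: filtered_rag_chain is left as None in this commit. A minimal sketch of how it could be completed, reusing the names already defined in chains.py and the hybrid_search flag that DataIndexer.search exposes below (an illustration, not part of the commit):

# Hypothetical completion of the TODO, not part of this commit: the wiring is
# identical to rag_chain, but the retrieval step calls DataIndexer.search with
# hybrid_search=True so results are filtered to the most relevant source files.
filtered_input = {
    'context': lambda x: format_context(
        data_indexer.search(x['new_question'], hybrid_search=True)),
    'standalone_question': lambda x: x['new_question']
}
filtered_rag_chain = (
    RunnablePassthrough.assign(new_question=standalone_chain)
    | filtered_input
    | rag_prompt_formatted
    | llm
).with_types(input_type=schemas.HistoryInput)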
app_stream_rag/app/data_indexing.py
ADDED
@@ -0,0 +1,173 @@
+import os
+from dotenv import load_dotenv, find_dotenv
+load_dotenv(find_dotenv())
+
+import uuid
+from pathlib import Path
+from pinecone.grpc import PineconeGRPC as Pinecone
+from pinecone import ServerlessSpec
+from langchain_openai import OpenAIEmbeddings
+from langchain_community.vectorstores import Chroma
+
+
+current_dir = Path(__file__).resolve().parent
+
+
+class DataIndexer:
+
+    source_file = os.path.join(current_dir, 'sources.txt')
+
+    def __init__(self, index_name='langchain-repo') -> None:
+
+        # Choose your embedding model
+        # Option 1: HuggingFace
+        # self.embedding_client = InferenceClient(
+        #     "dunzhang/stella_en_1.5B_v5",
+        #     token=os.environ['HF_TOKEN'],
+        # )
+        # Option 2: OpenAI
+        self.embedding_client = OpenAIEmbeddings(api_key=os.environ.get('OPENAI_API_KEY'))
+        self.index_name = index_name
+        self.pinecone_client = Pinecone(api_key=os.environ.get('PINECONE_API_KEY'))
+
+
+        # Create your index if it doesn't exist. Use the create_index function.
+        # Make sure to choose the dimension that corresponds to your embedding model.
+        if index_name not in self.pinecone_client.list_indexes().names():
+            self.pinecone_client.create_index(
+                name=index_name,
+                dimension=1536,  # length of the embedded vectors created by the OpenAI embedding model
+                metric="cosine",  # similarity search in the database will be based on cosine similarity
+                spec=ServerlessSpec(
+                    cloud="aws",
+                    region="us-east-1"
+                )
+            )
+        # Instantiate index attribute in the class
+        self.index = self.pinecone_client.Index(self.index_name)
+        # Instantiate the source index (a Chroma vector store over the indexed file names)
+        self.source_index = self.get_source_index()
+
+    def get_source_index(self):
+        if not os.path.isfile(self.source_file):
+            print('No source file')
+            return None
+
+        print('create source index')
+
+        with open(self.source_file, 'r') as file:
+            sources = file.readlines()
+
+        sources = [s.rstrip('\n') for s in sources]
+        vectorstore = Chroma.from_texts(
+            sources, embedding=self.embedding_client
+        )
+        return vectorstore
+
+    def index_data(self, docs, batch_size=32):
+
+        with open(self.source_file, 'a') as file:
+            for doc in docs:
+                file.writelines(doc.metadata['source'] + '\n')
+
+        for i in range(0, len(docs), batch_size):
+            batch = docs[i: i + batch_size]
+
+            # Create a list of the vector representations of each text data in the batch
+
+            # Choose your embedding model
+            values = self.embedding_client.embed_documents([
+                doc.page_content for doc in batch
+            ])
+
+            # values = self.embedding_client.feature_extraction([
+            #     doc.page_content for doc in batch
+            # ])
+
+            # Create a list of unique identifiers for each element in the batch with the uuid package.
+            vector_ids = [str(uuid.uuid4()) for doc in batch]
+
+            # Create a list of dictionaries representing the metadata. Capture the text data
+            # with the "text" key, and make sure to capture the rest of the doc.metadata.
+            metadatas = [{
+                # Add document content to metadata
+                "text": doc.page_content,
+                **doc.metadata
+            } for doc in batch]
+
+            # Create a list of dictionaries with keys "id" (the unique identifiers), "values"
+            # (the vector representation), and "metadata" (the metadata).
+            vectors = [{
+                'id': vector_id,
+                'values': value,
+                'metadata': metadata
+            } for vector_id, value, metadata in zip(vector_ids, values, metadatas)]
+
+            try:
+                # Use the upsert function to upload the data to the database.
+                upsert_response = self.index.upsert(vectors=vectors)
+                print(upsert_response)
+            except Exception as e:
+                print(e)
+
+    def search(self, text_query, top_k=5, hybrid_search=False):
+
+        filter = None
+        if hybrid_search and self.source_index:
+            # I implemented the filtering process to pull the 50 most relevant file names
+            # for the question. Make sure to adjust this number as you see fit.
+            source_docs = self.source_index.similarity_search(text_query, 50)
+            filter = {"source": {"$in": [doc.page_content for doc in source_docs]}}
+
+        # Embed the text_query by using the embedding model
+        # Choose your embedding model
+        # vector = self.embedding_client.feature_extraction(text_query)
+        vector = self.embedding_client.embed_query(text_query)
+
+        # Use the vector representation of the text_query to
+        # search the database by using the query function.
+        result = self.index.query(
+            vector=vector,
+            filter=filter,
+            top_k=top_k,
+            include_metadata=True)
+
+        docs = []
+        for res in result["matches"]:
+            # From the result's metadata, extract the "text" element.
+            docs.append(res.metadata.text)
+
+        return docs
+
+
+if __name__ == '__main__':
+
+    from langchain_community.document_loaders import GitLoader
+    from langchain_text_splitters import (
+        Language,
+        RecursiveCharacterTextSplitter,
+    )
+
+    loader = GitLoader(
+        clone_url="https://github.com/langchain-ai/langchain",
+        repo_path="./code_data/langchain_repo/",
+        branch="master",
+    )
+
+    python_splitter = RecursiveCharacterTextSplitter.from_language(
+        language=Language.PYTHON, chunk_size=10000, chunk_overlap=100
+    )
+
+    docs = loader.load()
+    docs = [doc for doc in docs if doc.metadata['file_type'] in ['.py', '.md']]
+    docs = [doc for doc in docs if len(doc.page_content) < 50000]
+    docs = python_splitter.split_documents(docs)
+    for doc in docs:
+        doc.page_content = '# {}\n\n'.format(doc.metadata['source']) + doc.page_content
+
+    indexer = DataIndexer()
+    # with open('/app/sources.txt', 'a') as file:
+    with open('./sources.txt', 'a') as file:
+        for doc in docs:
+            file.writelines(doc.metadata['source'] + '\n')
+    indexer.index_data(docs)
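For reference, a minimal usage sketch of the DataIndexer class added above (hypothetical, not part of the commit; it assumes OPENAI_API_KEY and PINECONE_API_KEY are set and that the index has already been populated by the __main__ block):

from data_indexing import DataIndexer

indexer = DataIndexer()  # connects to (or creates) the 'langchain-repo' Pinecone index

# Plain vector search over the indexed chunks
chunks = indexer.search("How do I write a custom retriever?", top_k=3)

# Hybrid search: first narrow the query to the most relevant source files via
# the Chroma source index, then run the filtered Pinecone query
filtered_chunks = indexer.search("How do I write a custom retriever?",
                                 top_k=3, hybrid_search=True)

for chunk in chunks:
    print(chunk[:200])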
app_stream_rag/app/main.py
CHANGED
@@ -13,7 +13,8 @@ import schemas
 from chains import (
     simple_chain,
     formatted_chain,
-    history_chain
+    history_chain,
+    rag_chain
 )
 
 import models
@@ -115,6 +116,37 @@ async def history_stream(request: Request, db: Session = Depends(get_db)):
         callbacks=[LogResponseCallback(user_request=user_request, db=db)]
     ))
 
+@app.post("/rag/stream")
+async def rag_stream(request: Request, db: Session = Depends(get_db)):
+    # Receive the request that hit the endpoint
+    data = await request.json()
+    # Parse the request into a user request
+    user_request = schemas.UserRequest(**data['input'])
+    username = user_request.username
+    question = user_request.question
+
+    # Pull the chat history of the user based on the user request
+    user_messages = crud.get_user_chat_history(db, username)
+
+    # Use add_message to add the current question as part of the user history
+    message = schemas.MessageBase(
+        message=question,
+        type="user",
+        timestamp=datetime.now()
+    )
+
+    crud.add_message(db, message, username)
+
+    # Create an instance of HistoryInput by using format_chat_history
+    user_chat_history = prompts.format_chat_history(user_messages)
+    history_input = schemas.HistoryInput(question=question, chat_history=user_chat_history)
+
+    # Use the history input within the rag chain
+    return EventSourceResponse(generate_stream(
+        history_input,
+        rag_chain,
+        callbacks=[LogResponseCallback(user_request=user_request, db=db)]
+    ))
 
 if __name__ == "__main__":
     import uvicorn
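The new /rag/stream endpoint can be exercised the same way the notebook below calls /history. A hypothetical client sketch (not part of the commit; the space URL is the one used in the notebook, and the question is only an example):

from langserve import RemoteRunnable

# RemoteRunnable appends /stream to the base path, so pointing it at /rag makes
# chain.stream() hit the /rag/stream endpoint defined above.
url = "https://simafarazi-backend-c.hf.space/rag"
chain = RemoteRunnable(url)

stream = chain.stream(input={"question": "How do I add memory to a LangChain chain?",
                             "username": "Sima"})
for chunk in stream:
    print(chunk, end="", flush=True)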
app_stream_rag/app/prompts.py
CHANGED
@@ -50,6 +50,15 @@ def format_chat_history(messages: List[models.Message]):
     ) for message in ordered_messages
 ])
 
+
+def format_context(docs: List[str]):
+    # Output of DataIndexer.search is a list of texts,
+    # so we need to concatenate that list into a text that can fit into
+    # the rag_prompt_formatted. Implement format_context so that it takes a
+    # list of strings and returns the context as one string.
+    return '\n\n'.join(docs)
+
+
 # Create the history_prompt prompt that will capture the question and the conversation history.
 # The history_prompt needs a {chat_history} placeholder and a {question} placeholder.
 history_prompt: str = """
@@ -63,6 +72,34 @@ helpful answer:
 # Apply format_prompt to create history_prompt_formatted
 history_prompt_formatted: PromptTemplate = format_prompt(history_prompt)
 
+# Create the standalone_prompt prompt that will capture the question and the chat history
+# to generate a standalone question. It needs a {chat_history} placeholder and a {question} placeholder.
+standalone_prompt: str = """
+Given the following conversation and a follow up question, rephrase the
+follow up question to be a standalone question, in its original language.
+
+Chat History:
+{chat_history}
+
+Follow Up Input: {question}
+
+Standalone question:
+"""
+
+# Use format_prompt to create standalone_prompt_formatted
+standalone_prompt_formatted: PromptTemplate = format_prompt(standalone_prompt)
+
+
+# Create the rag_prompt that will capture the context and the standalone question to generate
+# a final answer to the question.
+rag_prompt: str = """
+Answer the question based only on the following context:
+{context}
+Question: {standalone_question}
+"""
+
+# Use format_prompt to create rag_prompt_formatted
+rag_prompt_formatted: PromptTemplate = format_prompt(rag_prompt)
 
 
 
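To see how format_context and rag_prompt fit together, here is a small illustration (hypothetical, not part of the commit; format_prompt itself is not shown in this diff, so PromptTemplate.from_template is used only as a stand-in to show the placeholder substitution):

from langchain_core.prompts import PromptTemplate

# What DataIndexer.search returns: a list of chunk texts (contents made up here)
docs = ["# docs/modules/chains.md\n\nChains combine runnables into pipelines...",
        "# libs/core/langchain_core/runnables/base.py\n\nclass Runnable: ..."]
context = '\n\n'.join(docs)  # equivalent to format_context(docs)

# Stand-in only: the commit's format_prompt helper builds the actual template.
template = PromptTemplate.from_template(
    "Answer the question based only on the following context:\n"
    "{context}\n"
    "Question: {standalone_question}\n"
)
print(template.format(context=context,
                      standalone_question="What is a Runnable in LangChain?"))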
app_stream_rag/app/test_components.ipynb
CHANGED
@@ -441,6 +441,104 @@
     "results.all()"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[]\n"
+     ]
+    }
+   ],
+   "source": [
+    "import requests\n",
+    "url = \"https://simafarazi-backend-c.hf.space/users\"\n",
+    "response = requests.get(url)\n",
+    "print(response.json())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Based on various job market analytics and industry trends, Machine Learning Engineers have a slightly higher demand than Data Engineers. According to Glassdoor, the job search platform, Machine Learning Engineer job postings have increased by 45% in the past two years, while Data Engineer job postings have increased by 30% during the same period.\n",
+      "\n",
+      "Additionally, job search platforms like Indeed and LinkedIn also report a higher demand for Machine Learning Engineers. According to Indeed, Machine Learning Engineer job postings have increased by 50% in the past year, while Data Engineer job postings have increased by 25% during the same period.\n",
+      "\n",
+      "There are several reasons contributing to the higher demand for Machine Learning Engineers, including:\n",
+      "\n",
+      "1. The growing use of AI and machine learning technologies across various industries, leading to an increased need for skilled professionals to develop and deploy these models.\n",
+      "2. The increasing amount of data being generated, which requires more advanced data processing and analysis capabilities, making Data Engineers in high demand.\n",
+      "3. The need for companies to stay competitive and innovative, driving the demand for Machine Learning Engineers who can help them develop cutting-edge solutions.\n",
+      "\n",
+      "However, it's essential to note that both Machine Learning Engineers and Data Engineers are in high demand, and the demand for these roles is expected to continue growing in the coming years."
+     ]
+    }
+   ],
+   "source": [
+    "from langserve import RemoteRunnable\n",
+    "# Hit our endpoint with the specified route\n",
+    "# If we put /simple/stream, it complains; because chain.stream will hit /simple/stream endpoint\n",
+    "url = \"https://simafarazi-backend-c.hf.space/history\"\n",
+    "chain = RemoteRunnable(url)  # Client for interacting with LangChain runnables that are hosted as LangServe endpoints\n",
+    "stream = chain.stream(input={\"question\": \"Among these 2 jobs which one has higher demand?\",\n",
+    "                             \"username\": \"Sima\"})  # .stream() and .invoke() are standard methods to interact with hosted runnables\n",
+    "\n",
+    "\n",
+    "for chunk in stream:  # Each chunk corresponds to a token/word\n",
+    "    # end=\"\": prints words one after another, not on separate lines\n",
+    "    # flush=True: prints the word to the screen immediately, without any buffer\n",
+    "    print(chunk, end=\"\", flush=True)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "IndentationError",
+     "evalue": "unexpected indent (4236902034.py, line 2)",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;36m Cell \u001b[0;32mIn[1], line 2\u001b[0;36m\u001b[0m\n\u001b[0;31m from langchain_text_splitters import (\u001b[0m\n\u001b[0m ^\u001b[0m\n\u001b[0;31mIndentationError\u001b[0m\u001b[0;31m:\u001b[0m unexpected indent\n"
+     ]
+    }
+   ],
+   "source": [
+    "from langchain_community.document_loaders import GitLoader\n",
+    "from langchain_text_splitters import (\n",
+    "    Language,\n",
+    "    RecursiveCharacterTextSplitter,\n",
+    ")\n",
+    "\n",
+    "loader = GitLoader(\n",
+    "    clone_url=\"https://github.com/langchain-ai/langchain\",\n",
+    "    repo_path=\"./code_data/langchain_repo/\",\n",
+    "    branch=\"master\",\n",
+    ")\n",
+    "\n",
+    "python_splitter = RecursiveCharacterTextSplitter.from_language(\n",
+    "    language=Language.PYTHON, chunk_size=10000, chunk_overlap=100\n",
+    ")\n",
+    "\n",
+    "docs = loader.load()\n",
+    "docs = [doc for doc in docs if doc.metadata['file_type'] in ['.py', '.md']]\n",
+    "docs = [doc for doc in docs if len(doc.page_content) < 50000]\n",
+    "docs = python_splitter.split_documents(docs)\n",
+    "for doc in docs:\n",
+    "    doc.page_content = '# {}\\n\\n'.format(doc.metadata['source']) + doc.page_content"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
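The notebook cells above test the /users and /history endpoints. A hypothetical extra cell to exercise the new rag_chain directly (not part of the commit; it assumes the Pinecone index is populated and the API keys are available in the local environment):

# Hypothetical local test of the new RAG chain, not a cell in this commit.
from chains import rag_chain

stream = rag_chain.stream({
    "question": "How do I write a custom output parser?",
    "chat_history": ""
})
for chunk in stream:
    print(chunk, end="", flush=True)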