Spaces: Running on Zero

kwabs22 committed · cd998d9 · 1 Parent(s): 7e4c949

RAG Placeholder demo test

app.py CHANGED
@@ -76,9 +76,120 @@ from sentence_transformers import SentenceTransformer
         # yield response, f"{tokens_per_second:.2f}"
 
 
-# Initialize GPU tensor
-zero = torch.Tensor([0]).cuda()
-print(zero.device) # This will print 'cpu' outside the @spaces.GPU decorated function
+#---------
+#----------
+
+# # Initialize GPU tensor
+# zero = torch.Tensor([0]).cuda()
+# print(zero.device) # This will print 'cpu' outside the @spaces.GPU decorated function
+
+# # Load the embedding model
+# embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
+
+# # Load the Qwen model and tokenizer
+# llmguide_model = AutoModelForCausalLM.from_pretrained(
+#     "Qwen/Qwen2-0.5B-Instruct",
+#     torch_dtype="auto",
+#     device_map="auto"
+# )
+# llmguide_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
+
+# # Sample knowledge base (replace with your own data)
+# knowledge_base = [
+#     "The capital of France is Paris.",
+#     "Python is a popular programming language.",
+#     "Machine learning is a subset of artificial intelligence.",
+#     "The Earth orbits around the Sun.",
+#     "orbits are a group of fans of a music group"
+# ]
+
+# # Create embeddings for the knowledge base
+# knowledge_base_embeddings = embedding_model.encode(knowledge_base)
+
+# def retrieve(query, k=2):
+#     query_embedding = embedding_model.encode([query])
+#     similarities = torch.nn.functional.cosine_similarity(torch.tensor(query_embedding), torch.tensor(knowledge_base_embeddings))
+#     top_k_indices = similarities.argsort(descending=True)[:k]
+#     return [knowledge_base[i] for i in top_k_indices]
+
+# def get_resource_usage():
+#     ram_usage = psutil.virtual_memory().percent
+#     gpu_memory_allocated = torch.cuda.memory_allocated() / (1024 ** 3) # Convert to GB
+#     gpu_memory_reserved = torch.cuda.memory_reserved() / (1024 ** 3) # Convert to GB
+#     return f"RAM Usage: {ram_usage:.2f}%, GPU Memory Allocated: {gpu_memory_allocated:.2f}GB, GPU Memory Reserved: {gpu_memory_reserved:.2f}GB"
+
+# @spaces.GPU
+# def llmguide_generate_response(prompt, stream=False):
+#     print(zero.device) # This will print 'cuda:0' inside the @spaces.GPU decorated function
+
+#     messages = [
+#         {"role": "system", "content": "You are a helpful assistant."},
+#         {"role": "user", "content": prompt}
+#     ]
+#     text = llmguide_tokenizer.apply_chat_template(
+#         messages,
+#         tokenize=False,
+#         add_generation_prompt=True
+#     )
+#     model_inputs = llmguide_tokenizer([text], return_tensors="pt").to(zero.device)
+
+#     start_time = time.time()
+#     total_tokens = 0
+
+#     if stream:
+#         streamer = TextIteratorStreamer(llmguide_tokenizer, skip_special_tokens=True)
+#         generation_kwargs = dict(
+#             model_inputs,
+#             streamer=streamer,
+#             max_new_tokens=512,
+#             temperature=0.7,
+#         )
+#         thread = Thread(target=llmguide_model.generate, kwargs=generation_kwargs)
+#         thread.start()
+
+#         generated_text = ""
+#         for new_text in streamer:
+#             generated_text += new_text
+#             total_tokens += 1
+#             current_time = time.time()
+#             tokens_per_second = total_tokens / (current_time - start_time)
+#             yield generated_text, f"{tokens_per_second:.2f}", ""
+
+#         resource_usage = get_resource_usage()
+#         yield generated_text, f"{tokens_per_second:.2f}", resource_usage
+#     else:
+#         generated_ids = llmguide_model.generate(
+#             model_inputs.input_ids,
+#             max_new_tokens=512
+#         )
+#         generated_ids = [
+#             output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
+#         ]
+#         response = llmguide_tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
+#         total_tokens = len(generated_ids[0])
+#         end_time = time.time()
+#         tokens_per_second = total_tokens / (end_time - start_time)
+#         resource_usage = get_resource_usage()
+#         yield response, f"{tokens_per_second:.2f}", resource_usage
+
+# def rag(query, stream=False):
+#     retrieved_docs = retrieve(query)
+#     context = " ".join(retrieved_docs)
+#     prompt = f"Context: {context}\nQuestion: {query}\nAnswer:"
+
+#     generator = llmguide_generate_response(prompt, stream)
+
+#     if stream:
+#         def stream_output():
+#             for generated_text, tokens_per_second, ram_usage in generator:
+#                 yield generated_text, tokens_per_second, ram_usage
+#         return stream_output()
+#     else:
+#         # For non-streaming, we just need to get the final output
+#         for generated_text, tokens_per_second, ram_usage in generator:
+#             pass # This will iterate to the last yield
+#         return generated_text, tokens_per_second, ram_usage
+
 
 # Load the embedding model
 embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
@@ -93,31 +204,28 @@ llmguide_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
 
 # Sample knowledge base (replace with your own data)
 knowledge_base = [
-    "The capital of France is Paris.",
-    "Python is a popular programming language.",
-    "Machine learning is a subset of artificial intelligence.",
-    "The Earth orbits around the Sun.",
+    {"id": "doc1", "content": "The capital of France is Paris."},
+    {"id": "doc2", "content": "Python is a popular programming language."},
+    {"id": "doc3", "content": "Machine learning is a subset of artificial intelligence."},
+    {"id": "doc4", "content": "The Earth orbits around the Sun."},
+    {"id": "doc5", "content": "orbits is the name of a korean fangroup"},
 ]
 
 # Create embeddings for the knowledge base
-knowledge_base_embeddings = embedding_model.encode(knowledge_base)
+knowledge_base_embeddings = embedding_model.encode([doc["content"] for doc in knowledge_base])
 
 def retrieve(query, k=2):
     query_embedding = embedding_model.encode([query])
     similarities = torch.nn.functional.cosine_similarity(torch.tensor(query_embedding), torch.tensor(knowledge_base_embeddings))
     top_k_indices = similarities.argsort(descending=True)[:k]
-    return [knowledge_base[i] for i in top_k_indices]
+    return [(knowledge_base[i]["content"], knowledge_base[i]["id"]) for i in top_k_indices]
 
-def get_resource_usage():
-    ram_usage = psutil.virtual_memory().percent
-    gpu_memory_allocated = torch.cuda.memory_allocated() / (1024 ** 3) # Convert to GB
-    gpu_memory_reserved = torch.cuda.memory_reserved() / (1024 ** 3) # Convert to GB
-    return f"RAM Usage: {ram_usage:.2f}%, GPU Memory Allocated: {gpu_memory_allocated:.2f}GB, GPU Memory Reserved: {gpu_memory_reserved:.2f}GB"
+def get_ram_usage():
+    ram = psutil.virtual_memory()
+    return f"RAM Usage: {ram.percent:.2f}%, Available: {ram.available / (1024 ** 3):.2f}GB, Total: {ram.total / (1024 ** 3):.2f}GB"
 
 @spaces.GPU
-def llmguide_generate_response(prompt, stream=False):
-    print(zero.device) # This will print 'cuda:0' inside the @spaces.GPU decorated function
-
+def llmguide_generate_response(prompt, doc_ids=None, stream=False):
     messages = [
         {"role": "system", "content": "You are a helpful assistant."},
         {"role": "user", "content": prompt}
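The reworked retrieve() above now returns (content, id) tuples so the interface can report which documents grounded an answer, and get_ram_usage() replaces the GPU-centric resource probe. A minimal sketch of exercising these helpers directly, assuming the module-level embedding model and knowledge base from this hunk are loaded; the query string is illustrative only:

# Sketch (not part of the commit): inspect what the retriever would hand to the prompt.
hits = retrieve("Which programming language is popular for machine learning?", k=2)
for content, doc_id in hits:
    print(f"{doc_id}: {content}")  # likely doc2/doc3 for this query
print(get_ram_usage())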
@@ -127,7 +235,7 @@ def llmguide_generate_response(prompt, stream=False):
         tokenize=False,
         add_generation_prompt=True
     )
-    model_inputs = llmguide_tokenizer([text], return_tensors="pt").to(zero.device)
+    model_inputs = llmguide_tokenizer([text], return_tensors="pt").to(llmguide_model.device)
 
     start_time = time.time()
     total_tokens = 0
@@ -149,10 +257,10 @@ def llmguide_generate_response(prompt, stream=False):
             total_tokens += 1
             current_time = time.time()
             tokens_per_second = total_tokens / (current_time - start_time)
-            yield generated_text, f"{tokens_per_second:.2f}", ""
+            yield generated_text, f"{tokens_per_second:.2f}", "", ", ".join(doc_ids) if doc_ids else "N/A"
 
-        resource_usage = get_resource_usage()
-        yield generated_text, f"{tokens_per_second:.2f}", resource_usage
+        ram_usage = get_ram_usage()
+        yield generated_text, f"{tokens_per_second:.2f}", ram_usage, ", ".join(doc_ids) if doc_ids else "N/A"
     else:
         generated_ids = llmguide_model.generate(
             model_inputs.input_ids,
@@ -165,36 +273,32 @@ def llmguide_generate_response(prompt, stream=False):
         total_tokens = len(generated_ids[0])
         end_time = time.time()
         tokens_per_second = total_tokens / (end_time - start_time)
-        resource_usage = get_resource_usage()
-        yield response, f"{tokens_per_second:.2f}", resource_usage
-
-
-
-
-
-
-
-
-
-
-
-def rag(query, stream=False):
-    retrieved_docs = retrieve(query)
-    context = " ".join(retrieved_docs)
-    prompt = f"Context: {context}\nQuestion: {query}\nAnswer:"
+        ram_usage = get_ram_usage()
+        yield response, f"{tokens_per_second:.2f}", ram_usage, ", ".join(doc_ids) if doc_ids else "N/A"
+
+def process_query(query, use_rag, stream=False):
+    if use_rag:
+        retrieved_docs = retrieve(query)
+        context = " ".join([doc for doc, _ in retrieved_docs])
+        doc_ids = [doc_id for _, doc_id in retrieved_docs]
+        prompt = f"Context: {context}\nQuestion: {query}\nAnswer:"
+    else:
+        prompt = query
+        doc_ids = None
 
-    generator = llmguide_generate_response(prompt, stream)
+    generator = llmguide_generate_response(prompt, doc_ids, stream)
 
     if stream:
         def stream_output():
-            for generated_text, tokens_per_second, ram_usage in generator:
-                yield generated_text, tokens_per_second, ram_usage
+            for generated_text, tokens_per_second, ram_usage, doc_references in generator:
+                yield generated_text, tokens_per_second, ram_usage, doc_references
         return stream_output()
     else:
         # For non-streaming, we just need to get the final output
-        for generated_text, tokens_per_second, ram_usage in generator:
+        for generated_text, tokens_per_second, ram_usage, doc_references in generator:
             pass # This will iterate to the last yield
-        return generated_text, tokens_per_second, ram_usage
+        return generated_text, tokens_per_second, ram_usage, doc_references
+
 
 #--------------------------------------------------------------------------------------------------------------------------------
 
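process_query() replaces the old rag() helper: the RAG/no-RAG decision moves behind a flag, and every yield now carries a fourth element with the referenced document ids. A rough usage sketch of both call modes, assuming the functions above are defined and the Qwen model is actually loadable in the running Space; the queries and printing are illustrative only:

# Sketch (not part of the commit): non-streaming returns one final tuple.
answer, tps, ram, doc_refs = process_query("What is the capital of France?", use_rag=True, stream=False)
print(answer, tps, ram, doc_refs)

# Streaming returns a generator; each step repeats the partial text plus stats.
for partial, tps, ram, doc_refs in process_query("Tell me about Python", use_rag=False, stream=True):
    print(tps, doc_refs)  # doc_refs is "N/A" when RAG is off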
@@ -838,24 +942,43 @@ with gr.Blocks() as demo:
             <div style="width: 20%; text-align: center">HF + Gradio allows for api use so this my prototype tool for tool use test</div>
             </div>""")
     with gr.Accordion("Qwen 0.5B as Space Guide Tests", open=False):
-        gr.HTML("Placeholder for FAQ type - front end as prompt engineering for the first message to force direction of conversion")
         gr.HTML("Placeholder for weak RAG Type - Explanations through an opensource embeddings engine")
+        # gr.Interface(
+        #     fn=rag,
+        #     inputs=[
+        #         gr.Textbox(lines=2, placeholder="Enter your question here..."),
+        #         gr.Checkbox(label="Stream output")
+        #     ],
+        #     outputs=[
+        #         gr.Textbox(label="Generated Response"),
+        #         gr.Textbox(label="Tokens per second"),
+        #         gr.Textbox(label="Resource Usage")
+        #     ],
+        #     title="RAG Q&A System with GPU Acceleration and Resource Monitoring",
+        #     description="Ask a question and get an answer based on the retrieved context. The response is generated using a GPU-accelerated model. Resource usage is logged at the end of generation."
+        # )
+
         gr.Interface(
-            fn=rag,
+            fn=process_query,
            inputs=[
                 gr.Textbox(lines=2, placeholder="Enter your question here..."),
+                gr.Checkbox(label="Use RAG"),
                 gr.Checkbox(label="Stream output")
             ],
             outputs=[
                 gr.Textbox(label="Generated Response"),
                 gr.Textbox(label="Tokens per second"),
-                gr.Textbox(label="Resource Usage")
+                gr.Textbox(label="RAM Usage"),
+                gr.Textbox(label="Referenced Documents")
             ],
-            title="RAG Q&A System with GPU Acceleration and Resource Monitoring",
-            description="Ask a question and get an answer based on the retrieved context. The response is generated using a GPU-accelerated model. Resource usage is logged at the end of generation."
+            title="RAG/Non-RAG Q&A System",
+            description="Ask a question with or without using RAG. The response is generated using a GPU-accelerated model. RAM usage and referenced document IDs (for RAG) are logged."
         )
-        ("Placeholder for
-
+        gr.HTML("Placeholder for FAQ type (merge as buttons on the above interface) - front end as prompt engineering for the first message to force direction of conversion")
+
+
+        gr.HTML("Placeholder for https://huggingface.co/h2oai/h2o-danube3-500m-chat-GGUF as alternative")
+        gr.HTML("Placeholder for qwen 2 72b as alternative use checkbox and gradio client api call")
         gr.Markdown("# Qwen-0.5B-Instruct Language Model")
         gr.Markdown("This demo uses the Qwen-0.5B-Instruct model to generate responses based on your input.")
         gr.HTML("Example prompts: <br>I am writing a story about a chef. please write dishes to appear on the menu. <br>What are the most common decisions that a chef story would include? <br>What are the kinds problems that a chef story would include? <br>What are the kinds of out of reach goals that a chef story would include? <br>Continue this config - Paste any complete block of the config")
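The accordion copy above also flags that the Space is meant to be driven over the Gradio API ("HF + Gradio allows for api use") and sketches a future gradio client call to a larger Qwen. A hedged illustration of what such a call could look like with the gradio_client package; the Space id and api_name below are placeholders rather than values from this commit:

# Illustrative only: driving a Gradio Space over its API.
from gradio_client import Client

client = Client("user/space-name")          # placeholder Space id
result = client.predict(
    "What is the capital of France?",       # question textbox
    True,                                   # "Use RAG" checkbox
    False,                                  # "Stream output" checkbox
    api_name="/predict",                    # placeholder endpoint name
)
print(result)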