kwabs22 committed on
Commit cd998d9 · 1 Parent(s): 7e4c949

RAG Placeholder demo test

Files changed (1):
  1. app.py +173 -50
app.py CHANGED
@@ -76,9 +76,120 @@ from sentence_transformers import SentenceTransformer
 # yield response, f"{tokens_per_second:.2f}"


-# Initialize GPU tensor
-zero = torch.Tensor([0]).cuda()
-print(zero.device) # This will print 'cpu' outside the @spaces.GPU decorated function
+#---------
+#----------
+
+# # Initialize GPU tensor
+# zero = torch.Tensor([0]).cuda()
+# print(zero.device) # This will print 'cpu' outside the @spaces.GPU decorated function
+
+# # Load the embedding model
+# embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
+
+# # Load the Qwen model and tokenizer
+# llmguide_model = AutoModelForCausalLM.from_pretrained(
+#     "Qwen/Qwen2-0.5B-Instruct",
+#     torch_dtype="auto",
+#     device_map="auto"
+# )
+# llmguide_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
+
+# # Sample knowledge base (replace with your own data)
+# knowledge_base = [
+#     "The capital of France is Paris.",
+#     "Python is a popular programming language.",
+#     "Machine learning is a subset of artificial intelligence.",
+#     "The Earth orbits around the Sun.",
+#     "orbits are a group of fans of a music group"
+# ]
+
+# # Create embeddings for the knowledge base
+# knowledge_base_embeddings = embedding_model.encode(knowledge_base)
+
+# def retrieve(query, k=2):
+#     query_embedding = embedding_model.encode([query])
+#     similarities = torch.nn.functional.cosine_similarity(torch.tensor(query_embedding), torch.tensor(knowledge_base_embeddings))
+#     top_k_indices = similarities.argsort(descending=True)[:k]
+#     return [knowledge_base[i] for i in top_k_indices]
+
+# def get_resource_usage():
+#     ram_usage = psutil.virtual_memory().percent
+#     gpu_memory_allocated = torch.cuda.memory_allocated() / (1024 ** 3)  # Convert to GB
+#     gpu_memory_reserved = torch.cuda.memory_reserved() / (1024 ** 3)  # Convert to GB
+#     return f"RAM Usage: {ram_usage:.2f}%, GPU Memory Allocated: {gpu_memory_allocated:.2f}GB, GPU Memory Reserved: {gpu_memory_reserved:.2f}GB"
+
+# @spaces.GPU
+# def llmguide_generate_response(prompt, stream=False):
+#     print(zero.device) # This will print 'cuda:0' inside the @spaces.GPU decorated function
+
+#     messages = [
+#         {"role": "system", "content": "You are a helpful assistant."},
+#         {"role": "user", "content": prompt}
+#     ]
+#     text = llmguide_tokenizer.apply_chat_template(
+#         messages,
+#         tokenize=False,
+#         add_generation_prompt=True
+#     )
+#     model_inputs = llmguide_tokenizer([text], return_tensors="pt").to(zero.device)
+
+#     start_time = time.time()
+#     total_tokens = 0
+
+#     if stream:
+#         streamer = TextIteratorStreamer(llmguide_tokenizer, skip_special_tokens=True)
+#         generation_kwargs = dict(
+#             model_inputs,
+#             streamer=streamer,
+#             max_new_tokens=512,
+#             temperature=0.7,
+#         )
+#         thread = Thread(target=llmguide_model.generate, kwargs=generation_kwargs)
+#         thread.start()
+
+#         generated_text = ""
+#         for new_text in streamer:
+#             generated_text += new_text
+#             total_tokens += 1
+#             current_time = time.time()
+#             tokens_per_second = total_tokens / (current_time - start_time)
+#             yield generated_text, f"{tokens_per_second:.2f}", ""
+
+#         resource_usage = get_resource_usage()
+#         yield generated_text, f"{tokens_per_second:.2f}", resource_usage
+#     else:
+#         generated_ids = llmguide_model.generate(
+#             model_inputs.input_ids,
+#             max_new_tokens=512
+#         )
+#         generated_ids = [
+#             output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
+#         ]
+#         response = llmguide_tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
+#         total_tokens = len(generated_ids[0])
+#         end_time = time.time()
+#         tokens_per_second = total_tokens / (end_time - start_time)
+#         resource_usage = get_resource_usage()
+#         yield response, f"{tokens_per_second:.2f}", resource_usage
+
+# def rag(query, stream=False):
+#     retrieved_docs = retrieve(query)
+#     context = " ".join(retrieved_docs)
+#     prompt = f"Context: {context}\nQuestion: {query}\nAnswer:"
+
+#     generator = llmguide_generate_response(prompt, stream)
+
+#     if stream:
+#         def stream_output():
+#             for generated_text, tokens_per_second, ram_usage in generator:
+#                 yield generated_text, tokens_per_second, ram_usage
+#         return stream_output()
+#     else:
+#         # For non-streaming, we just need to get the final output
+#         for generated_text, tokens_per_second, ram_usage in generator:
+#             pass  # This will iterate to the last yield
+#         return generated_text, tokens_per_second, ram_usage
+

 # Load the embedding model
 embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
@@ -93,31 +204,28 @@ llmguide_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")

 # Sample knowledge base (replace with your own data)
 knowledge_base = [
-    "The capital of France is Paris.",
-    "Python is a popular programming language.",
-    "Machine learning is a subset of artificial intelligence.",
-    "The Earth orbits around the Sun.",
+    {"id": "doc1", "content": "The capital of France is Paris."},
+    {"id": "doc2", "content": "Python is a popular programming language."},
+    {"id": "doc3", "content": "Machine learning is a subset of artificial intelligence."},
+    {"id": "doc4", "content": "The Earth orbits around the Sun."},
+    {"id": "doc5", "content": "orbits is the name of a korean fangroup"},
 ]

 # Create embeddings for the knowledge base
-knowledge_base_embeddings = embedding_model.encode(knowledge_base)
+knowledge_base_embeddings = embedding_model.encode([doc["content"] for doc in knowledge_base])

 def retrieve(query, k=2):
     query_embedding = embedding_model.encode([query])
     similarities = torch.nn.functional.cosine_similarity(torch.tensor(query_embedding), torch.tensor(knowledge_base_embeddings))
     top_k_indices = similarities.argsort(descending=True)[:k]
-    return [knowledge_base[i] for i in top_k_indices]
+    return [(knowledge_base[i]["content"], knowledge_base[i]["id"]) for i in top_k_indices]

-def get_resource_usage():
-    ram_usage = psutil.virtual_memory().percent
-    gpu_memory_allocated = torch.cuda.memory_allocated() / (1024 ** 3)  # Convert to GB
-    gpu_memory_reserved = torch.cuda.memory_reserved() / (1024 ** 3)  # Convert to GB
-    return f"RAM Usage: {ram_usage:.2f}%, GPU Memory Allocated: {gpu_memory_allocated:.2f}GB, GPU Memory Reserved: {gpu_memory_reserved:.2f}GB"
+def get_ram_usage():
+    ram = psutil.virtual_memory()
+    return f"RAM Usage: {ram.percent:.2f}%, Available: {ram.available / (1024 ** 3):.2f}GB, Total: {ram.total / (1024 ** 3):.2f}GB"

 @spaces.GPU
-def llmguide_generate_response(prompt, stream=False):
-    print(zero.device) # This will print 'cuda:0' inside the @spaces.GPU decorated function
-
+def llmguide_generate_response(prompt, doc_ids=None, stream=False):
     messages = [
         {"role": "system", "content": "You are a helpful assistant."},
         {"role": "user", "content": prompt}
@@ -127,7 +235,7 @@ def llmguide_generate_response(prompt, stream=False):
         tokenize=False,
         add_generation_prompt=True
     )
-    model_inputs = llmguide_tokenizer([text], return_tensors="pt").to(zero.device)
+    model_inputs = llmguide_tokenizer([text], return_tensors="pt").to(llmguide_model.device)

     start_time = time.time()
     total_tokens = 0
@@ -149,10 +257,10 @@ def llmguide_generate_response(prompt, stream=False):
             total_tokens += 1
             current_time = time.time()
             tokens_per_second = total_tokens / (current_time - start_time)
-            yield generated_text, f"{tokens_per_second:.2f}", ""
+            yield generated_text, f"{tokens_per_second:.2f}", "", ", ".join(doc_ids) if doc_ids else "N/A"

-        resource_usage = get_resource_usage()
-        yield generated_text, f"{tokens_per_second:.2f}", resource_usage
+        ram_usage = get_ram_usage()
+        yield generated_text, f"{tokens_per_second:.2f}", ram_usage, ", ".join(doc_ids) if doc_ids else "N/A"
     else:
         generated_ids = llmguide_model.generate(
             model_inputs.input_ids,
@@ -165,36 +273,32 @@ def llmguide_generate_response(prompt, stream=False):
         total_tokens = len(generated_ids[0])
         end_time = time.time()
         tokens_per_second = total_tokens / (end_time - start_time)
-        resource_usage = get_resource_usage()
-        yield response, f"{tokens_per_second:.2f}", resource_usage
-
-# Clear CUDA cache
-# torch.cuda.empty_cache()
-# gc.collect()
-
-# def rag(query, stream=False):
-#     retrieved_docs = retrieve(query)
-#     context = " ".join(retrieved_docs)
-#     prompt = f"Context: {context}\nQuestion: {query}\nAnswer:"
-#     return llmguide_generate_response(prompt, stream)
-
-def rag(query, stream=False):
-    retrieved_docs = retrieve(query)
-    context = " ".join(retrieved_docs)
-    prompt = f"Context: {context}\nQuestion: {query}\nAnswer:"
+        ram_usage = get_ram_usage()
+        yield response, f"{tokens_per_second:.2f}", ram_usage, ", ".join(doc_ids) if doc_ids else "N/A"
+
+def process_query(query, use_rag, stream=False):
+    if use_rag:
+        retrieved_docs = retrieve(query)
+        context = " ".join([doc for doc, _ in retrieved_docs])
+        doc_ids = [doc_id for _, doc_id in retrieved_docs]
+        prompt = f"Context: {context}\nQuestion: {query}\nAnswer:"
+    else:
+        prompt = query
+        doc_ids = None

-    generator = llmguide_generate_response(prompt, stream)
+    generator = llmguide_generate_response(prompt, doc_ids, stream)

     if stream:
         def stream_output():
-            for generated_text, tokens_per_second, ram_usage in generator:
-                yield generated_text, tokens_per_second, ram_usage
+            for generated_text, tokens_per_second, ram_usage, doc_references in generator:
+                yield generated_text, tokens_per_second, ram_usage, doc_references
         return stream_output()
     else:
         # For non-streaming, we just need to get the final output
-        for generated_text, tokens_per_second, ram_usage in generator:
+        for generated_text, tokens_per_second, ram_usage, doc_references in generator:
            pass  # This will iterate to the last yield
-        return generated_text, tokens_per_second, ram_usage
+        return generated_text, tokens_per_second, ram_usage, doc_references
+

 #--------------------------------------------------------------------------------------------------------------------------------

@@ -838,24 +942,43 @@ with gr.Blocks() as demo:
         <div style="width: 20%; text-align: center">HF + Gradio allows for api use so this my prototype tool for tool use test</div>
         </div>""")
     with gr.Accordion("Qwen 0.5B as Space Guide Tests", open=False):
-        gr.HTML("Placeholder for FAQ type - front end as prompt engineering for the first message to force direction of conversion")
         gr.HTML("Placeholder for weak RAG Type - Explanations through an opensource embeddings engine")
+        # gr.Interface(
+        #     fn=rag,
+        #     inputs=[
+        #         gr.Textbox(lines=2, placeholder="Enter your question here..."),
+        #         gr.Checkbox(label="Stream output")
+        #     ],
+        #     outputs=[
+        #         gr.Textbox(label="Generated Response"),
+        #         gr.Textbox(label="Tokens per second"),
+        #         gr.Textbox(label="Resource Usage")
+        #     ],
+        #     title="RAG Q&A System with GPU Acceleration and Resource Monitoring",
+        #     description="Ask a question and get an answer based on the retrieved context. The response is generated using a GPU-accelerated model. Resource usage is logged at the end of generation."
+        # )
+
         gr.Interface(
-            fn=rag,
+            fn=process_query,
             inputs=[
                 gr.Textbox(lines=2, placeholder="Enter your question here..."),
+                gr.Checkbox(label="Use RAG"),
                 gr.Checkbox(label="Stream output")
             ],
             outputs=[
                 gr.Textbox(label="Generated Response"),
                 gr.Textbox(label="Tokens per second"),
-                gr.Textbox(label="Resource Usage")
+                gr.Textbox(label="RAM Usage"),
+                gr.Textbox(label="Referenced Documents")
             ],
-            title="RAG Q&A System with GPU Acceleration and Resource Monitoring",
-            description="Ask a question and get an answer based on the retrieved context. The response is generated using a GPU-accelerated model. Resource usage is logged at the end of generation."
+            title="RAG/Non-RAG Q&A System",
+            description="Ask a question with or without using RAG. The response is generated using a GPU-accelerated model. RAM usage and referenced document IDs (for RAG) are logged."
         )
-        ("Placeholder for https://huggingface.co/h2oai/h2o-danube3-500m-chat-GGUF as alternative")
-        ("Placeholder for qwen 2 72b as alternative use checkbox and gradio client api call")
+        gr.HTML("Placeholder for FAQ type (merge as buttons on the above interface) - front end as prompt engineering for the first message to force direction of conversion")
+
+
+        gr.HTML("Placeholder for https://huggingface.co/h2oai/h2o-danube3-500m-chat-GGUF as alternative")
+        gr.HTML("Placeholder for qwen 2 72b as alternative use checkbox and gradio client api call")
         gr.Markdown("# Qwen-0.5B-Instruct Language Model")
         gr.Markdown("This demo uses the Qwen-0.5B-Instruct model to generate responses based on your input.")
         gr.HTML("Example prompts: <br>I am writing a story about a chef. please write dishes to appear on the menu. <br>What are the most common decisions that a chef story would include? <br>What are the kinds problems that a chef story would include? <br>What are the kinds of out of reach goals that a chef story would include? <br>Continue this config - Paste any complete block of the config")