MAsad789565 committed
Commit 87ca261 · verified · 1 Parent(s): b12c822

Update app.py

Files changed (1): app.py (+59 -60)
app.py CHANGED
@@ -1,15 +1,11 @@
 import asyncio
 import gradio as gr
-from huggingface_hub import InferenceClient
-import json
-import os
-import requests
 from huggingface_hub import AsyncInferenceClient
 
 client = AsyncInferenceClient("meta-llama/Llama-2-70b-chat-hf")
 
-
 system_message = "\nYou are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\n\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."
+
 title = "Llama2 70B Chatbot"
 description = """
 This Space demonstrates model [Llama-2-70b-chat-hf](https://huggingface.co/meta-llama/Llama-2-70b-chat-hf) by Meta, a Llama 2 model with 70B parameters fine-tuned for chat instructions. This space is running on Inference Endpoints using text-generation-inference library. If you want to run your own service, you can also [deploy the model on Inference Endpoints](https://ui.endpoints.huggingface.co/).
@@ -21,64 +17,66 @@ Note: As a derivate work of [Llama-2-70b-chat](https://huggingface.co/meta-llama
 this demo is governed by the original [license](https://huggingface.co/spaces/ysharma/Explore_llamav2_with_TGI/blob/main/LICENSE.txt) and [acceptable use policy](https://huggingface.co/spaces/ysharma/Explore_llamav2_with_TGI/blob/main/USE_POLICY.md).
 """
 css = """.toast-wrap { display: none !important } """
-examples=[
+examples = [
     ['Hello there! How are you doing?'],
     ['Can you explain to me briefly what is Python programming language?'],
     ['Explain the plot of Cinderella in a sentence.'],
     ['How many hours does it take a man to eat a Helicopter?'],
     ["Write a 100-word article on 'Benefits of Open-Source in AI research'"],
-    ]
-
+]
 
 # Note: We have removed default system prompt as requested by the paper authors [Dated: 13/Oct/2023]
 # Prompting style for Llama2 without using system prompt
 # <s>[INST] {{ user_msg_1 }} [/INST] {{ model_answer_1 }} </s><s>[INST] {{ user_msg_2 }} [/INST]
 
-
 # Stream text - stream tokens with InferenceClient from TGI
 async def predict(message, chatbot, system_prompt="", temperature=0.9, max_new_tokens=256, top_p=0.6, repetition_penalty=1.0,):
-
+
     if system_prompt != "":
         input_prompt = f"<s>[INST] <<SYS>>\n{system_prompt}\n<</SYS>>\n\n "
     else:
         input_prompt = f"<s>[INST] "
-
+
     temperature = float(temperature)
     if temperature < 1e-2:
         temperature = 1e-2
     top_p = float(top_p)
-
+
    for interaction in chatbot:
        input_prompt = input_prompt + str(interaction[0]) + " [/INST] " + str(interaction[1]) + " </s><s>[INST] "
 
    input_prompt = input_prompt + str(message) + " [/INST] "
 
    partial_message = ""
-    async for token in await client.text_generation(prompt=input_prompt,
-                                                    max_new_tokens=max_new_tokens,
-                                                    stream=True,
-                                                    best_of=1,
-                                                    temperature=temperature,
-                                                    top_p=top_p,
-                                                    do_sample=True,
-                                                    repetition_penalty=repetition_penalty):
-        partial_message = partial_message + token
+    async for token in await client.text_generation(
+        prompt=input_prompt,
+        max_new_tokens=max_new_tokens,
+        stream=True,
+        best_of=1,
+        temperature=temperature,
+        top_p=top_p,
+        do_sample=True,
+        repetition_penalty=repetition_penalty,
+    ):
+        partial_message = partial_message + token
         yield partial_message
-
+
 
 # No Stream - batch produce tokens using TGI inference endpoint
-def predict_batch(message, chatbot, system_prompt="", temperature=0.9, max_new_tokens=256, top_p=0.6, repetition_penalty=1.0,):
-
+def predict_batch(
+    message, chatbot, system_prompt="", temperature=0.9, max_new_tokens=256, top_p=0.6, repetition_penalty=1.0,
+):
+
     if system_prompt != "":
         input_prompt = f"<s>[INST] <<SYS>>\n{system_prompt}\n<</SYS>>\n\n "
     else:
         input_prompt = f"<s>[INST] "
-
+
     temperature = float(temperature)
     if temperature < 1e-2:
         temperature = 1e-2
     top_p = float(top_p)
-
+
     for interaction in chatbot:
         input_prompt = input_prompt + str(interaction[0]) + " [/INST] " + str(interaction[1]) + " </s><s>[INST] "
 
@@ -88,16 +86,16 @@ def predict_batch(message, chatbot, system_prompt="", temperature=0.9, max_new_t
     data = {
         "inputs": input_prompt,
         "parameters": {
-            "max_new_tokens":max_new_tokens,
-            "temperature":temperature,
-            "top_p":top_p,
-            "repetition_penalty":repetition_penalty,
-            "do_sample":True,
+            "max_new_tokens": max_new_tokens,
+            "temperature": temperature,
+            "top_p": top_p,
+            "repetition_penalty": repetition_penalty,
+            "do_sample": True,
         },
     }
 
-    response = requests.post(api_url, headers=headers, json=data ) #auth=('hf', hf_token)) data=json.dumps(data),
-
+    response = requests.post(api_url, headers=headers, json=data) # auth=('hf', hf_token)) data=json.dumps(data),
+
     if response.status_code == 200: # check if the request was successful
         try:
             json_obj = response.json()
@@ -113,15 +111,14 @@ def predict_batch(message, chatbot, system_prompt="", temperature=0.9, max_new_t
         print(f"Request failed with status code {response.status_code}")
 
 
-
 def vote(data: gr.LikeData):
     if data.liked:
         print("You upvoted this response: " + data.value)
     else:
         print("You downvoted this response: " + data.value)
-
 
-additional_inputs=[
+
+additional_inputs = [
     gr.Textbox("", label="Optional system prompt"),
     gr.Slider(
         label="Temperature",
@@ -161,28 +158,30 @@ additional_inputs=[
     )
 ]
 
-chatbot_stream = gr.Chatbot(avatar_images=('user.png', 'bot2.png'),bubble_full_width = False)
-chatbot_batch = gr.Chatbot(avatar_images=('user1.png', 'bot1.png'),bubble_full_width = False)
-chat_interface_stream = gr.ChatInterface(predict,
-                                         title=title,
-                                         description=description,
-                                         textbox=gr.Textbox(),
-                                         chatbot=chatbot_stream,
-                                         css=css,
-                                         examples=examples,
-                                         #cache_examples=True,
-                                         additional_inputs=additional_inputs,)
-chat_interface_batch=gr.ChatInterface(predict_batch,
-                                      title=title,
-                                      description=description,
-                                      textbox=gr.Textbox(),
-                                      chatbot=chatbot_batch,
-                                      css=css,
-                                      examples=examples,
-                                      #cache_examples=True,
-                                      additional_inputs=additional_inputs,)
-
-# Gradio Demo
+chatbot_stream = gr.Chatbot(bubble_full_width=False)
+chatbot_batch = gr.Chatbot(bubble_full_width=False)
+chat_interface_stream = gr.ChatInterface(
+    predict,
+    title=title,
+    description=description,
+    textbox=gr.Textbox(),
+    chatbot=chatbot_stream,
+    css=css,
+    examples=examples,
+    additional_inputs=additional_inputs,
+)
+chat_interface_batch = gr.ChatInterface(
+    predict_batch,
+    title=title,
+    description=description,
+    textbox=gr.Textbox(),
+    chatbot=chatbot_batch,
+    css=css,
+    examples=examples,
+    additional_inputs=additional_inputs,
+)
+
+# Gradio Demo
 with gr.Blocks() as demo:
 
     with gr.Tab("Streaming"):
@@ -194,5 +193,5 @@ with gr.Blocks() as demo:
         # non-streaming chatbot
        chatbot_batch.like(vote, None, None)
        chat_interface_batch.render()
-
-    demo.queue(max_size=100).launch()
+
+demo.queue(max_size=100).launch()
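
The commented template near the top of the file is the full Llama-2 chat format. Below is a minimal sketch, not part of the commit, of how predict() and predict_batch() assemble that prompt from Gradio's (user, bot) history pairs; build_llama2_prompt is a hypothetical helper name introduced here for illustration.

# Hypothetical helper mirroring the prompt-building loop in predict()/predict_batch().
def build_llama2_prompt(message, history, system_prompt=""):
    if system_prompt != "":
        prompt = f"<s>[INST] <<SYS>>\n{system_prompt}\n<</SYS>>\n\n "
    else:
        prompt = "<s>[INST] "
    for user_msg, bot_msg in history:
        # Each past turn: user text, [/INST], model reply, then </s><s>[INST] opens the next turn.
        prompt += str(user_msg) + " [/INST] " + str(bot_msg) + " </s><s>[INST] "
    # Current user message, left open for the model to complete.
    return prompt + str(message) + " [/INST] "

print(build_llama2_prompt("How are you?", [("Hi!", "Hello! How can I help?")]))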
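The streaming path calls AsyncInferenceClient.text_generation with stream=True and yields the growing reply to Gradio. A standalone sketch of that call, assuming the model is served through text-generation-inference and a valid Hugging Face token is available in the environment:

import asyncio
from huggingface_hub import AsyncInferenceClient

async def main():
    client = AsyncInferenceClient("meta-llama/Llama-2-70b-chat-hf")
    # With stream=True, text_generation returns an async iterator of tokens.
    stream = await client.text_generation(
        prompt="<s>[INST] Hello there! [/INST] ",
        max_new_tokens=64,
        stream=True,
    )
    async for token in stream:
        print(token, end="", flush=True)

asyncio.run(main())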
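The non-streaming path POSTs a TGI-style payload to an inference endpoint. Note that this commit drops "import requests" even though predict_batch still calls requests.post, and that api_url and headers are defined outside the shown hunks; the sketch below re-adds the import and uses hypothetical values for both:

import requests

# Hypothetical endpoint and token; the real api_url/headers live outside the diff hunks.
api_url = "https://api-inference.huggingface.co/models/meta-llama/Llama-2-70b-chat-hf"
headers = {"Authorization": "Bearer hf_..."}  # placeholder token

payload = {
    "inputs": "<s>[INST] Hello there! [/INST] ",
    "parameters": {
        "max_new_tokens": 64,
        "temperature": 0.9,
        "top_p": 0.6,
        "repetition_penalty": 1.0,
        "do_sample": True,
    },
}
response = requests.post(api_url, headers=headers, json=payload)
if response.status_code == 200:
    print(response.json())
else:
    print(f"Request failed with status code {response.status_code}")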