Spaces:
Paused
Paused
Added models in Voice chat and Improved UI
Browse files
app.py
CHANGED
|
@@ -54,9 +54,20 @@ def videochat(image3, prompt3):
|
|
| 54 |
decoded_text = decoded_text[:-10]
|
| 55 |
yield decoded_text
|
| 56 |
|
| 57 |
-
theme = gr.themes.
|
| 58 |
-
|
| 59 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
|
| 61 |
MODEL_NAME = "openai/whisper-medium"
|
| 62 |
BATCH_SIZE = 10
|
|
@@ -78,18 +89,39 @@ def transcribe(inputs):
|
|
| 78 |
text = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": "transcribe"})["text"]
|
| 79 |
return text
|
| 80 |
|
| 81 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 82 |
|
| 83 |
system_instructions1 = "[SYSTEM] Answer as Real OpenGPT 4o, Made by 'KingNish', Keep conversation very short, clear, friendly and concise. The text provided is a request for a specific type of response from you, the virtual assistant. You will avoid introductions and start answering the query directly, elaborating on all aspects of the request. As an AI-powered assistant, your task is to generate responses that appear as if they are created by the user and not an AI-powered assistant. [USER]"
|
| 84 |
|
| 85 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 86 |
generate_kwargs = dict(
|
| 87 |
temperature=0.7,
|
| 88 |
max_new_tokens=512,
|
| 89 |
top_p=0.95,
|
| 90 |
repetition_penalty=1,
|
| 91 |
do_sample=True,
|
| 92 |
-
seed=
|
| 93 |
)
|
| 94 |
|
| 95 |
formatted_prompt = system_instructions1 + text + "[OpenGPT 4o]"
|
|
@@ -115,7 +147,7 @@ DEVICE = torch.device("cuda")
|
|
| 115 |
MODELS = {
|
| 116 |
"idefics2-8b-chatty": Idefics2ForConditionalGeneration.from_pretrained(
|
| 117 |
"HuggingFaceM4/idefics2-8b-chatty",
|
| 118 |
-
torch_dtype=torch.
|
| 119 |
_attn_implementation="flash_attention_2",
|
| 120 |
).to(DEVICE),
|
| 121 |
}
|
|
@@ -521,16 +553,12 @@ with gr.Blocks() as voice:
|
|
| 521 |
autoplay=True,
|
| 522 |
elem_classes="audio")
|
| 523 |
gr.Interface(
|
| 524 |
-
batch=True,
|
| 525 |
-
max_batch_size=10,
|
| 526 |
fn=respond,
|
| 527 |
inputs=[input],
|
| 528 |
-
outputs=[output], live=True)
|
| 529 |
|
| 530 |
with gr.Blocks() as livechat:
|
| 531 |
gr.Interface(
|
| 532 |
-
batch=True,
|
| 533 |
-
max_batch_size=10,
|
| 534 |
fn=videochat,
|
| 535 |
inputs=[gr.Image(type="pil",sources="webcam", label="Upload Image"), gr.Textbox(label="Prompt", value="what he is doing")],
|
| 536 |
outputs=gr.Textbox(label="Answer")
|
|
|
|
| 54 |
decoded_text = decoded_text[:-10]
|
| 55 |
yield decoded_text
|
| 56 |
|
| 57 |
+
# Gradio UI theme: Soft base with blue/orange accents, then a custom dark
# palette layered on top (near-black backgrounds, muted purple-grey
# inputs/buttons, transparent soft accent).
_base_theme = gr.themes.Soft(
    primary_hue="blue",
    secondary_hue="orange",
    neutral_hue="gray",
    font=[
        gr.themes.GoogleFont('Libre Franklin'),
        gr.themes.GoogleFont('Public Sans'),
        'system-ui',
        'sans-serif',
    ],
)
theme = _base_theme.set(
    body_background_fill_dark="#111111",
    block_background_fill_dark="#111111",
    block_border_width="1px",
    block_title_background_fill_dark="#1e1c26",
    input_background_fill_dark="#292733",
    button_secondary_background_fill_dark="#24212b",
    border_color_primary_dark="#343140",
    background_fill_secondary_dark="#111111",
    color_accent_soft_dark="transparent",
)
|
| 71 |
|
| 72 |
MODEL_NAME = "openai/whisper-medium"
|
| 73 |
BATCH_SIZE = 10
|
|
|
|
| 89 |
text = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": "transcribe"})["text"]
|
| 90 |
return text
|
| 91 |
|
| 92 |
+
# Optional Hugging Face API token read from the environment; None when unset
# (os.environ.get already defaults to None, so no explicit default needed).
HF_TOKEN = os.environ.get("HF_TOKEN")
|
| 93 |
+
|
| 94 |
+
def client_fn(model):
    """Return an InferenceClient for the hosted repo matching *model*.

    The *model* string is matched by substring against known model-family
    names, in the same order as before; anything unrecognized falls back
    to Phi-3-mini (identical to the explicit "Phi" branch).
    """
    # Family substring -> hosted repo id, checked in order.
    repo_by_family = (
        ("Mixtral", "mistralai/Mixtral-8x7B-Instruct-v0.1"),
        ("Llama", "meta-llama/Meta-Llama-3-8B-Instruct"),
        ("Mistral", "mistralai/Mistral-7B-Instruct-v0.3"),
        ("Phi", "microsoft/Phi-3-mini-4k-instruct"),
    )
    for family, repo_id in repo_by_family:
        if family in model:
            return InferenceClient(repo_id)
    # Fallback: same client as the "Phi" entry.
    return InferenceClient("microsoft/Phi-3-mini-4k-instruct")
|
| 105 |
+
|
| 106 |
+
def randomize_seed_fn(seed: int) -> int:
    """Return a freshly drawn random seed in [0, 999999].

    NOTE(review): the incoming *seed* argument is ignored — every call
    produces a new random value; the parameter exists only so callers can
    pass their current seed through. Confirm this is intentional.
    """
    return random.randint(0, 999999)
|
| 109 |
|
| 110 |
system_instructions1 = "[SYSTEM] Answer as Real OpenGPT 4o, Made by 'KingNish', Keep conversation very short, clear, friendly and concise. The text provided is a request for a specific type of response from you, the virtual assistant. You will avoid introductions and start answering the query directly, elaborating on all aspects of the request. As an AI-powered assistant, your task is to generate responses that appear as if they are created by the user and not an AI-powered assistant. [USER]"
|
| 111 |
|
| 112 |
+
def models(text, model="Mixtral 8x7B", seed=42):
|
| 113 |
+
|
| 114 |
+
seed = int(randomize_seed_fn(seed))
|
| 115 |
+
generator = torch.Generator().manual_seed(seed)
|
| 116 |
+
|
| 117 |
+
client = client_fn(model)
|
| 118 |
generate_kwargs = dict(
|
| 119 |
temperature=0.7,
|
| 120 |
max_new_tokens=512,
|
| 121 |
top_p=0.95,
|
| 122 |
repetition_penalty=1,
|
| 123 |
do_sample=True,
|
| 124 |
+
seed=seed,
|
| 125 |
)
|
| 126 |
|
| 127 |
formatted_prompt = system_instructions1 + text + "[OpenGPT 4o]"
|
|
|
|
| 147 |
# Registry of vision-language chat models. Currently a single entry:
# idefics2-8b-chatty loaded in fp16 with the FlashAttention-2 kernel
# implementation, then moved onto DEVICE (CUDA per the surrounding file).
_IDEFICS2_REPO = "HuggingFaceM4/idefics2-8b-chatty"
_idefics2 = Idefics2ForConditionalGeneration.from_pretrained(
    _IDEFICS2_REPO,
    torch_dtype=torch.float16,
    _attn_implementation="flash_attention_2",
)
MODELS = {"idefics2-8b-chatty": _idefics2.to(DEVICE)}
|
|
|
|
| 553 |
autoplay=True,
|
| 554 |
elem_classes="audio")
|
| 555 |
gr.Interface(
|
|
|
|
|
|
|
| 556 |
fn=respond,
|
| 557 |
inputs=[input],
|
| 558 |
+
outputs=[output], api_name="translate", live=True)
|
| 559 |
|
| 560 |
with gr.Blocks() as livechat:
|
| 561 |
gr.Interface(
|
|
|
|
|
|
|
| 562 |
fn=videochat,
|
| 563 |
inputs=[gr.Image(type="pil",sources="webcam", label="Upload Image"), gr.Textbox(label="Prompt", value="what he is doing")],
|
| 564 |
outputs=gr.Textbox(label="Answer")
|