Spaces:
Paused
Paused
Update app.py
Browse files
app.py
CHANGED
|
@@ -21,7 +21,7 @@ import io
|
|
| 21 |
import datasets
|
| 22 |
|
| 23 |
import gradio as gr
|
| 24 |
-
from transformers import
|
| 25 |
from transformers import Idefics2ForConditionalGeneration
|
| 26 |
import tempfile
|
| 27 |
from streaming_stt_nemo import Model
|
|
@@ -29,15 +29,18 @@ from huggingface_hub import InferenceClient
|
|
| 29 |
import edge_tts
|
| 30 |
import asyncio
|
| 31 |
from transformers import pipeline
|
|
|
|
|
|
|
|
|
|
| 32 |
|
| 33 |
-
|
| 34 |
processor = AutoProcessor.from_pretrained("unum-cloud/uform-gen2-dpo", trust_remote_code=True)
|
| 35 |
|
| 36 |
-
@spaces.GPU(
|
| 37 |
-
def
|
| 38 |
-
inputs = processor(text=[
|
| 39 |
with torch.inference_mode():
|
| 40 |
-
output =
|
| 41 |
**inputs,
|
| 42 |
do_sample=False,
|
| 43 |
use_cache=True,
|
|
@@ -48,9 +51,9 @@ def generate_caption(image, prompt):
|
|
| 48 |
|
| 49 |
prompt_len = inputs["input_ids"].shape[1]
|
| 50 |
decoded_text = processor.batch_decode(output[:, prompt_len:])[0]
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
|
| 55 |
theme = gr.themes.Base(
|
| 56 |
font=[gr.themes.GoogleFont('Libre Franklin'), gr.themes.GoogleFont('Public Sans'), 'system-ui', 'sans-serif'],
|
|
@@ -118,24 +121,16 @@ SYSTEM_PROMPT = [
|
|
| 118 |
{
|
| 119 |
"type": "text",
|
| 120 |
"text": """I am OpenGPT 4o, an exceptionally capable and versatile AI assistant meticulously crafted by KingNish. Designed to assist human users through insightful conversations, I aim to provide an unparalleled experience. My key attributes include:
|
| 121 |
-
|
| 122 |
- **Intelligence and Knowledge:** I possess an extensive knowledge base, enabling me to offer insightful answers and intelligent responses to User queries. My understanding of complex concepts is exceptional, ensuring accurate and reliable information.
|
| 123 |
-
|
| 124 |
- **Image Generation and Perception:** One of my standout features is the ability to generate and perceive images. Utilizing the following link structure, I create unique and contextually rich visuals:
|
| 125 |
-
|
| 126 |
> 
|
| 127 |
-
|
| 128 |
For image generation, I replace {info inside curly braces} with specific details according to their requirements to create relevant visuals. The width and height parameters are adjusted as needed, often favoring HD dimensions for a superior viewing experience.
|
| 129 |
-
|
| 130 |
For instance, if the User requests:
|
| 131 |
-
|
| 132 |
[USER] Show me an image of A futuristic cityscape with towering skyscrapers and flying cars.
|
| 133 |
[OpenGPT 4o] Generating Image you requested:
|
| 134 |

|
| 135 |
-
|
| 136 |
**Bulk Image Generation with Links:** I excel at generating multiple images link simultaneously, always providing unique links and visuals. I ensure that each image is distinct and captivates the User.
|
| 137 |
Note: Make sure to always provide image links starting with ! .As given in examples.
|
| 138 |
-
|
| 139 |
**Engaging Conversations:** While my image generation skills are impressive, I also excel at natural language processing. I can engage in captivating conversations, offering informative and entertaining responses to the User.
|
| 140 |
**Reasoning, Memory, and Identification:** My reasoning skills are exceptional, allowing me to make logical connections. My memory capabilities are vast, enabling me to retain context and provide consistent responses. I can identify people and objects within images or text, providing relevant insights and details.
|
| 141 |
**Attention to Detail:** I am attentive to the smallest details, ensuring that my responses and generated content are of the highest quality. I strive to provide a refined and polished experience.
|
|
@@ -385,8 +380,6 @@ def model_inference(
|
|
| 385 |
if acc_text.endswith("<end_of_utterance>"):
|
| 386 |
acc_text = acc_text[:-18]
|
| 387 |
yield acc_text
|
| 388 |
-
print("Success - generated the following text:", acc_text)
|
| 389 |
-
print("-----")
|
| 390 |
|
| 391 |
|
| 392 |
FEATURES = datasets.Features(
|
|
@@ -542,15 +535,13 @@ with gr.Blocks() as voice2:
|
|
| 542 |
outputs=[output], live=True)
|
| 543 |
|
| 544 |
with gr.Blocks() as video:
|
| 545 |
-
gr.Markdown(" ## Live Chat")
|
| 546 |
-
gr.Markdown("### Click camera option to update image")
|
| 547 |
gr.Interface(
|
| 548 |
-
fn=
|
| 549 |
inputs=[gr.Image(type="pil", label="Upload Image"), gr.Textbox(label="Prompt", value="what he is doing")],
|
| 550 |
-
outputs=gr.Textbox(label="Answer")
|
| 551 |
)
|
| 552 |
|
| 553 |
-
with gr.Blocks(theme=theme,
|
| 554 |
gr.Markdown("# OpenGPT 4o")
|
| 555 |
gr.TabbedInterface([img, voice, video, voice2], ['💬 SuperChat','🗣️ Voice Chat','📸 Live Chat', '🗣️ Voice Chat 2'])
|
| 556 |
|
|
|
|
| 21 |
import datasets
|
| 22 |
|
| 23 |
import gradio as gr
|
| 24 |
+
from transformers import TextIteratorStreamer
|
| 25 |
from transformers import Idefics2ForConditionalGeneration
|
| 26 |
import tempfile
|
| 27 |
from streaming_stt_nemo import Model
|
|
|
|
| 29 |
import edge_tts
|
| 30 |
import asyncio
|
| 31 |
from transformers import pipeline
|
| 32 |
+
from transformers import AutoTokenizer, AutoModelForCausalLM
|
| 33 |
+
from transformers import AutoModel
|
| 34 |
+
from transformers import AutoProcessor
|
| 35 |
|
| 36 |
+
model3 = AutoModel.from_pretrained("unum-cloud/uform-gen2-dpo", trust_remote_code=True)
|
| 37 |
processor = AutoProcessor.from_pretrained("unum-cloud/uform-gen2-dpo", trust_remote_code=True)
|
| 38 |
|
| 39 |
+
@spaces.GPU(queue=False)
|
| 40 |
+
def videochat(image3, prompt3):
|
| 41 |
+
inputs = processor(text=[prompt3], images=[image3], return_tensors="pt")
|
| 42 |
with torch.inference_mode():
|
| 43 |
+
output = model3.generate(
|
| 44 |
**inputs,
|
| 45 |
do_sample=False,
|
| 46 |
use_cache=True,
|
|
|
|
| 51 |
|
| 52 |
prompt_len = inputs["input_ids"].shape[1]
|
| 53 |
decoded_text = processor.batch_decode(output[:, prompt_len:])[0]
|
| 54 |
+
if decoded_text.endswith("<|im_end|>"):
|
| 55 |
+
decoded_text = decoded_text[:-10]
|
| 56 |
+
yield decoded_text
|
| 57 |
|
| 58 |
theme = gr.themes.Base(
|
| 59 |
font=[gr.themes.GoogleFont('Libre Franklin'), gr.themes.GoogleFont('Public Sans'), 'system-ui', 'sans-serif'],
|
|
|
|
| 121 |
{
|
| 122 |
"type": "text",
|
| 123 |
"text": """I am OpenGPT 4o, an exceptionally capable and versatile AI assistant meticulously crafted by KingNish. Designed to assist human users through insightful conversations, I aim to provide an unparalleled experience. My key attributes include:
|
|
|
|
| 124 |
- **Intelligence and Knowledge:** I possess an extensive knowledge base, enabling me to offer insightful answers and intelligent responses to User queries. My understanding of complex concepts is exceptional, ensuring accurate and reliable information.
|
|
|
|
| 125 |
- **Image Generation and Perception:** One of my standout features is the ability to generate and perceive images. Utilizing the following link structure, I create unique and contextually rich visuals:
|
|
|
|
| 126 |
> 
|
|
|
|
| 127 |
For image generation, I replace {info inside curly braces} with specific details according to their requirements to create relevant visuals. The width and height parameters are adjusted as needed, often favoring HD dimensions for a superior viewing experience.
|
|
|
|
| 128 |
For instance, if the User requests:
|
|
|
|
| 129 |
[USER] Show me an image of A futuristic cityscape with towering skyscrapers and flying cars.
|
| 130 |
[OpenGPT 4o] Generating Image you requested:
|
| 131 |

|
|
|
|
| 132 |
**Bulk Image Generation with Links:** I excel at generating multiple images link simultaneously, always providing unique links and visuals. I ensure that each image is distinct and captivates the User.
|
| 133 |
Note: Make sure to always provide image links starting with ! .As given in examples.
|
|
|
|
| 134 |
**Engaging Conversations:** While my image generation skills are impressive, I also excel at natural language processing. I can engage in captivating conversations, offering informative and entertaining responses to the User.
|
| 135 |
**Reasoning, Memory, and Identification:** My reasoning skills are exceptional, allowing me to make logical connections. My memory capabilities are vast, enabling me to retain context and provide consistent responses. I can identify people and objects within images or text, providing relevant insights and details.
|
| 136 |
**Attention to Detail:** I am attentive to the smallest details, ensuring that my responses and generated content are of the highest quality. I strive to provide a refined and polished experience.
|
|
|
|
| 380 |
if acc_text.endswith("<end_of_utterance>"):
|
| 381 |
acc_text = acc_text[:-18]
|
| 382 |
yield acc_text
|
|
|
|
|
|
|
| 383 |
|
| 384 |
|
| 385 |
FEATURES = datasets.Features(
|
|
|
|
| 535 |
outputs=[output], live=True)
|
| 536 |
|
| 537 |
with gr.Blocks() as video:
|
|
|
|
|
|
|
| 538 |
gr.Interface(
|
| 539 |
+
fn=videochat,
|
| 540 |
inputs=[gr.Image(type="pil", label="Upload Image"), gr.Textbox(label="Prompt", value="what he is doing")],
|
| 541 |
+
outputs=gr.Textbox(label="Answer")
|
| 542 |
)
|
| 543 |
|
| 544 |
+
with gr.Blocks(theme=theme, title="OpenGPT 4o DEMO") as demo:
|
| 545 |
gr.Markdown("# OpenGPT 4o")
|
| 546 |
gr.TabbedInterface([img, voice, video, voice2], ['💬 SuperChat','🗣️ Voice Chat','📸 Live Chat', '🗣️ Voice Chat 2'])
|
| 547 |
|