Spaces:

teragron
/

smoldev2

Sleeping

App Files Files Community

teragron committed on Jun 2

Commit

0a424dd

verified ·

1 Parent(s): 7aff440

Create app.py

Browse files

Files changed (1) hide show

app.py +143 -0

app.py ADDED Viewed

	@@ -0,0 +1,143 @@

+import gradio as gr
+from transformers import AutoProcessor, AutoModelForVision2Seq, TextIteratorStreamer
+from transformers.image_utils import load_image
+from threading import Thread
+import time
+import torch
+# Load the SmolVLM model and processor
+print("🔧 Loading SmolVLM model...")
+processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM-Instruct-250M")
+model = AutoModelForVision2Seq.from_pretrained(
+    "HuggingFaceTB/SmolVLM-Instruct-250M",
+    torch_dtype=torch.bfloat16,
+    device_map="auto"  # Automatically handles CPU/GPU placement
+)
+print("✅ Model loaded successfully!")
+def model_inference(input_dict, history):
+    """Process multimodal input and generate response"""
+    text = input_dict["text"]
+    # Handle image input
+    if len(input_dict["files"]) > 1:
+        images = [load_image(image) for image in input_dict["files"]]
+    elif len(input_dict["files"]) == 1:
+        images = [load_image(input_dict["files"][0])]
+    else:
+        images = []
+    # Validation
+    if text == "" and not images:
+        raise gr.Error("Please input a query and optionally image(s).")
+    if text == "" and images:
+        raise gr.Error("Please input a text query along with the image(s).")
+    # Prepare the conversation format
+    resulting_messages = [
+        {
+            "role": "user",
+            "content": [{"type": "image"} for _ in range(len(images))] + [
+                {"type": "text", "text": text}
+            ]
+        }
+    ]
+    try:
+        # Apply chat template and process inputs
+        prompt = processor.apply_chat_template(resulting_messages, add_generation_prompt=True)
+        inputs = processor(text=prompt, images=images if images else None, return_tensors="pt")
+        # Move to appropriate device
+        device = next(model.parameters()).device
+        inputs = {k: v.to(device) if v is not None else v for k, v in inputs.items()}
+        # Set up streaming generation
+        streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
+        generation_kwargs = dict(
+            inputs,
+            streamer=streamer,
+            max_new_tokens=500,
+            min_new_tokens=10,
+            no_repeat_ngram_size=2,
+            do_sample=True,
+            temperature=0.7,
+            top_p=0.9
+        )
+        # Start generation in separate thread
+        thread = Thread(target=model.generate, kwargs=generation_kwargs)
+        thread.start()
+        # Stream the response
+        yield "Thinking..."
+        buffer = ""
+        for new_text in streamer:
+            buffer += new_text
+            time.sleep(0.02)  # Small delay for smooth streaming
+            yield buffer
+    except Exception as e:
+        yield f"❌ Error generating response: {str(e)}"
+# Example prompts and images for demonstration
+examples = [
+    [{"text": "What do you see in this image?", "files": []}],
+    [{"text": "Describe the colors and objects in this image in detail.", "files": []}],
+    [{"text": "What is the mood or atmosphere of this image?", "files": []}],
+    [{"text": "Are there any people in this image? What are they doing?", "files": []}],
+    [{"text": "What text can you read in this image?", "files": []}],
+    [{"text": "Count the number of objects you can see.", "files": []}],
+]
+# Create the Gradio interface using ChatInterface
+demo = gr.ChatInterface(
+    fn=model_inference,
+    title="🔍 SmolVLM Vision Chat",
+    description="""
+    Chat with **SmolVLM-256M**, a compact but powerful vision-language model!
+    **How to use:**
+    1. Upload one or more images using the 📎 button
+    2. Ask questions about the images
+    3. Get detailed AI-generated descriptions and answers
+    **Example questions:**
+    - "What do you see in this image?"
+    - "Describe the colors and composition"
+    - "What text is visible in this image?"
+    - "Count the objects in this image"
+    This model can analyze photos, diagrams, documents, artwork, and more!
+    """,
+    examples=examples,
+    textbox=gr.MultimodalTextbox(
+        label="💬 Ask about your images...",
+        file_types=["image"],
+        file_count="multiple",
+        placeholder="Upload images and ask questions about them!"
+    ),
+    stop_btn="⏹️ Stop Generation",
+    multimodal=True,
+    cache_examples=False,
+    theme=gr.themes.Soft(),
+    css="""
+    .gradio-container {
+        max-width: 1000px !important;
+    }
+    .chat-message {
+        border-radius: 10px !important;
+    }
+    """
+)
+if __name__ == "__main__":
+    print("🚀 Launching SmolVLM Chat Interface...")
+    demo.launch(
+        server_name="0.0.0.0",
+        server_port=7860,
+        share=False,
+        show_error=True
+    )