teragron committed on
Commit
0a424dd
Β·
verified Β·
1 Parent(s): 7aff440

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +143 -0
app.py ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from transformers import AutoProcessor, AutoModelForVision2Seq, TextIteratorStreamer
3
+ from transformers.image_utils import load_image
4
+ from threading import Thread
5
+ import time
6
+ import torch
7
+
8
# Load the SmolVLM model and processor once at import time so the Gradio
# handler below can reuse them across requests.
print("πŸ”§ Loading SmolVLM model...")
processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM-Instruct-250M")
model = AutoModelForVision2Seq.from_pretrained(
    "HuggingFaceTB/SmolVLM-Instruct-250M",
    torch_dtype=torch.bfloat16,  # bf16 halves memory vs fp32; NOTE(review): assumes bf16-capable hardware — confirm on CPU-only hosts
    device_map="auto"  # Automatically handles CPU/GPU placement
)
print("βœ… Model loaded successfully!")
17
+
18
def model_inference(input_dict, history):
    """Gradio ChatInterface handler: stream a SmolVLM reply for one turn.

    Args:
        input_dict: MultimodalTextbox payload with keys ``"text"`` (str) and
            ``"files"`` (list of image file paths/URLs accepted by
            ``load_image``).
        history: Prior chat turns (unused — the model only sees this turn).

    Yields:
        str: Progressively longer partial responses for streaming display,
        starting with a "Thinking..." placeholder.

    Raises:
        gr.Error: If the text query is empty.
    """
    text = input_dict["text"]

    # One comprehension covers the zero / one / many file cases uniformly
    # (the original branched on len(files) for no behavioral difference).
    images = [load_image(path) for path in input_dict["files"]]

    # A text query is always required; pick the error message matching
    # whether the user attached images.
    if text == "":
        if images:
            raise gr.Error("Please input a text query along with the image(s).")
        raise gr.Error("Please input a query and optionally image(s).")

    # Chat-template format: one image placeholder per attachment, then text.
    resulting_messages = [
        {
            "role": "user",
            "content": [{"type": "image"} for _ in images] + [
                {"type": "text", "text": text}
            ],
        }
    ]

    thread = None
    try:
        # Apply chat template and tokenize text + images together.
        prompt = processor.apply_chat_template(resulting_messages, add_generation_prompt=True)
        inputs = processor(text=prompt, images=images if images else None, return_tensors="pt")

        # Move tensors to wherever device_map placed the model parameters.
        device = next(model.parameters()).device
        inputs = {k: v.to(device) if v is not None else v for k, v in inputs.items()}

        # Generate in a background thread so tokens can be streamed as they
        # arrive via the streamer iterator.
        streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
        generation_kwargs = dict(
            inputs,
            streamer=streamer,
            max_new_tokens=500,
            min_new_tokens=10,
            no_repeat_ngram_size=2,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
        )
        thread = Thread(target=model.generate, kwargs=generation_kwargs)
        thread.start()

        # Stream the response, accumulating tokens into a growing buffer.
        yield "Thinking..."
        buffer = ""
        for new_text in streamer:
            buffer += new_text
            time.sleep(0.02)  # small delay for smooth streaming in the UI
            yield buffer

    except Exception as e:
        yield f"❌ Error generating response: {str(e)}"
    finally:
        # Don't leak the generation thread when streaming finishes or fails;
        # the original never joined it.
        if thread is not None:
            thread.join()
84
+
85
# Example prompts for demonstration. "files" is left empty in each example —
# users attach their own images via the upload button before asking.
examples = [
    [{"text": "What do you see in this image?", "files": []}],
    [{"text": "Describe the colors and objects in this image in detail.", "files": []}],
    [{"text": "What is the mood or atmosphere of this image?", "files": []}],
    [{"text": "Are there any people in this image? What are they doing?", "files": []}],
    [{"text": "What text can you read in this image?", "files": []}],
    [{"text": "Count the number of objects you can see.", "files": []}],
]
94
+
95
# Create the Gradio interface using ChatInterface. The handler streams
# partial responses, so ChatInterface renders them incrementally.
demo = gr.ChatInterface(
    fn=model_inference,
    title="πŸ” SmolVLM Vision Chat",
    # NOTE: description names the same checkpoint loaded above
    # (SmolVLM-Instruct-250M); the original said "256M", which did not
    # match the model actually served.
    description="""
    Chat with **SmolVLM-Instruct-250M**, a compact but powerful vision-language model!

    **How to use:**
    1. Upload one or more images using the πŸ“Ž button
    2. Ask questions about the images
    3. Get detailed AI-generated descriptions and answers

    **Example questions:**
    - "What do you see in this image?"
    - "Describe the colors and composition"
    - "What text is visible in this image?"
    - "Count the objects in this image"

    This model can analyze photos, diagrams, documents, artwork, and more!
    """,
    examples=examples,
    textbox=gr.MultimodalTextbox(
        label="πŸ’¬ Ask about your images...",
        file_types=["image"],
        file_count="multiple",
        placeholder="Upload images and ask questions about them!"
    ),
    stop_btn="⏹️ Stop Generation",
    multimodal=True,
    cache_examples=False,  # examples have no images attached, so caching adds nothing
    theme=gr.themes.Soft(),
    css="""
    .gradio-container {
        max-width: 1000px !important;
    }
    .chat-message {
        border-radius: 10px !important;
    }
    """
)
135
+
136
if __name__ == "__main__":
    # Entry point: serve the chat UI on all interfaces at the default
    # Gradio port, surfacing errors in the UI and without a public link.
    print("πŸš€ Launching SmolVLM Chat Interface...")
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
        "show_error": True,
    }
    demo.launch(**launch_options)