NeuralFalcon committed · Commit 4255784 · verified · 1 Parent(s): bcf1741

Create app.py

Files changed (1): app.py (+82, −0)
app.py ADDED
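The file is a self-contained Gradio demo: it loads SmolVLM2-2.2B-Instruct once at import time and wraps a single describe_image function in a web UI. Before the listing, a quick dependency check; this snippet is a sketch added for convenience, not part of the commit, and the note that SmolVLM2 needs a recent transformers release comes from the model card rather than this repo:

# Sketch: confirm the packages app.py imports are installed.
# (torch, transformers, gradio are inferred from the imports below;
# SmolVLM2 support additionally requires a sufficiently recent transformers build.)
import importlib.metadata as md

for pkg in ("torch", "transformers", "gradio"):
    try:
        print(f"{pkg}: {md.version(pkg)}")
    except md.PackageNotFoundError:
        print(f"{pkg}: missing - install with `pip install {pkg}`")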
from transformers import AutoProcessor, AutoModelForImageTextToText
import torch
import os
import gradio as gr

# Model cards:
# https://huggingface.co/HuggingFaceTB/SmolVLM2-2.2B-Instruct
# https://huggingface.co/HuggingFaceTB/SmolVLM2-500M-Video-Instruct
# model_path = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
# model_path = "HuggingFaceTB/SmolVLM2-500M-Video-Instruct"

# Load model & processor
model_name = "SmolVLM2-2.2B-Instruct"
model_path = f"HuggingFaceTB/{model_name}"
processor = AutoProcessor.from_pretrained(model_path)

model = AutoModelForImageTextToText.from_pretrained(
    model_path,
    torch_dtype=torch.float16,  # FP16 for better performance on T4
    device_map="auto",  # let accelerate place the weights (GPU if available, else CPU);
                        # a follow-up .to(device) would conflict with the dispatch hooks
)


def describe_image(image_path, user_prompt="Describe the image in detail.", system_role=""):
    global model, processor
    # Gradio passes None when no image is uploaded, so guard before os.path.exists
    if not image_path or not os.path.exists(image_path):
        return "Please upload a valid image."

    messages = []
    if system_role != "":
        messages.append({
            "role": "system",
            "content": [{"type": "text", "text": system_role}],
        })
    messages.append({
        "role": "user",
        "content": [
            {"type": "text", "text": user_prompt},
            {"type": "image", "path": image_path},
        ],
    })

    # Prepare input
    inputs = processor.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
    ).to(model.device)

    # Convert only float32 tensors (pixel_values) to float16; token ids stay integer
    for k, v in inputs.items():
        if v.dtype == torch.float32:
            inputs[k] = v.to(torch.float16)

    # Generate response (greedy decoding, deterministic)
    generated_ids = model.generate(**inputs, do_sample=False, max_new_tokens=1024)

    # Decode, keep only the assistant's turn, and trim stray blank runs
    generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)
    return generated_texts[0].split("Assistant:")[-1].replace("\n\n\n\n\n\n", "").strip()


def ui():
    return gr.Interface(
        fn=describe_image,
        inputs=[
            gr.Image(type="filepath", label="Upload Image"),
            gr.Textbox(value="Describe the image in detail.", label="User Prompt"),
            gr.Textbox(value="", label="System Role (Optional)"),
        ],
        outputs=gr.Textbox(label="Image Description"),
        title="Image Captioning App",
        description="Upload an image and customize prompts to get a detailed description.",
    )


demo = ui()
demo.queue().launch()
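Since demo.queue().launch() blocks, a quick way to smoke-test the model without the web UI is to call describe_image directly before the launch call (or in a notebook cell once the model is loaded). A minimal sketch, not part of the commit; test.jpg is a hypothetical local file:

# Sketch only: exercise describe_image() without the Gradio UI.
# "test.jpg" is a hypothetical path; substitute any local image.
caption = describe_image(
    "test.jpg",
    user_prompt="List the main objects in the image.",
    system_role="You are a concise visual assistant.",
)
print(caption)

Because generation uses do_sample=False (greedy decoding), the same image and prompt produce the same caption, which makes this kind of spot check reproducible.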