ved1beta committed
Commit 44157d6 · 1 Parent(s): 5b195c1
Files changed (3)
  1. Readme.md +26 -0
  2. app.py +75 -64
  3. requirements.txt +6 -6
Readme.md ADDED
@@ -0,0 +1,26 @@
+ # PaliGemma Image Captioning Gradio App
+
+ ## Deployment Instructions
+
+ 1. Create a new Hugging Face Space
+ 2. Choose Python as the SDK
+ 3. Select 16GB CPU hardware
+ 4. Upload the following files:
+    - `app.py`
+    - `requirements.txt`
+
+ ### HuggingFace Access Token
+
+ 1. Go to HuggingFace settings
+ 2. Create a new access token with "Read" permissions
+ 3. Add the token as a secret named `HF_TOKEN` in your Space settings
+
+ ### Features
+ - Multi-language image captioning
+ - Upload custom images
+ - Example images included
+ - Supports English, Spanish, French, German captions
+
+ ## Model Details
+ - Model: google/paligemma-3b-mix-224
+ - Task: Multilingual Image Captioning
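
Note: Hugging Face Spaces expose secrets to the running app as environment variables, which is how the `HF_TOKEN` secret from step 3 reaches the code; `token=True` in the app's `from_pretrained` calls then lets `huggingface_hub` resolve it automatically. A minimal sketch of making that authentication explicit, useful when debugging access to the gated PaliGemma weights (the secret name `HF_TOKEN` is the one configured above):

```python
import os
from huggingface_hub import login

# Spaces inject secrets as environment variables, so the token
# configured in the Space settings is readable here. PaliGemma is
# a gated model, so the weight download must be authenticated.
hf_token = os.environ.get("HF_TOKEN")
if hf_token:
    login(token=hf_token)
```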
app.py CHANGED
@@ -1,74 +1,85 @@
  import gradio as gr
- from PIL import Image
- from transformers import AutoModelForCausalLM
- from transformers import AutoProcessor
- from transformers import TextIteratorStreamer
- from threading import Thread
  import torch
- import spaces

- model_id = "microsoft/Phi-3-vision-128k-instruct"
- model = AutoModelForCausalLM.from_pretrained(
-     model_id,
-     device_map="cpu",
-     trust_remote_code=True,
-     torch_dtype=torch.float32,
-     _attn_implementation="eager"
- )
- processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)

- @spaces.CPU
- def bot_streaming(message, history):
-     try:
-         image = (message["files"][-1]["path"] if isinstance(message["files"][-1], dict) else message["files"][-1]) if message["files"] else None
-
-         if not image:
-             raise ValueError("No image uploaded")
-
-         conversation = []
-         for user, assistant in history:
-             conversation.extend([
-                 {"role": "user", "content": user},
-                 {"role": "assistant", "content": assistant or ""}
-             ])
-
-         conversation.append({"role": "user", "content": f"<|image_1|>\n{message['text']}"})
-
-         prompt = processor.tokenizer.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True)
-         image = Image.open(image)
-         inputs = processor(prompt, image, return_tensors="pt")
-
-         streamer = TextIteratorStreamer(processor, skip_special_tokens=True, skip_prompt=True)
-         generation_kwargs = dict(
-             inputs,
-             streamer=streamer,
-             max_new_tokens=256,
-             do_sample=False,
-             temperature=0.1,
-             eos_token_id=processor.tokenizer.eos_token_id
-         )
-
-         thread = Thread(target=model.generate, kwargs=generation_kwargs)
-         thread.start()
-
-         buffer = ""
-         for new_text in streamer:
-             buffer += new_text
-             yield buffer
-
-     except Exception as e:
-         yield f"Error: {str(e)}"
-
- demo = gr.Blocks()
- with demo:
-     gr.ChatInterface(
-         fn=bot_streaming,
-         title="Phi3 Vision 128K",
-         description="Multimodal AI Vision Model",
-         examples=[
-             {"text": "Describe this image", "files": ["./example.jpg"]},
-         ]
      )

- demo.queue()
- demo.launch(debug=True)
  import gradio as gr
+ from transformers import AutoProcessor, PaliGemmaForConditionalGeneration
+ from PIL import Image
  import torch
+ import requests

+ # Load the model and processor
+ model_id = "google/paligemma-3b-mix-224"
+ model = PaliGemmaForConditionalGeneration.from_pretrained(model_id, token=True).eval()
+ processor = AutoProcessor.from_pretrained(model_id, token=True)

+ # Supported languages and their captioning prompts
+ LANGUAGES = {
+     "English": "caption en",
+     "Spanish": "caption es",
+     "French": "caption fr",
+     "German": "caption de"
+ }

+ def generate_caption(image, language, max_tokens=100):
+     """Generate an image caption in the specified language."""
+     if image is None:
+         return "Please upload an image."
+
+     prompt = LANGUAGES.get(language, "caption en")
+
+     # Preprocess inputs
+     model_inputs = processor(text=prompt, images=image, return_tensors="pt")
+     input_len = model_inputs["input_ids"].shape[-1]
+
+     # Generate the caption and strip the prompt tokens from the output
+     with torch.inference_mode():
+         generation = model.generate(**model_inputs, max_new_tokens=max_tokens, do_sample=False)
+         generation = generation[0][input_len:]
+         decoded = processor.decode(generation, skip_special_tokens=True)
+
+     return decoded

+ def load_example_image(url):
+     """Load an example image from a URL."""
+     return Image.open(requests.get(url, stream=True).raw)

+ # Prepare example images
+ EXAMPLE_IMAGES = [
+     load_example_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg"),
+     load_example_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/food.jpg"),
+     load_example_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/city.jpg")
+ ]

+ # Create the Gradio interface
+ with gr.Blocks() as demo:
+     gr.Markdown("# PaliGemma Image Captioning")
+     gr.Markdown("Upload an image and get a caption in your preferred language!")
+
+     with gr.Row():
+         with gr.Column():
+             input_image = gr.Image(type="pil", label="Upload Image")
+             language_dropdown = gr.Dropdown(
+                 list(LANGUAGES.keys()),
+                 value="English",
+                 label="Caption Language"
+             )
+             submit_btn = gr.Button("Generate Caption")
+
+         with gr.Column():
+             output_text = gr.Textbox(label="Generated Caption")
+
+     # Connect components
+     submit_btn.click(
+         fn=generate_caption,
+         inputs=[input_image, language_dropdown],
+         outputs=output_text
+     )
+
+     # Add example images
+     gr.Examples(
+         examples=[[img, lang] for img in EXAMPLE_IMAGES for lang in LANGUAGES.keys()],
+         inputs=[input_image, language_dropdown],
+         fn=generate_caption,
+         outputs=output_text
      )

+ # Launch the app
+ if __name__ == "__main__":
+     demo.launch()
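
For a quick sanity check of the captioning path before pushing to the Space, `generate_caption` can be exercised directly. A sketch assuming a hypothetical local image `test.jpg` and an `HF_TOKEN` available in the local environment; importing `app` triggers the model and example-image downloads:

```python
from PIL import Image

from app import generate_caption  # importing app loads the model

# "test.jpg" is a hypothetical local test image
image = Image.open("test.jpg")
for lang in ("English", "Spanish", "French", "German"):
    print(lang, "->", generate_caption(image, lang))
```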
requirements.txt CHANGED
@@ -1,6 +1,6 @@
- #flash_attn
- accelerate
- git+https://github.com/huggingface/transformers.git@main
- spaces
- torchvision
-
+ torch>=2.0.0
+ transformers>=4.41.0
+ gradio>=4.0.0
+ pillow>=10.0.0
+ huggingface_hub
+ requests