Spaces:

breadlicker45
/

PaliGemma2

Running on Zero

App Files Files Community

breadlicker45 committed on Dec 15, 2024

Commit

1de48dc

verified ·

1 Parent(s): eef3b6b

Update app.py

Browse files

Files changed (1) hide show

app.py +9 -5

app.py CHANGED Viewed

@@ -23,18 +23,18 @@ def load_model():
         )
     # Load the processor and model using the correct identifier
-    model_id = "google/paligemma2-10b-pt-224"
-    processor = PaliGemmaProcessor.from_pretrained(model_id, token=token)
     device = "cuda" if torch.cuda.is_available() else "cpu"
     model = PaliGemmaForConditionalGeneration.from_pretrained(
-        model_id, torch_dtype=torch.bfloat16, token=token
     ).to(device).eval()
     return processor, model
 @spaces.GPU(duration=120)  # Increased timeout to 120 seconds
-def process_image_and_text(image_pil, text_input):
     """Extract text from image using PaliGemma2."""
     try:
         processor, model = load_model()
@@ -43,6 +43,9 @@ def process_image_and_text(image_pil, text_input):
         # Load the image using load_image
         image = load_image(image_pil)
         # Use the provided text input
         model_inputs = processor(text=text_input, images=image, return_tensors="pt").to(
             device, dtype=torch.bfloat16
@@ -50,7 +53,7 @@ def process_image_and_text(image_pil, text_input):
         input_len = model_inputs["input_ids"].shape[-1]
         with torch.inference_mode():
-            generation = model.generate(**model_inputs, max_new_tokens=100, do_sample=False)
             generation = generation[0][input_len:]
             decoded = processor.decode(generation, skip_special_tokens=True)
@@ -66,6 +69,7 @@ if __name__ == "__main__":
         inputs=[
             gr.Image(type="pil", label="Upload an image"),
             gr.Textbox(label="Enter Text Prompt"),
         ],
         outputs=gr.Textbox(label="Generated Text"),
         title="PaliGemma2 Image and Text to Text",

         )
     # Load the processor and model using the correct identifier
+    model_id = "google/paligemma2-28b-pt-896"
+    processor = PaliGemmaProcessor.from_pretrained(model_id, use_auth_token=token)
     device = "cuda" if torch.cuda.is_available() else "cpu"
     model = PaliGemmaForConditionalGeneration.from_pretrained(
+        model_id, torch_dtype=torch.bfloat16, use_auth_token=token
     ).to(device).eval()
     return processor, model
 @spaces.GPU(duration=120)  # Increased timeout to 120 seconds
+def process_image_and_text(image_pil, text_input, num_beams):
     """Extract text from image using PaliGemma2."""
     try:
         processor, model = load_model()
         # Load the image using load_image
         image = load_image(image_pil)
+        # Add <image> token to the beginning of the text prompt
+        text_input = "<image> " + text_input
         # Use the provided text input
         model_inputs = processor(text=text_input, images=image, return_tensors="pt").to(
             device, dtype=torch.bfloat16
         input_len = model_inputs["input_ids"].shape[-1]
         with torch.inference_mode():
+            generation = model.generate(**model_inputs, max_new_tokens=200, do_sample=False, num_beams=num_beams)
             generation = generation[0][input_len:]
             decoded = processor.decode(generation, skip_special_tokens=True)
         inputs=[
             gr.Image(type="pil", label="Upload an image"),
             gr.Textbox(label="Enter Text Prompt"),
+            gr.Slider(minimum=1, maximum=10, step=1, value=1, label="Number of Beams"),
         ],
         outputs=gr.Textbox(label="Generated Text"),
         title="PaliGemma2 Image and Text to Text",