demo-image-description-with-label

Sleeping

App Files Community

rodrigomasini committed on May 1, 2024

Commit

b03699f

verified ·

1 Parent(s): 02572c5

Update app.py

Browse files

Files changed (1) hide show

app.py +5 -39

app.py CHANGED Viewed

@@ -165,18 +165,15 @@ def draw_entity_boxes_on_image(image, entities, show=False, save_path=None, enti
 def main():
-    ckpt = "microsoft/kosmos-2-patch14-224"
     model = AutoModelForVision2Seq.from_pretrained(ckpt).to("cuda")
     processor = AutoProcessor.from_pretrained(ckpt)
     def generate_predictions(image_input, text_input):
-        # Save the image and load it again to match the original Kosmos-2 demo.
-        # (https://github.com/microsoft/unilm/blob/f4695ed0244a275201fff00bee495f76670fbe70/kosmos-2/demo/gradio_app.py#L345-L346)
         user_image_path = "/tmp/user_input_test_image.jpg"
         image_input.save(user_image_path)
-        # This might give different results from the original argument `image_input`
         image_input = Image.open(user_image_path)
         if text_input == "Brief":
@@ -195,7 +192,7 @@ def main():
             image_embeds=None,
             image_embeds_position_mask=inputs["image_embeds_position_mask"],
             use_cache=True,
-            max_new_tokens=128,
         )
         generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
@@ -214,9 +211,6 @@ def main():
                 # skip bounding bbox without a `phrase` associated
                 continue
             color_id += 1
-            # for bbox_id, _ in enumerate(bboxes):
-                # if start is None and bbox_id > 0:
-                #     color_id += 1
             entity_info.append(((start, end), color_id))
             filtered_entities.append(entity)
@@ -234,21 +228,9 @@ def main():
         return annotated_image, colored_text, str(filtered_entities)
-    term_of_use = """
-    ### Terms of use
-    By using this model, users are required to agree to the following terms:
-    The model is intended for academic and research purposes.
-    The utilization of the model to create unsuitable material is strictly forbidden and not endorsed by this work.
-    The accountability for any improper or unacceptable application of the model rests exclusively with the individuals who generated such content.
-    ### License
-    This project is licensed under the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct).
-    """
-    with gr.Blocks(title="Kosmos-2", theme=gr.themes.Base()).queue() as demo:
         gr.Markdown(("""
-            # Kosmos-2: Grounding Multimodal Large Language Models to the World
-            [[Paper]](https://arxiv.org/abs/2306.14824) [[Code]](https://github.com/microsoft/unilm/blob/master/kosmos-2)
             """))
         with gr.Row():
             with gr.Column():
@@ -265,21 +247,6 @@ def main():
                                     show_legend=True,
                                 ).style(color_map=color_map)
-        with gr.Row():
-            with gr.Column():
-                gr.Examples(examples=[
-                            ["images/two_dogs.jpg", "Detailed"],
-                            ["images/snowman.png", "Brief"],
-                            ["images/man_ball.png", "Detailed"],
-                        ], inputs=[image_input, text_input])
-            with gr.Column():
-                gr.Examples(examples=[
-                            ["images/six_planes.png", "Brief"],
-                            ["images/quadrocopter.jpg", "Brief"],
-                            ["images/carnaby_street.jpg", "Brief"],
-                        ], inputs=[image_input, text_input])
-        gr.Markdown(term_of_use)
         # record which text span (label) is selected
         selected = gr.Number(-1, show_label=False, placeholder="Selected", visible=False)
@@ -310,5 +277,4 @@ def main():
 if __name__ == "__main__":
-    main()
-    # trigger

 def main():
+    ckpt = "microsoft/kosmos-2-patch14-224" # mit license
     model = AutoModelForVision2Seq.from_pretrained(ckpt).to("cuda")
     processor = AutoProcessor.from_pretrained(ckpt)
     def generate_predictions(image_input, text_input):
         user_image_path = "/tmp/user_input_test_image.jpg"
         image_input.save(user_image_path)
         image_input = Image.open(user_image_path)
         if text_input == "Brief":
             image_embeds=None,
             image_embeds_position_mask=inputs["image_embeds_position_mask"],
             use_cache=True,
+            max_new_tokens=256, #original was 128
         )
         generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
                 # skip bounding bbox without a `phrase` associated
                 continue
             color_id += 1
             entity_info.append(((start, end), color_id))
             filtered_entities.append(entity)
         return annotated_image, colored_text, str(filtered_entities)
+    with gr.Blocks(title="MAGIC Image Captioning", theme='sudeepshouche/minimalist').queue() as demo:
         gr.Markdown(("""
+            # Image description with Entity Recognition
             """))
         with gr.Row():
             with gr.Column():
                                     show_legend=True,
                                 ).style(color_map=color_map)
         # record which text span (label) is selected
         selected = gr.Number(-1, show_label=False, placeholder="Selected", visible=False)
 if __name__ == "__main__":
+    main()