rodrigomasini committed on
Commit
b03699f
·
verified ·
1 Parent(s): 02572c5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +5 -39
app.py CHANGED
@@ -165,18 +165,15 @@ def draw_entity_boxes_on_image(image, entities, show=False, save_path=None, enti
165
 
166
  def main():
167
 
168
- ckpt = "microsoft/kosmos-2-patch14-224"
169
 
170
  model = AutoModelForVision2Seq.from_pretrained(ckpt).to("cuda")
171
  processor = AutoProcessor.from_pretrained(ckpt)
172
 
173
  def generate_predictions(image_input, text_input):
174
 
175
- # Save the image and load it again to match the original Kosmos-2 demo.
176
- # (https://github.com/microsoft/unilm/blob/f4695ed0244a275201fff00bee495f76670fbe70/kosmos-2/demo/gradio_app.py#L345-L346)
177
  user_image_path = "/tmp/user_input_test_image.jpg"
178
  image_input.save(user_image_path)
179
- # This might give different results from the original argument `image_input`
180
  image_input = Image.open(user_image_path)
181
 
182
  if text_input == "Brief":
@@ -195,7 +192,7 @@ def main():
195
  image_embeds=None,
196
  image_embeds_position_mask=inputs["image_embeds_position_mask"],
197
  use_cache=True,
198
- max_new_tokens=128,
199
  )
200
 
201
  generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
@@ -214,9 +211,6 @@ def main():
214
  # skip bounding bbox without a `phrase` associated
215
  continue
216
  color_id += 1
217
- # for bbox_id, _ in enumerate(bboxes):
218
- # if start is None and bbox_id > 0:
219
- # color_id += 1
220
  entity_info.append(((start, end), color_id))
221
  filtered_entities.append(entity)
222
 
@@ -234,21 +228,9 @@ def main():
234
 
235
  return annotated_image, colored_text, str(filtered_entities)
236
 
237
- term_of_use = """
238
- ### Terms of use
239
- By using this model, users are required to agree to the following terms:
240
- The model is intended for academic and research purposes.
241
- The utilization of the model to create unsuitable material is strictly forbidden and not endorsed by this work.
242
- The accountability for any improper or unacceptable application of the model rests exclusively with the individuals who generated such content.
243
-
244
- ### License
245
- This project is licensed under the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct).
246
- """
247
-
248
- with gr.Blocks(title="Kosmos-2", theme=gr.themes.Base()).queue() as demo:
249
  gr.Markdown(("""
250
- # Kosmos-2: Grounding Multimodal Large Language Models to the World
251
- [[Paper]](https://arxiv.org/abs/2306.14824) [[Code]](https://github.com/microsoft/unilm/blob/master/kosmos-2)
252
  """))
253
  with gr.Row():
254
  with gr.Column():
@@ -265,21 +247,6 @@ def main():
265
  show_legend=True,
266
  ).style(color_map=color_map)
267
 
268
- with gr.Row():
269
- with gr.Column():
270
- gr.Examples(examples=[
271
- ["images/two_dogs.jpg", "Detailed"],
272
- ["images/snowman.png", "Brief"],
273
- ["images/man_ball.png", "Detailed"],
274
- ], inputs=[image_input, text_input])
275
- with gr.Column():
276
- gr.Examples(examples=[
277
- ["images/six_planes.png", "Brief"],
278
- ["images/quadrocopter.jpg", "Brief"],
279
- ["images/carnaby_street.jpg", "Brief"],
280
- ], inputs=[image_input, text_input])
281
- gr.Markdown(term_of_use)
282
-
283
  # record which text span (label) is selected
284
  selected = gr.Number(-1, show_label=False, placeholder="Selected", visible=False)
285
 
@@ -310,5 +277,4 @@ def main():
310
 
311
 
312
  if __name__ == "__main__":
313
- main()
314
- # trigger
 
165
 
166
  def main():
167
 
168
+ ckpt = "microsoft/kosmos-2-patch14-224" # mit license
169
 
170
  model = AutoModelForVision2Seq.from_pretrained(ckpt).to("cuda")
171
  processor = AutoProcessor.from_pretrained(ckpt)
172
 
173
  def generate_predictions(image_input, text_input):
174
 
 
 
175
  user_image_path = "/tmp/user_input_test_image.jpg"
176
  image_input.save(user_image_path)
 
177
  image_input = Image.open(user_image_path)
178
 
179
  if text_input == "Brief":
 
192
  image_embeds=None,
193
  image_embeds_position_mask=inputs["image_embeds_position_mask"],
194
  use_cache=True,
195
+ max_new_tokens=256, #original was 128
196
  )
197
 
198
  generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
 
211
  # skip bounding bbox without a `phrase` associated
212
  continue
213
  color_id += 1
 
 
 
214
  entity_info.append(((start, end), color_id))
215
  filtered_entities.append(entity)
216
 
 
228
 
229
  return annotated_image, colored_text, str(filtered_entities)
230
 
231
+ with gr.Blocks(title="MAGIC Image Captioning", theme='sudeepshouche/minimalist').queue() as demo:
 
 
 
 
 
 
 
 
 
 
 
232
  gr.Markdown(("""
233
+ # Image description with Entity Recognition
 
234
  """))
235
  with gr.Row():
236
  with gr.Column():
 
247
  show_legend=True,
248
  ).style(color_map=color_map)
249
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
250
  # record which text span (label) is selected
251
  selected = gr.Number(-1, show_label=False, placeholder="Selected", visible=False)
252
 
 
277
 
278
 
279
  if __name__ == "__main__":
280
+ main()