Spaces:

nielsr
/

kosmos-2.5-demo

Running on Zero

App Files Files Community

prithivMLmods committed 11 days ago

Commit

f2254d0

verified ·

1 Parent(s): dce53e9

rm lazy-loading : preloaded at startup

Browse files

Files changed (1) hide show

app.py +66 -128

app.py CHANGED Viewed

@@ -1,15 +1,13 @@
 import spaces
 import torch
 import gradio as gr
-from PIL import Image
 from transformers import AutoProcessor, Kosmos2_5ForConditionalGeneration
 import re
-# Check if CUDA is available
 device = "cuda" if torch.cuda.is_available() else "cpu"
 dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
-# Check if Flash Attention 2 is available
 def is_flash_attention_available():
     try:
         import flash_attn
@@ -17,63 +15,33 @@ def is_flash_attention_available():
     except ImportError:
         return False
-# Initialize models and processors lazily
-base_model = None
-base_processor = None
-chat_model = None
-chat_processor = None
-def load_base_model():
-    global base_model, base_processor
-    if base_model is None:
-        base_repo = "microsoft/kosmos-2.5"
-        # Use Flash Attention 2 if available, otherwise use default attention
-        model_kwargs = {
-            "device_map": "cuda",
-            "dtype": dtype,
-        }
-        if is_flash_attention_available():
-            model_kwargs["attn_implementation"] = "flash_attention_2"
-        base_model = Kosmos2_5ForConditionalGeneration.from_pretrained(
-            base_repo,
-            **model_kwargs
-        )
-        base_processor = AutoProcessor.from_pretrained(base_repo)
-    return base_model, base_processor
-def load_chat_model():
-    global chat_model, chat_processor
-    if chat_model is None:
-        chat_repo = "microsoft/kosmos-2.5-chat"
-        # Use Flash Attention 2 if available, otherwise use default attention
-        model_kwargs = {
-            "device_map": "cuda",
-            "dtype": dtype,
-        }
-        if is_flash_attention_available():
-            model_kwargs["attn_implementation"] = "flash_attention_2"
-        chat_model = Kosmos2_5ForConditionalGeneration.from_pretrained(
-            chat_repo,
-            **model_kwargs
-        )
-        chat_processor = AutoProcessor.from_pretrained(chat_repo)
-    return chat_model, chat_processor
 def post_process_ocr(y, scale_height, scale_width, prompt="<ocr>"):
     y = y.replace(prompt, "")
     if "<md>" in prompt:
         return y
     pattern = r"<bbox><x_\d+><y_\d+><x_\d+><y_\d+></bbox>"
     bboxs_raw = re.findall(pattern, y)
     lines = re.split(pattern, y)[1:]
     bboxs = [re.findall(r"\d+", i) for i in bboxs_raw]
     bboxs = [[int(j) for j in i] for i in bboxs]
     info = ""
     for i in range(len(lines)):
         if i < len(bboxs):
@@ -91,65 +59,58 @@ def post_process_ocr(y, scale_height, scale_width, prompt="<ocr>"):
 def generate_markdown(image):
     if image is None:
         return "Please upload an image."
-    model, processor = load_base_model()
     prompt = "<md>"
-    inputs = processor(text=prompt, images=image, return_tensors="pt")
     height, width = inputs.pop("height"), inputs.pop("width")
     raw_width, raw_height = image.size
     scale_height = raw_height / height
     scale_width = raw_width / width
     inputs = {k: v.to("cuda") if v is not None else None for k, v in inputs.items()}
     inputs["flattened_patches"] = inputs["flattened_patches"].to(dtype)
     with torch.no_grad():
-        generated_ids = model.generate(
             **inputs,
             max_new_tokens=1024,
         )
-    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
     result = generated_text[0].replace(prompt, "").strip()
     return result
 @spaces.GPU(duration=120)
 def generate_ocr(image):
     if image is None:
         return "Please upload an image.", None
-    model, processor = load_base_model()
     prompt = "<ocr>"
-    inputs = processor(text=prompt, images=image, return_tensors="pt")
     height, width = inputs.pop("height"), inputs.pop("width")
     raw_width, raw_height = image.size
     scale_height = raw_height / height
     scale_width = raw_width / width
     inputs = {k: v.to("cuda") if v is not None else None for k, v in inputs.items()}
     inputs["flattened_patches"] = inputs["flattened_patches"].to(dtype)
     with torch.no_grad():
-        generated_ids = model.generate(
             **inputs,
             max_new_tokens=1024,
         )
-    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
-    # Post-process OCR output
     output_text = post_process_ocr(generated_text[0], scale_height, scale_width)
-    # Create visualization
-    from PIL import ImageDraw
     vis_image = image.copy()
     draw = ImageDraw.Draw(vis_image)
     lines = output_text.split("\n")
     for line in lines:
         if not line.strip():
@@ -161,7 +122,7 @@ def generate_ocr(image):
                 draw.polygon(coords, outline="red", width=2)
             except:
                 continue
     return output_text, vis_image
 @spaces.GPU(duration=120)
@@ -170,54 +131,49 @@ def generate_chat_response(image, question):
         return "Please upload an image."
     if not question.strip():
         return "Please ask a question."
-    model, processor = load_chat_model()
     template = "<md>A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: {} ASSISTANT:"
     prompt = template.format(question)
-    inputs = processor(text=prompt, images=image, return_tensors="pt")
     height, width = inputs.pop("height"), inputs.pop("width")
     raw_width, raw_height = image.size
     scale_height = raw_height / height
     scale_width = raw_width / width
     inputs = {k: v.to("cuda") if v is not None else None for k, v in inputs.items()}
     inputs["flattened_patches"] = inputs["flattened_patches"].to(dtype)
     with torch.no_grad():
-        generated_ids = model.generate(
             **inputs,
             max_new_tokens=1024,
         )
-    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
-    # Extract only the assistant's response
     result = generated_text[0]
     if "ASSISTANT:" in result:
         result = result.split("ASSISTANT:")[-1].strip()
     return result
-# Create Gradio interface
 with gr.Blocks(title="KOSMOS-2.5 Document AI Demo", theme=gr.themes.Soft()) as demo:
     gr.Markdown("""
     # KOSMOS-2.5 Document AI Demo
     Explore Microsoft's KOSMOS-2.5, a multimodal model for reading text-intensive images!
     This demo showcases three capabilities:
     1. **Markdown Generation**: Convert document images to markdown format
     2. **OCR with Bounding Boxes**: Extract text with spatial coordinates
     3. **Document Q&A**: Ask questions about document content using KOSMOS-2.5 Chat
     Upload a document image (receipt, form, article, etc.) and try different tasks!
     """)
     with gr.Tabs():
-        # Markdown Generation Tab
         with gr.TabItem("📝 Markdown Generation"):
             with gr.Row():
                 with gr.Column():
@@ -229,13 +185,12 @@ with gr.Blocks(title="KOSMOS-2.5 Document AI Demo", theme=gr.themes.Soft()) as d
                     md_button = gr.Button("Generate Markdown", variant="primary")
                 with gr.Column():
                     md_output = gr.Textbox(
-                        label="Generated Markdown",
-                        lines=15,
                         max_lines=20,
                         show_copy_button=True
                     )
-        # OCR Tab
         with gr.TabItem("🔍 OCR with Bounding Boxes"):
             with gr.Row():
                 with gr.Column():
@@ -248,13 +203,12 @@ with gr.Blocks(title="KOSMOS-2.5 Document AI Demo", theme=gr.themes.Soft()) as d
                 with gr.Column():
                     with gr.Row():
                         ocr_text = gr.Textbox(
-                            label="Extracted Text with Coordinates",
                             lines=10,
                             show_copy_button=True
                         )
                         ocr_vis = gr.Image(label="Visualization (Red boxes show detected text)")
-        # Chat Tab
         with gr.TabItem("💬 Document Q&A (Chat)"):
             with gr.Row():
                 with gr.Column():
@@ -275,38 +229,22 @@ with gr.Blocks(title="KOSMOS-2.5 Document AI Demo", theme=gr.themes.Soft()) as d
                     chat_button = gr.Button("Get Answer", variant="primary")
                 with gr.Column():
                     chat_output = gr.Textbox(
-                        label="Answer",
                         lines=8,
                         show_copy_button=True
                     )
-    # Event handlers
-    md_button.click(
-        fn=generate_markdown,
-        inputs=[md_image],
-        outputs=[md_output]
-    )
-    ocr_button.click(
-        fn=generate_ocr,
-        inputs=[ocr_image],
-        outputs=[ocr_text, ocr_vis]
-    )
-    chat_button.click(
-        fn=generate_chat_response,
-        inputs=[chat_image, chat_question],
-        outputs=[chat_output]
-    )
-    # Examples section
     gr.Markdown("""
     ## Example Use Cases:
     - **Receipts**: Extract itemized information or ask about totals
     - **Forms**: Convert to structured format or answer specific questions
     - **Articles**: Get markdown format or ask about content
     - **Screenshots**: Extract text or get information about specific elements
     ## Note:
     This is a generative model and may occasionally hallucinate. Results should be verified for accuracy.
     """)

 import spaces
 import torch
 import gradio as gr
+from PIL import Image, ImageDraw
 from transformers import AutoProcessor, Kosmos2_5ForConditionalGeneration
 import re
 device = "cuda" if torch.cuda.is_available() else "cpu"
 dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
 def is_flash_attention_available():
     try:
         import flash_attn
     except ImportError:
         return False
+# Load models once at startup
+base_repo = "microsoft/kosmos-2.5"
+chat_repo = "microsoft/kosmos-2.5-chat"
+model_kwargs = {
+    "device_map": "cuda",
+    "dtype": dtype,
+}
+if is_flash_attention_available():
+    model_kwargs["attn_implementation"] = "flash_attention_2"
+base_model = Kosmos2_5ForConditionalGeneration.from_pretrained(base_repo, **model_kwargs)
+base_processor = AutoProcessor.from_pretrained(base_repo)
+chat_model = Kosmos2_5ForConditionalGeneration.from_pretrained(chat_repo, **model_kwargs)
+chat_processor = AutoProcessor.from_pretrained(chat_repo)
 def post_process_ocr(y, scale_height, scale_width, prompt="<ocr>"):
     y = y.replace(prompt, "")
     if "<md>" in prompt:
         return y
     pattern = r"<bbox><x_\d+><y_\d+><x_\d+><y_\d+></bbox>"
     bboxs_raw = re.findall(pattern, y)
     lines = re.split(pattern, y)[1:]
     bboxs = [re.findall(r"\d+", i) for i in bboxs_raw]
     bboxs = [[int(j) for j in i] for i in bboxs]
     info = ""
     for i in range(len(lines)):
         if i < len(bboxs):
 def generate_markdown(image):
     if image is None:
         return "Please upload an image."
     prompt = "<md>"
+    inputs = base_processor(text=prompt, images=image, return_tensors="pt")
     height, width = inputs.pop("height"), inputs.pop("width")
     raw_width, raw_height = image.size
     scale_height = raw_height / height
     scale_width = raw_width / width
     inputs = {k: v.to("cuda") if v is not None else None for k, v in inputs.items()}
     inputs["flattened_patches"] = inputs["flattened_patches"].to(dtype)
     with torch.no_grad():
+        generated_ids = base_model.generate(
             **inputs,
             max_new_tokens=1024,
         )
+    generated_text = base_processor.batch_decode(generated_ids, skip_special_tokens=True)
     result = generated_text[0].replace(prompt, "").strip()
     return result
 @spaces.GPU(duration=120)
 def generate_ocr(image):
     if image is None:
         return "Please upload an image.", None
     prompt = "<ocr>"
+    inputs = base_processor(text=prompt, images=image, return_tensors="pt")
     height, width = inputs.pop("height"), inputs.pop("width")
     raw_width, raw_height = image.size
     scale_height = raw_height / height
     scale_width = raw_width / width
     inputs = {k: v.to("cuda") if v is not None else None for k, v in inputs.items()}
     inputs["flattened_patches"] = inputs["flattened_patches"].to(dtype)
     with torch.no_grad():
+        generated_ids = base_model.generate(
             **inputs,
             max_new_tokens=1024,
         )
+    generated_text = base_processor.batch_decode(generated_ids, skip_special_tokens=True)
     output_text = post_process_ocr(generated_text[0], scale_height, scale_width)
     vis_image = image.copy()
     draw = ImageDraw.Draw(vis_image)
     lines = output_text.split("\n")
     for line in lines:
         if not line.strip():
                 draw.polygon(coords, outline="red", width=2)
             except:
                 continue
     return output_text, vis_image
 @spaces.GPU(duration=120)
         return "Please upload an image."
     if not question.strip():
         return "Please ask a question."
     template = "<md>A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: {} ASSISTANT:"
     prompt = template.format(question)
+    inputs = chat_processor(text=prompt, images=image, return_tensors="pt")
     height, width = inputs.pop("height"), inputs.pop("width")
     raw_width, raw_height = image.size
     scale_height = raw_height / height
     scale_width = raw_width / width
     inputs = {k: v.to("cuda") if v is not None else None for k, v in inputs.items()}
     inputs["flattened_patches"] = inputs["flattened_patches"].to(dtype)
     with torch.no_grad():
+        generated_ids = chat_model.generate(
             **inputs,
             max_new_tokens=1024,
         )
+    generated_text = chat_processor.batch_decode(generated_ids, skip_special_tokens=True)
     result = generated_text[0]
     if "ASSISTANT:" in result:
         result = result.split("ASSISTANT:")[-1].strip()
     return result
 with gr.Blocks(title="KOSMOS-2.5 Document AI Demo", theme=gr.themes.Soft()) as demo:
     gr.Markdown("""
     # KOSMOS-2.5 Document AI Demo
     Explore Microsoft's KOSMOS-2.5, a multimodal model for reading text-intensive images!
     This demo showcases three capabilities:
     1. **Markdown Generation**: Convert document images to markdown format
     2. **OCR with Bounding Boxes**: Extract text with spatial coordinates
     3. **Document Q&A**: Ask questions about document content using KOSMOS-2.5 Chat
     Upload a document image (receipt, form, article, etc.) and try different tasks!
     """)
     with gr.Tabs():
         with gr.TabItem("📝 Markdown Generation"):
             with gr.Row():
                 with gr.Column():
                     md_button = gr.Button("Generate Markdown", variant="primary")
                 with gr.Column():
                     md_output = gr.Textbox(
+                        label="Generated Markdown",
+                        lines=15,
                         max_lines=20,
                         show_copy_button=True
                     )
         with gr.TabItem("🔍 OCR with Bounding Boxes"):
             with gr.Row():
                 with gr.Column():
                 with gr.Column():
                     with gr.Row():
                         ocr_text = gr.Textbox(
+                            label="Extracted Text with Coordinates",
                             lines=10,
                             show_copy_button=True
                         )
                         ocr_vis = gr.Image(label="Visualization (Red boxes show detected text)")
         with gr.TabItem("💬 Document Q&A (Chat)"):
             with gr.Row():
                 with gr.Column():
                     chat_button = gr.Button("Get Answer", variant="primary")
                 with gr.Column():
                     chat_output = gr.Textbox(
+                        label="Answer",
                         lines=8,
                         show_copy_button=True
                     )
+    md_button.click(fn=generate_markdown, inputs=[md_image], outputs=[md_output])
+    ocr_button.click(fn=generate_ocr, inputs=[ocr_image], outputs=[ocr_text, ocr_vis])
+    chat_button.click(fn=generate_chat_response, inputs=[chat_image, chat_question], outputs=[chat_output])
     gr.Markdown("""
     ## Example Use Cases:
     - **Receipts**: Extract itemized information or ask about totals
     - **Forms**: Convert to structured format or answer specific questions
     - **Articles**: Get markdown format or ask about content
     - **Screenshots**: Extract text or get information about specific elements
     ## Note:
     This is a generative model and may occasionally hallucinate. Results should be verified for accuracy.
     """)