dung-vpt-uney committed
Commit 0ad7cf7 · 1 Parent(s): 799282e

Deploy latest CoRGI Gradio demo

PROGRESS_LOG.md CHANGED
@@ -12,6 +12,7 @@
 - Updated `app.py` to fall back to `demo.queue()` when `concurrency_count` is unsupported, fixing the runtime error seen on Spaces.
 - Added ZeroGPU support: cached model/processor globals live on CUDA when available, a `@spaces.GPU`-decorated executor handles pipeline runs, and requirements now include the `spaces` SDK.
 - Introduced structured logging for the app (`app.py`) and pipeline execution to trace model loads, cache hits, and Gradio lifecycle events on Spaces.
+- Reworked the Gradio UI to show per-step panels with annotated evidence galleries, giving each CoRGI reasoning step its own window alongside the final synthesized answer.
 
 ## 2024-10-21
 - Updated default checkpoints to `Qwen/Qwen3-VL-8B-Thinking` and verified CLI/Gradio/test coverage.
corgi/__pycache__/gradio_app.cpython-313.pyc CHANGED
Binary files a/corgi/__pycache__/gradio_app.cpython-313.pyc and b/corgi/__pycache__/gradio_app.cpython-313.pyc differ
 
corgi/gradio_app.py CHANGED
@@ -2,18 +2,19 @@ from __future__ import annotations
 
 import logging
 from dataclasses import dataclass
-from typing import Callable, Optional
+from typing import Callable, Dict, List, Optional, Tuple
 
 try:
     import spaces  # type: ignore
 except ImportError:  # pragma: no cover - spaces library only on HF Spaces
     spaces = None  # type: ignore
 
-from PIL import Image
+from PIL import Image, ImageDraw
 
 from .cli import DEFAULT_MODEL_ID
 from .pipeline import CoRGIPipeline, PipelineResult
 from .qwen_client import Qwen3VLClient, QwenGenerationConfig
+from .types import GroundedEvidence
 
 
 @dataclass
@@ -26,6 +27,22 @@ _PIPELINE_CACHE: dict[str, CoRGIPipeline] = {}
 _GLOBAL_FACTORY: Callable[[Optional[str]], CoRGIPipeline] | None = None
 logger = logging.getLogger("corgi.gradio_app")
 
+MAX_UI_STEPS = 6
+GALLERY_MAX_DIM = 768
+EVIDENCE_COLORS: Tuple[Tuple[int, int, int], ...] = (
+    (244, 67, 54),   # red
+    (255, 193, 7),   # amber
+    (76, 175, 80),   # green
+    (33, 150, 243),  # blue
+    (156, 39, 176),  # purple
+    (255, 87, 34),   # deep orange
+)
+
+try:
+    _THUMBNAIL_RESAMPLE = Image.Resampling.LANCZOS  # type: ignore[attr-defined]
+except AttributeError:  # pragma: no cover - Pillow < 9.1
+    _THUMBNAIL_RESAMPLE = Image.LANCZOS  # type: ignore
+
 
 def _default_factory(model_id: Optional[str]) -> CoRGIPipeline:
     config = QwenGenerationConfig(model_id=model_id or DEFAULT_MODEL_ID)
@@ -59,13 +76,107 @@ def _execute_pipeline(
         max_regions,
     )
     return pipeline.run(
-        image=image.convert("RGB"),
+        image=image,
         question=question,
         max_steps=max_steps,
         max_regions=max_regions,
     )
 
 
+def _group_evidence_by_step(evidences: List[GroundedEvidence]) -> Dict[int, List[GroundedEvidence]]:
+    grouped: Dict[int, List[GroundedEvidence]] = {}
+    for ev in evidences:
+        grouped.setdefault(ev.step_index, []).append(ev)
+    return grouped
+
+
+def _format_evidence_caption(evidence: GroundedEvidence) -> str:
+    bbox_str = ", ".join(f"{coord:.2f}" for coord in evidence.bbox)
+    parts = [f"Step {evidence.step_index}"]
+    if evidence.description:
+        parts.append(evidence.description)
+    if evidence.confidence is not None:
+        parts.append(f"Confidence: {evidence.confidence:.2f}")
+    parts.append(f"BBox: ({bbox_str})")
+    return "\n".join(parts)
+
+
+def _annotate_evidence_image(
+    image: Image.Image,
+    evidence: GroundedEvidence,
+    color: Tuple[int, int, int],
+) -> Image.Image:
+    base = image.copy().convert("RGBA")
+    overlay = Image.new("RGBA", base.size, (0, 0, 0, 0))
+    draw = ImageDraw.Draw(overlay)
+    width, height = base.size
+
+    x1 = max(0, min(int(evidence.bbox[0] * width), width - 1))
+    y1 = max(0, min(int(evidence.bbox[1] * height), height - 1))
+    x2 = max(0, min(int(evidence.bbox[2] * width), width - 1))
+    y2 = max(0, min(int(evidence.bbox[3] * height), height - 1))
+    x1, x2 = sorted((x1, x2))
+    y1, y2 = sorted((y1, y2))
+
+    outline_width = max(2, int(min(width, height) * 0.005))
+    rgba_color = color + (255,)
+    fill_color = color + (64,)
+
+    draw.rectangle([x1, y1, x2, y2], fill=fill_color, outline=rgba_color, width=outline_width)
+    annotated = Image.alpha_composite(base, overlay).convert("RGB")
+    if max(annotated.size) > GALLERY_MAX_DIM:
+        annotated.thumbnail((GALLERY_MAX_DIM, GALLERY_MAX_DIM), _THUMBNAIL_RESAMPLE)
+    return annotated
+
+
+def _empty_ui_payload(message: str, max_slots: int = MAX_UI_STEPS) -> Tuple[str, List[str], List[List[Tuple[Image.Image, str]]]]:
+    return (
+        f"### Final Answer\n{message}",
+        ["_No step data available._" for _ in range(max_slots)],
+        [[] for _ in range(max_slots)],
+    )
+
+
+def _prepare_ui_payload(
+    image: Image.Image,
+    result: PipelineResult,
+    max_slots: int = MAX_UI_STEPS,
+) -> Tuple[str, List[str], List[List[Tuple[Image.Image, str]]]]:
+    answer_text = f"### Final Answer\n{result.answer or '(no answer returned)'}"
+    evidences_by_step = _group_evidence_by_step(result.evidence)
+
+    step_markdowns: List[str] = []
+    step_galleries: List[List[Tuple[Image.Image, str]]] = []
+
+    for slot_index in range(max_slots):
+        if slot_index < len(result.steps):
+            step = result.steps[slot_index]
+            evidences = evidences_by_step.get(step.index, [])
+            lines = [
+                f"**Step {step.index}:** {step.statement}",
+                f"- Needs vision: {'yes' if step.needs_vision else 'no'}",
+            ]
+            if step.reason:
+                lines.append(f"- Reason: {step.reason}")
+            if evidences:
+                lines.append(f"- Evidence items: {len(evidences)} (see gallery below)")
+            else:
+                lines.append("- No visual evidence returned for this step.")
+            step_markdowns.append("\n".join(lines))
+
+            gallery_entries: List[Tuple[Image.Image, str]] = []
+            for idx, evidence in enumerate(evidences):
+                color = EVIDENCE_COLORS[idx % len(EVIDENCE_COLORS)]
+                annotated = _annotate_evidence_image(image, evidence, color)
+                gallery_entries.append((annotated, _format_evidence_caption(evidence)))
+            step_galleries.append(gallery_entries)
+        else:
+            step_markdowns.append("_No step data returned._")
+            step_galleries.append([])
+
+    return answer_text, step_markdowns, step_galleries
+
+
 if spaces is not None:
 
     @spaces.GPU  # type: ignore[attr-defined]
@@ -137,23 +248,38 @@ def _run_pipeline(
     max_steps: int,
     max_regions: int,
     model_id: Optional[str],
-) -> tuple[PipelineState, str]:
+) -> tuple[PipelineState, Tuple[str, List[str], List[List[Tuple[Image.Image, str]]]]]:
+    target_model = (model_id or DEFAULT_MODEL_ID).strip() or DEFAULT_MODEL_ID
+    cached_pipeline = _PIPELINE_CACHE.get(target_model)
+    base_state = state or PipelineState(model_id=target_model, pipeline=cached_pipeline)
+
     if image is None:
-        return state or PipelineState(model_id=model_id or DEFAULT_MODEL_ID, pipeline=None), "Please provide an image before running the demo."
+        logger.info("Request skipped: no image provided.")
+        return base_state, _empty_ui_payload("Please provide an image before running the demo.")
     if not question.strip():
-        return state or PipelineState(model_id=model_id or DEFAULT_MODEL_ID, pipeline=None), "Please enter a question before running the demo."
-    target_model = (model_id or DEFAULT_MODEL_ID).strip() or DEFAULT_MODEL_ID
+        logger.info("Request skipped: question empty.")
+        return base_state, _empty_ui_payload("Please enter a question before running the demo.")
+
     logger.info("Received request for model_id=%s", target_model)
-    result = _execute_pipeline_gpu(
-        image=image.convert("RGB"),
-        question=question.strip(),
-        max_steps=int(max_steps),
-        max_regions=int(max_regions),
-        model_id=target_model,
-    )
+    rgb_image = image.convert("RGB")
+
+    try:
+        result = _execute_pipeline_gpu(
+            image=rgb_image,
+            question=question.strip(),
+            max_steps=int(max_steps),
+            max_regions=int(max_regions),
+            model_id=target_model,
+        )
+    except Exception as exc:  # pragma: no cover - defensive error handling
+        logger.exception("Pipeline execution failed: %s", exc)
+        return PipelineState(model_id=target_model, pipeline=_PIPELINE_CACHE.get(target_model)), _empty_ui_payload(
+            f"Pipeline error: {exc}"
+        )
+
     new_state = PipelineState(model_id=target_model, pipeline=_PIPELINE_CACHE.get(target_model))
-    markdown = format_result_markdown(result)
-    return new_state, markdown
+    payload = _prepare_ui_payload(rgb_image, result, MAX_UI_STEPS)
+    return new_state, payload
 
 
 def build_demo(
  def build_demo(
@@ -198,11 +324,24 @@ def build_demo(
198
  run_button = gr.Button("Run CoRGI")
199
 
200
  with gr.Column(scale=1, min_width=320):
201
- result_markdown = gr.Markdown(value="Upload an image and ask a question to begin.")
 
 
 
 
 
 
 
 
 
 
 
 
 
202
 
203
  def _on_submit(state_data, image, question, model_id, max_steps, max_regions):
204
  pipeline_state = state_data if isinstance(state_data, PipelineState) else None
205
- new_state, markdown = _run_pipeline(
206
  pipeline_state,
207
  image,
208
  question,
@@ -210,12 +349,20 @@ def build_demo(
                 int(max_regions),
                 model_id if model_id else None,
             )
-            return new_state, markdown
+            answer_text, step_texts, gallery_payloads = payload
+            outputs: List[object] = [new_state, answer_text]
+            outputs.extend(step_texts)
+            outputs.extend(gallery_payloads)
+            return outputs
+
+        output_components = [state, answer_markdown]
+        output_components.extend(step_markdown_components)
+        output_components.extend(step_gallery_components)
 
         run_button.click(
            fn=_on_submit,
            inputs=[state, image_input, question_input, model_id_input, max_steps_slider, max_regions_slider],
-           outputs=[state, result_markdown],
+           outputs=output_components,
        )
 
     return demo
 