Commit fe542a6 · dung-vpt-uney committed
Parent(s): b02dcfa

Deploy latest CoRGI Gradio demo

Files changed:
- README.md +7 -0
- corgi/__pycache__/gradio_app.cpython-313.pyc +0 -0
- corgi/__pycache__/pipeline.cpython-313.pyc +0 -0
- corgi/__pycache__/qwen_client.cpython-313.pyc +0 -0
- corgi/__pycache__/types.cpython-313.pyc +0 -0
- corgi/cli.py +1 -1
- corgi/gradio_app.py +173 -57
- corgi/pipeline.py +38 -4
- corgi/qwen_client.py +24 -1
- corgi/types.py +26 -0
README.md CHANGED
@@ -40,3 +40,10 @@ python app.py
 - The Space queues requests sequentially on `cpu-basic` (ZeroGPU) hardware.
 - Set the `CORGI_QWEN_MODEL` environment variable to try another Qwen3-VL checkpoint (for example, `Qwen/Qwen3-VL-4B-Instruct`).
 - `max_steps` and `max_regions` sliders control how many reasoning steps and ROI candidates the model returns.
+
+## UI Overview
+
+- **Chain of Thought**: Displays the structured reasoning steps with vision flags, alongside the exact prompt/response sent to the model.
+- **ROI Extraction**: Shows the source image with every grounded bounding box plus per-evidence crops, and lists the prompts used for each verification step.
+- **Evidence Descriptions**: Summarises each grounded region (bbox, description, confidence) with the associated ROI prompts.
+- **Answer Synthesis**: Highlights the final answer, supporting context, and the synthesis prompt/response pair.
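For quick local testing, the model override described in the README can be exercised with a short driver script. This is only a sketch: it assumes the Space's `app.py` simply builds and launches the demo from `corgi.gradio_app`, and that `build_demo()` can be called without arguments; neither detail is shown in this commit.

```python
# Hypothetical local run; the environment variable name comes from the README,
# everything about app.py's internals and build_demo()'s signature is assumed.
import os

os.environ["CORGI_QWEN_MODEL"] = "Qwen/Qwen3-VL-4B-Instruct"  # smaller checkpoint

from corgi.gradio_app import build_demo  # module updated in this commit

demo = build_demo()  # assumed to work with its default arguments
demo.launch()
```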
corgi/__pycache__/gradio_app.cpython-313.pyc CHANGED
Binary files a/corgi/__pycache__/gradio_app.cpython-313.pyc and b/corgi/__pycache__/gradio_app.cpython-313.pyc differ

corgi/__pycache__/pipeline.cpython-313.pyc CHANGED
Binary files a/corgi/__pycache__/pipeline.cpython-313.pyc and b/corgi/__pycache__/pipeline.cpython-313.pyc differ

corgi/__pycache__/qwen_client.cpython-313.pyc CHANGED
Binary files a/corgi/__pycache__/qwen_client.cpython-313.pyc and b/corgi/__pycache__/qwen_client.cpython-313.pyc differ

corgi/__pycache__/types.cpython-313.pyc CHANGED
Binary files a/corgi/__pycache__/types.cpython-313.pyc and b/corgi/__pycache__/types.cpython-313.pyc differ
corgi/cli.py CHANGED
@@ -12,7 +12,7 @@ from .pipeline import CoRGIPipeline
 from .qwen_client import Qwen3VLClient, QwenGenerationConfig
 from .types import GroundedEvidence, ReasoningStep
 
-DEFAULT_MODEL_ID = "Qwen/Qwen3-VL-
+DEFAULT_MODEL_ID = "Qwen/Qwen3-VL-8B-Thinking"
 
 
 def build_parser() -> argparse.ArgumentParser:
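The new default matters beyond the CLI: `corgi/gradio_app.py` imports `DEFAULT_MODEL_ID` and falls back to it whenever the model textbox is empty (see `_run_pipeline` in the next file). A minimal sketch of that fallback; `resolve_model` is an illustrative name, not a function in the repo, but the expression mirrors the one in the diff below.

```python
# Sketch of the model-id fallback shown later in this commit; resolve_model is
# a hypothetical helper, the expression matches _run_pipeline in gradio_app.py.
from typing import Optional

from corgi.cli import DEFAULT_MODEL_ID


def resolve_model(model_id: Optional[str]) -> str:
    return (model_id or DEFAULT_MODEL_ID).strip() or DEFAULT_MODEL_ID


assert resolve_model(None) == "Qwen/Qwen3-VL-8B-Thinking"
assert resolve_model("   ") == "Qwen/Qwen3-VL-8B-Thinking"
assert resolve_model("Qwen/Qwen3-VL-4B-Instruct") == "Qwen/Qwen3-VL-4B-Instruct"
```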
corgi/gradio_app.py CHANGED
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import logging
+import itertools
 from dataclasses import dataclass
 from typing import Callable, Dict, List, Optional, Tuple
 
@@ -14,7 +15,7 @@ from PIL import Image, ImageDraw
 from .cli import DEFAULT_MODEL_ID
 from .pipeline import CoRGIPipeline, PipelineResult
 from .qwen_client import Qwen3VLClient, QwenGenerationConfig
-from .types import GroundedEvidence
+from .types import GroundedEvidence, PromptLog
 
 
 @dataclass
@@ -129,52 +130,143 @@ def _annotate_evidence_image(
     return annotated
 
 
-def _empty_ui_payload(message: str
+def _empty_ui_payload(message: str) -> Dict[str, object]:
+    placeholder_prompt = f"```text\n{message}\n```"
+    return {
+        "answer_markdown": f"### Final Answer\n{message}",
+        "chain_markdown": message,
+        "chain_prompt": placeholder_prompt,
+        "roi_overview": None,
+        "roi_gallery": [],
+        "roi_prompt": placeholder_prompt,
+        "evidence_markdown": message,
+        "evidence_prompt": placeholder_prompt,
+        "answer_process_markdown": message,
+        "answer_prompt": placeholder_prompt,
+    }
+
+
+def _annotate_overview_image(image: Image.Image, evidences: List[GroundedEvidence]) -> Optional[Image.Image]:
+    if not evidences:
+        return None
+    base = image.copy().convert("RGBA")
+    overlay = Image.new("RGBA", base.size, (0, 0, 0, 0))
+    draw = ImageDraw.Draw(overlay)
+    width, height = base.size
+
+    step_colors: Dict[int, Tuple[int, int, int]] = {}
+    color_cycle = itertools.cycle(EVIDENCE_COLORS)
+    for ev in evidences:
+        color = step_colors.setdefault(ev.step_index, next(color_cycle))
+        x1 = max(0, min(int(ev.bbox[0] * width), width - 1))
+        y1 = max(0, min(int(ev.bbox[1] * height), height - 1))
+        x2 = max(0, min(int(ev.bbox[2] * width), width - 1))
+        y2 = max(0, min(int(ev.bbox[3] * height), height - 1))
+        x1, x2 = sorted((x1, x2))
+        y1, y2 = sorted((y1, y2))
+        outline_width = max(2, int(min(width, height) * 0.005))
+        rgba_color = color + (255,)
+        fill_color = color + (60,)
+        draw.rectangle([x1, y1, x2, y2], outline=rgba_color, width=outline_width)
+        label = f"S{ev.step_index}"
+        draw.text((x1 + 4, y1 + 4), label, fill=rgba_color)
+
+    annotated = Image.alpha_composite(base, overlay).convert("RGB")
+    if max(annotated.size) > GALLERY_MAX_DIM:
+        annotated.thumbnail((GALLERY_MAX_DIM, GALLERY_MAX_DIM), _THUMBNAIL_RESAMPLE)
+    return annotated
+
+
+def _format_prompt_markdown(log: Optional[PromptLog], title: str) -> str:
+    if log is None:
+        return f"**{title} Prompt**\n_Prompt unavailable._"
+    lines = [f"**{title} Prompt**", "```text", log.prompt, "```"]
+    if log.response:
+        lines.extend(["**Model Response**", "```text", log.response, "```"])
+    return "\n".join(lines)
+
+
+def _format_grounding_prompts(logs: List[PromptLog]) -> str:
+    if not logs:
+        return "_No ROI prompts available._"
+    blocks: List[str] = []
+    for log in logs:
+        heading = f"#### Step {log.step_index}" if log.step_index is not None else "#### ROI Prompt"
+        sections = [heading, "**Prompt**", "```text", log.prompt, "```"]
+        if log.response:
+            sections.extend(["**Model Response**", "```text", log.response, "```"])
+        blocks.append("\n".join(sections))
+    return "\n\n".join(blocks)
 
 
 def _prepare_ui_payload(
     image: Image.Image,
     result: PipelineResult,
     max_slots: int = MAX_UI_STEPS,
-) ->
+) -> Dict[str, object]:
     answer_text = f"### Final Answer\n{result.answer or '(no answer returned)'}"
-    evidences_by_step = _group_evidence_by_step(result.evidence)
-
-    step_markdowns: List[str] = []
-    step_galleries: List[List[Tuple[Image.Image, str]]] = []
 
-        else:
-            lines.append("- No visual evidence returned for this step.")
-        step_markdowns.append("\n".join(lines))
-
-        gallery_entries: List[Tuple[Image.Image, str]] = []
-        for idx, evidence in enumerate(evidences):
-            color = EVIDENCE_COLORS[idx % len(EVIDENCE_COLORS)]
-            annotated = _annotate_evidence_image(image, evidence, color)
-            gallery_entries.append((annotated, _format_evidence_caption(evidence)))
-        step_galleries.append(gallery_entries)
+    step_lines: List[str] = []
+    evidences_by_step = _group_evidence_by_step(result.evidence)
+    for step in result.steps[:max_slots]:
+        lines = [
+            f"**Step {step.index}:** {step.statement}",
+            f"- Needs vision: {'yes' if step.needs_vision else 'no'}",
+        ]
+        if step.reason:
+            lines.append(f"- Reason: {step.reason}")
+        evs = evidences_by_step.get(step.index, [])
+        if evs:
+            lines.append(f"- Visual evidence items: {len(evs)}")
         else:
+            lines.append("- No visual evidence returned for this step.")
+        step_lines.append("\n".join(lines))
+    if len(result.steps) > max_slots:
+        step_lines.append(f"_Only the first {max_slots} steps are shown._")
+    chain_markdown = "\n\n".join(step_lines) if step_lines else "_No reasoning steps returned._"
+
+    roi_overview = _annotate_overview_image(image, result.evidence)
+    aggregated_gallery: List[Tuple[Image.Image, str]] = []
+    for idx, evidence in enumerate(result.evidence):
+        color = EVIDENCE_COLORS[idx % len(EVIDENCE_COLORS)]
+        annotated = _annotate_evidence_image(image, evidence, color)
+        aggregated_gallery.append((annotated, _format_evidence_caption(evidence)))
+
+    evidence_blocks: List[str] = []
+    for idx, evidence in enumerate(result.evidence, start=1):
+        bbox = ", ".join(f"{coord:.2f}" for coord in evidence.bbox)
+        desc = evidence.description or "(no description)"
+        conf = f"Confidence: {evidence.confidence:.2f}" if evidence.confidence is not None else "Confidence: n/a"
+        evidence_blocks.append(
+            f"**Evidence {idx} — Step {evidence.step_index}**\n- {desc}\n- {conf}\n- BBox: ({bbox})"
+        )
+    evidence_markdown = "\n\n".join(evidence_blocks) if evidence_blocks else "_No visual evidence collected._"
+
+    reasoning_prompt_md = _format_prompt_markdown(result.reasoning_log, "Reasoning")
+    roi_prompt_md = _format_grounding_prompts(result.grounding_logs)
+    evidence_prompt_md = roi_prompt_md if result.grounding_logs else "_No ROI prompts available._"
+    answer_prompt_md = _format_prompt_markdown(result.answer_log, "Answer Synthesis")
+
+    answer_process_lines = [
+        f"**Question:** {result.question}",
+        f"**Final Answer:** {result.answer or '(no answer returned)'}",
+        f"**Steps considered:** {len(result.steps)}",
+        f"**Visual evidence items:** {len(result.evidence)}",
+    ]
+    answer_process_markdown = "\n".join(answer_process_lines)
+
+    return {
+        "answer_markdown": answer_text,
+        "chain_markdown": chain_markdown,
+        "chain_prompt": reasoning_prompt_md,
+        "roi_overview": roi_overview,
+        "roi_gallery": aggregated_gallery,
+        "roi_prompt": roi_prompt_md,
+        "evidence_markdown": evidence_markdown,
+        "evidence_prompt": evidence_prompt_md,
+        "answer_process_markdown": answer_process_markdown,
+        "answer_prompt": answer_prompt_md,
+    }
 
 
 if spaces is not None:
@@ -248,7 +340,7 @@ def _run_pipeline(
     max_steps: int,
     max_regions: int,
     model_id: Optional[str],
-) -> tuple[PipelineState,
+) -> tuple[PipelineState, Dict[str, object]]:
     target_model = (model_id or DEFAULT_MODEL_ID).strip() or DEFAULT_MODEL_ID
     cached_pipeline = _PIPELINE_CACHE.get(target_model)
     base_state = state or PipelineState(model_id=target_model, pipeline=cached_pipeline)
@@ -325,19 +417,25 @@ def build_demo(
 
         with gr.Column(scale=1, min_width=320):
             answer_markdown = gr.Markdown(value="### Final Answer\nUpload an image and ask a question to begin.")
+            with gr.Tabs():
+                with gr.Tab("Chain of Thought"):
+                    chain_markdown = gr.Markdown("_No reasoning steps yet._")
+                    chain_prompt = gr.Markdown("```text\nAwaiting prompt...\n```")
+                with gr.Tab("ROI Extraction"):
+                    roi_overview_image = gr.Image(label="Annotated image", value=None)
+                    roi_gallery = gr.Gallery(
+                        label="Evidence gallery",
                         columns=2,
                         height=280,
                         allow_preview=True,
                     )
+                    roi_prompt_markdown = gr.Markdown("```text\nAwaiting ROI prompts...\n```")
+                with gr.Tab("Evidence Descriptions"):
+                    evidence_markdown = gr.Markdown("_No visual evidence collected._")
+                    evidence_prompt_markdown = gr.Markdown("```text\nAwaiting ROI prompts...\n```")
+                with gr.Tab("Answer Synthesis"):
+                    answer_process_markdown = gr.Markdown("_No answer generated yet._")
+                    answer_prompt_markdown = gr.Markdown("```text\nAwaiting answer prompt...\n```")
 
     def _on_submit(state_data, image, question, model_id, max_steps, max_regions):
         pipeline_state = state_data if isinstance(state_data, PipelineState) else None
@@ -349,15 +447,33 @@
             int(max_regions),
             model_id if model_id else None,
         )
+        return [
+            new_state,
+            payload["answer_markdown"],
+            payload["chain_markdown"],
+            payload["chain_prompt"],
+            payload["roi_overview"],
+            payload["roi_gallery"],
+            payload["roi_prompt"],
+            payload["evidence_markdown"],
+            payload["evidence_prompt"],
+            payload["answer_process_markdown"],
+            payload["answer_prompt"],
+        ]
+
+    output_components = [
+        state,
+        answer_markdown,
+        chain_markdown,
+        chain_prompt,
+        roi_overview_image,
+        roi_gallery,
+        roi_prompt_markdown,
+        evidence_markdown,
+        evidence_prompt_markdown,
+        answer_process_markdown,
+        answer_prompt_markdown,
+    ]
 
     run_button.click(
         fn=_on_submit,
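The prompt panels in the new tabs are plain Markdown strings produced by the helpers above. A small sketch of how they render, calling the private helper directly (outside Gradio) just to preview the output; the example PromptLog values are made up.

```python
# Preview the Markdown the "Chain of Thought" tab shows for a prompt log.
# _format_prompt_markdown is a private helper from this commit; the log values
# below are illustrative only.
from corgi.gradio_app import _format_prompt_markdown
from corgi.types import PromptLog

log = PromptLog(
    prompt="List the reasoning steps for: what colour is the car?",
    response="1. Locate the car. 2. Inspect its colour.",
    stage="reasoning",
)
print(_format_prompt_markdown(log, "Reasoning"))   # fenced prompt + response blocks
print(_format_prompt_markdown(None, "Reasoning"))  # "_Prompt unavailable._" fallback
```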
corgi/pipeline.py CHANGED
@@ -1,14 +1,16 @@
 from __future__ import annotations
 
-from dataclasses import dataclass
-from typing import List, Protocol
+from dataclasses import dataclass, field
+from typing import List, Optional, Protocol
 
 from PIL import Image
 
 from .types import (
     GroundedEvidence,
+    PromptLog,
     ReasoningStep,
     evidences_to_serializable,
+    prompt_logs_to_serializable,
     steps_to_serializable,
 )
 
@@ -37,6 +39,13 @@ class SupportsQwenClient(Protocol):
     ) -> str:
         ...
 
+    def reset_logs(self) -> None:
+        ...
+
+    reasoning_log: Optional[PromptLog]
+    grounding_logs: List[PromptLog]
+    answer_log: Optional[PromptLog]
+
 
 @dataclass(frozen=True)
 class PipelineResult:
@@ -46,14 +55,30 @@
     steps: List[ReasoningStep]
     evidence: List[GroundedEvidence]
     answer: str
+    reasoning_log: Optional[PromptLog] = None
+    grounding_logs: List[PromptLog] = field(default_factory=list)
+    answer_log: Optional[PromptLog] = None
 
     def to_json(self) -> dict:
+        payload = {
             "question": self.question,
             "steps": steps_to_serializable(self.steps),
             "evidence": evidences_to_serializable(self.evidence),
             "answer": self.answer,
         }
+        reasoning_entries = (
+            prompt_logs_to_serializable([self.reasoning_log]) if self.reasoning_log else []
+        )
+        if reasoning_entries:
+            payload["reasoning_log"] = reasoning_entries[0]
+
+        payload["grounding_logs"] = prompt_logs_to_serializable(self.grounding_logs)
+
+        answer_entries = prompt_logs_to_serializable([self.answer_log]) if self.answer_log else []
+        if answer_entries:
+            payload["answer_log"] = answer_entries[0]
+
+        return payload
 
 
 class CoRGIPipeline:
@@ -71,6 +96,7 @@ class CoRGIPipeline:
         max_steps: int = 4,
         max_regions: int = 4,
     ) -> PipelineResult:
+        self._vlm.reset_logs()
         steps = self._vlm.structured_reasoning(image=image, question=question, max_steps=max_steps)
         evidences: List[GroundedEvidence] = []
         for step in steps:
@@ -86,7 +112,15 @@
                 continue
             evidences.extend(step_evs[:max_regions])
         answer = self._vlm.synthesize_answer(image=image, question=question, steps=steps, evidences=evidences)
-        return PipelineResult(
+        return PipelineResult(
+            question=question,
+            steps=steps,
+            evidence=evidences,
+            answer=answer,
+            reasoning_log=self._vlm.reasoning_log,
+            grounding_logs=list(self._vlm.grounding_logs),
+            answer_log=self._vlm.answer_log,
+        )
 
 
 __all__ = ["CoRGIPipeline", "PipelineResult"]
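Since `PipelineResult` now carries the prompt logs, `to_json()` exposes them alongside the answer. A minimal sketch with empty step/evidence lists and a made-up reasoning log:

```python
# Sketch: the optional log fields default to None/[] and only appear in the
# JSON payload when they were actually recorded.
from corgi.pipeline import PipelineResult
from corgi.types import PromptLog

result = PipelineResult(
    question="What colour is the car?",
    steps=[],      # ReasoningStep objects omitted for brevity
    evidence=[],   # GroundedEvidence objects omitted for brevity
    answer="Red.",
    reasoning_log=PromptLog(prompt="Think step by step...", response="...", stage="reasoning"),
)

payload = result.to_json()
assert payload["reasoning_log"]["stage"] == "reasoning"
assert payload["grounding_logs"] == []
assert "answer_log" not in payload  # no synthesis log was attached
```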
corgi/qwen_client.py CHANGED
@@ -8,7 +8,7 @@ from PIL import Image
 from transformers import AutoModelForImageTextToText, AutoProcessor
 
 from .parsers import parse_roi_evidence, parse_structured_reasoning
-from .types import GroundedEvidence, ReasoningStep
+from .types import GroundedEvidence, PromptLog, ReasoningStep
 
 
 DEFAULT_REASONING_PROMPT = (
@@ -117,6 +117,24 @@
     ) -> None:
         self.config = config or QwenGenerationConfig()
         self._model, self._processor = _load_backend(self.config.model_id)
+        self.reset_logs()
+
+    def reset_logs(self) -> None:
+        self._reasoning_log: Optional[PromptLog] = None
+        self._grounding_logs: List[PromptLog] = []
+        self._answer_log: Optional[PromptLog] = None
+
+    @property
+    def reasoning_log(self) -> Optional[PromptLog]:
+        return self._reasoning_log
+
+    @property
+    def grounding_logs(self) -> List[PromptLog]:
+        return list(self._grounding_logs)
+
+    @property
+    def answer_log(self) -> Optional[PromptLog]:
+        return self._answer_log
 
     def _chat(
         self,
@@ -162,6 +180,7 @@
     def structured_reasoning(self, image: Image.Image, question: str, max_steps: int) -> List[ReasoningStep]:
         prompt = DEFAULT_REASONING_PROMPT.format(max_steps=max_steps) + f"\nQuestion: {question}"
         response = self._chat(image=image, prompt=prompt)
+        self._reasoning_log = PromptLog(prompt=prompt, response=response, stage="reasoning")
         return parse_structured_reasoning(response, max_steps=max_steps)
 
     def extract_step_evidence(
@@ -177,6 +196,9 @@
         )
         response = self._chat(image=image, prompt=prompt, max_new_tokens=256)
         evidences = parse_roi_evidence(response, default_step_index=step.index)
+        self._grounding_logs.append(
+            PromptLog(prompt=prompt, response=response, step_index=step.index, stage="grounding")
+        )
         return evidences[:max_regions]
 
     def synthesize_answer(
@@ -192,6 +214,7 @@
             evidence=_format_evidence_for_prompt(evidences),
         )
         response = self._chat(image=image, prompt=prompt, max_new_tokens=256)
+        self._answer_log = PromptLog(prompt=prompt, response=response, stage="synthesis")
         return _strip_think_content(response)
 
 
corgi/types.py CHANGED
@@ -28,6 +28,16 @@ class GroundedEvidence:
     raw_source: Optional[Dict[str, object]] = None
 
 
+@dataclass(frozen=True)
+class PromptLog:
+    """Capture the prompt/response pair used at a given pipeline stage."""
+
+    prompt: str
+    response: Optional[str] = None
+    step_index: Optional[int] = None
+    stage: Optional[str] = None
+
+
 def steps_to_serializable(steps: List[ReasoningStep]) -> List[Dict[str, object]]:
     """Helper to convert steps into JSON-friendly dictionaries."""
 
@@ -59,3 +69,19 @@ def evidences_to_serializable(evidences: List[GroundedEvidence]) -> List[Dict[str, object]]:
         item["raw_source"] = ev.raw_source
         serializable.append(item)
     return serializable
+
+
+def prompt_logs_to_serializable(logs: List[PromptLog]) -> List[Dict[str, object]]:
+    """Convert prompt logs into JSON-friendly structures."""
+
+    serializable: List[Dict[str, object]] = []
+    for log in logs:
+        item: Dict[str, object] = {"prompt": log.prompt}
+        if log.response is not None:
+            item["response"] = log.response
+        if log.step_index is not None:
+            item["step_index"] = log.step_index
+        if log.stage is not None:
+            item["stage"] = log.stage
+        serializable.append(item)
+    return serializable