dung-vpt-uney committed
Commit fe542a6 · Parent: b02dcfa

Deploy latest CoRGI Gradio demo
README.md CHANGED
@@ -40,3 +40,10 @@ python app.py
 - The Space queues requests sequentially on `cpu-basic` (ZeroGPU) hardware.
 - Set the `CORGI_QWEN_MODEL` environment variable to try another Qwen3-VL checkpoint (for example, `Qwen/Qwen3-VL-4B-Instruct`).
 - `max_steps` and `max_regions` sliders control how many reasoning steps and ROI candidates the model returns.
+
+## UI Overview
+
+- **Chain of Thought**: Displays the structured reasoning steps with vision flags, alongside the exact prompt/response sent to the model.
+- **ROI Extraction**: Shows the source image with every grounded bounding box plus per-evidence crops, and lists the prompts used for each verification step.
+- **Evidence Descriptions**: Summarises each grounded region (bbox, description, confidence) with the associated ROI prompts.
+- **Answer Synthesis**: Highlights the final answer, supporting context, and the synthesis prompt/response pair.
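
To try the checkpoint override described in the README hunk above outside the Space, a minimal sketch is shown below (assumptions not confirmed by this commit: `corgi.gradio_app.build_demo()` can be called without arguments, and `CORGI_QWEN_MODEL` is read before the first model load; on the Space itself you would simply set the variable and run `python app.py`):

```python
# Minimal local-launch sketch. Assumptions (not shown in this commit): build_demo()
# takes no required arguments, and CORGI_QWEN_MODEL is consulted when the pipeline
# is first constructed.
import os

# Swap in a smaller Qwen3-VL checkpoint, as suggested in the README section above.
os.environ["CORGI_QWEN_MODEL"] = "Qwen/Qwen3-VL-4B-Instruct"

from corgi.gradio_app import build_demo

demo = build_demo()
demo.launch()
```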
corgi/__pycache__/gradio_app.cpython-313.pyc CHANGED
Binary files a/corgi/__pycache__/gradio_app.cpython-313.pyc and b/corgi/__pycache__/gradio_app.cpython-313.pyc differ
 
corgi/__pycache__/pipeline.cpython-313.pyc CHANGED
Binary files a/corgi/__pycache__/pipeline.cpython-313.pyc and b/corgi/__pycache__/pipeline.cpython-313.pyc differ
 
corgi/__pycache__/qwen_client.cpython-313.pyc CHANGED
Binary files a/corgi/__pycache__/qwen_client.cpython-313.pyc and b/corgi/__pycache__/qwen_client.cpython-313.pyc differ
 
corgi/__pycache__/types.cpython-313.pyc CHANGED
Binary files a/corgi/__pycache__/types.cpython-313.pyc and b/corgi/__pycache__/types.cpython-313.pyc differ
 
corgi/cli.py CHANGED
@@ -12,7 +12,7 @@ from .pipeline import CoRGIPipeline
 from .qwen_client import Qwen3VLClient, QwenGenerationConfig
 from .types import GroundedEvidence, ReasoningStep

-DEFAULT_MODEL_ID = "Qwen/Qwen3-VL-4B-Thinking"
+DEFAULT_MODEL_ID = "Qwen/Qwen3-VL-8B-Thinking"


 def build_parser() -> argparse.ArgumentParser:
corgi/gradio_app.py CHANGED
@@ -1,6 +1,7 @@
 from __future__ import annotations

 import logging
+import itertools
 from dataclasses import dataclass
 from typing import Callable, Dict, List, Optional, Tuple

@@ -14,7 +15,7 @@ from PIL import Image, ImageDraw
 from .cli import DEFAULT_MODEL_ID
 from .pipeline import CoRGIPipeline, PipelineResult
 from .qwen_client import Qwen3VLClient, QwenGenerationConfig
-from .types import GroundedEvidence
+from .types import GroundedEvidence, PromptLog


 @dataclass
@@ -129,52 +130,143 @@ def _annotate_evidence_image(
     return annotated


-def _empty_ui_payload(message: str, max_slots: int = MAX_UI_STEPS) -> Tuple[str, List[str], List[List[Tuple[Image.Image, str]]]]:
-    return (
-        f"### Final Answer\n{message}",
-        ["_No step data available._" for _ in range(max_slots)],
-        [[] for _ in range(max_slots)],
-    )
+def _empty_ui_payload(message: str) -> Dict[str, object]:
+    placeholder_prompt = f"```text\n{message}\n```"
+    return {
+        "answer_markdown": f"### Final Answer\n{message}",
+        "chain_markdown": message,
+        "chain_prompt": placeholder_prompt,
+        "roi_overview": None,
+        "roi_gallery": [],
+        "roi_prompt": placeholder_prompt,
+        "evidence_markdown": message,
+        "evidence_prompt": placeholder_prompt,
+        "answer_process_markdown": message,
+        "answer_prompt": placeholder_prompt,
+    }
+
+
+def _annotate_overview_image(image: Image.Image, evidences: List[GroundedEvidence]) -> Optional[Image.Image]:
+    if not evidences:
+        return None
+    base = image.copy().convert("RGBA")
+    overlay = Image.new("RGBA", base.size, (0, 0, 0, 0))
+    draw = ImageDraw.Draw(overlay)
+    width, height = base.size
+
+    step_colors: Dict[int, Tuple[int, int, int]] = {}
+    color_cycle = itertools.cycle(EVIDENCE_COLORS)
+    for ev in evidences:
+        color = step_colors.setdefault(ev.step_index, next(color_cycle))
+        x1 = max(0, min(int(ev.bbox[0] * width), width - 1))
+        y1 = max(0, min(int(ev.bbox[1] * height), height - 1))
+        x2 = max(0, min(int(ev.bbox[2] * width), width - 1))
+        y2 = max(0, min(int(ev.bbox[3] * height), height - 1))
+        x1, x2 = sorted((x1, x2))
+        y1, y2 = sorted((y1, y2))
+        outline_width = max(2, int(min(width, height) * 0.005))
+        rgba_color = color + (255,)
+        fill_color = color + (60,)
+        draw.rectangle([x1, y1, x2, y2], outline=rgba_color, width=outline_width)
+        label = f"S{ev.step_index}"
+        draw.text((x1 + 4, y1 + 4), label, fill=rgba_color)
+
+    annotated = Image.alpha_composite(base, overlay).convert("RGB")
+    if max(annotated.size) > GALLERY_MAX_DIM:
+        annotated.thumbnail((GALLERY_MAX_DIM, GALLERY_MAX_DIM), _THUMBNAIL_RESAMPLE)
+    return annotated
+
+
+def _format_prompt_markdown(log: Optional[PromptLog], title: str) -> str:
+    if log is None:
+        return f"**{title} Prompt**\n_Prompt unavailable._"
+    lines = [f"**{title} Prompt**", "```text", log.prompt, "```"]
+    if log.response:
+        lines.extend(["**Model Response**", "```text", log.response, "```"])
+    return "\n".join(lines)
+
+
+def _format_grounding_prompts(logs: List[PromptLog]) -> str:
+    if not logs:
+        return "_No ROI prompts available._"
+    blocks: List[str] = []
+    for log in logs:
+        heading = f"#### Step {log.step_index}" if log.step_index is not None else "#### ROI Prompt"
+        sections = [heading, "**Prompt**", "```text", log.prompt, "```"]
+        if log.response:
+            sections.extend(["**Model Response**", "```text", log.response, "```"])
+        blocks.append("\n".join(sections))
+    return "\n\n".join(blocks)


 def _prepare_ui_payload(
     image: Image.Image,
     result: PipelineResult,
     max_slots: int = MAX_UI_STEPS,
-) -> Tuple[str, List[str], List[List[Tuple[Image.Image, str]]]]:
+) -> Dict[str, object]:
     answer_text = f"### Final Answer\n{result.answer or '(no answer returned)'}"
-    evidences_by_step = _group_evidence_by_step(result.evidence)
-
-    step_markdowns: List[str] = []
-    step_galleries: List[List[Tuple[Image.Image, str]]] = []

-    for slot_index in range(max_slots):
-        if slot_index < len(result.steps):
-            step = result.steps[slot_index]
-            evidences = evidences_by_step.get(step.index, [])
-            lines = [
-                f"**Step {step.index}:** {step.statement}",
-                f"- Needs vision: {'yes' if step.needs_vision else 'no'}",
-            ]
-            if step.reason:
-                lines.append(f"- Reason: {step.reason}")
-            if evidences:
-                lines.append(f"- Evidence items: {len(evidences)} (see gallery below)")
-            else:
-                lines.append("- No visual evidence returned for this step.")
-            step_markdowns.append("\n".join(lines))
-
-            gallery_entries: List[Tuple[Image.Image, str]] = []
-            for idx, evidence in enumerate(evidences):
-                color = EVIDENCE_COLORS[idx % len(EVIDENCE_COLORS)]
-                annotated = _annotate_evidence_image(image, evidence, color)
-                gallery_entries.append((annotated, _format_evidence_caption(evidence)))
-            step_galleries.append(gallery_entries)
+    step_lines: List[str] = []
+    evidences_by_step = _group_evidence_by_step(result.evidence)
+    for step in result.steps[:max_slots]:
+        lines = [
+            f"**Step {step.index}:** {step.statement}",
+            f"- Needs vision: {'yes' if step.needs_vision else 'no'}",
+        ]
+        if step.reason:
+            lines.append(f"- Reason: {step.reason}")
+        evs = evidences_by_step.get(step.index, [])
+        if evs:
+            lines.append(f"- Visual evidence items: {len(evs)}")
         else:
-            step_markdowns.append("_No step data returned._")
-            step_galleries.append([])
-
-    return answer_text, step_markdowns, step_galleries
+            lines.append("- No visual evidence returned for this step.")
+        step_lines.append("\n".join(lines))
+    if len(result.steps) > max_slots:
+        step_lines.append(f"_Only the first {max_slots} steps are shown._")
+    chain_markdown = "\n\n".join(step_lines) if step_lines else "_No reasoning steps returned._"
+
+    roi_overview = _annotate_overview_image(image, result.evidence)
+    aggregated_gallery: List[Tuple[Image.Image, str]] = []
+    for idx, evidence in enumerate(result.evidence):
+        color = EVIDENCE_COLORS[idx % len(EVIDENCE_COLORS)]
+        annotated = _annotate_evidence_image(image, evidence, color)
+        aggregated_gallery.append((annotated, _format_evidence_caption(evidence)))
+
+    evidence_blocks: List[str] = []
+    for idx, evidence in enumerate(result.evidence, start=1):
+        bbox = ", ".join(f"{coord:.2f}" for coord in evidence.bbox)
+        desc = evidence.description or "(no description)"
+        conf = f"Confidence: {evidence.confidence:.2f}" if evidence.confidence is not None else "Confidence: n/a"
+        evidence_blocks.append(
+            f"**Evidence {idx} — Step {evidence.step_index}**\n- {desc}\n- {conf}\n- BBox: ({bbox})"
+        )
+    evidence_markdown = "\n\n".join(evidence_blocks) if evidence_blocks else "_No visual evidence collected._"
+
+    reasoning_prompt_md = _format_prompt_markdown(result.reasoning_log, "Reasoning")
+    roi_prompt_md = _format_grounding_prompts(result.grounding_logs)
+    evidence_prompt_md = roi_prompt_md if result.grounding_logs else "_No ROI prompts available._"
+    answer_prompt_md = _format_prompt_markdown(result.answer_log, "Answer Synthesis")
+
+    answer_process_lines = [
+        f"**Question:** {result.question}",
+        f"**Final Answer:** {result.answer or '(no answer returned)'}",
+        f"**Steps considered:** {len(result.steps)}",
+        f"**Visual evidence items:** {len(result.evidence)}",
+    ]
+    answer_process_markdown = "\n".join(answer_process_lines)
+
+    return {
+        "answer_markdown": answer_text,
+        "chain_markdown": chain_markdown,
+        "chain_prompt": reasoning_prompt_md,
+        "roi_overview": roi_overview,
+        "roi_gallery": aggregated_gallery,
+        "roi_prompt": roi_prompt_md,
+        "evidence_markdown": evidence_markdown,
+        "evidence_prompt": evidence_prompt_md,
+        "answer_process_markdown": answer_process_markdown,
+        "answer_prompt": answer_prompt_md,
+    }


 if spaces is not None:
@@ -248,7 +340,7 @@ def _run_pipeline(
     max_steps: int,
     max_regions: int,
     model_id: Optional[str],
-) -> tuple[PipelineState, Tuple[str, List[str], List[List[Tuple[Image.Image, str]]]]]:
+) -> tuple[PipelineState, Dict[str, object]]:
     target_model = (model_id or DEFAULT_MODEL_ID).strip() or DEFAULT_MODEL_ID
     cached_pipeline = _PIPELINE_CACHE.get(target_model)
     base_state = state or PipelineState(model_id=target_model, pipeline=cached_pipeline)
@@ -325,19 +417,25 @@ def build_demo(

             with gr.Column(scale=1, min_width=320):
                 answer_markdown = gr.Markdown(value="### Final Answer\nUpload an image and ask a question to begin.")
-                step_markdown_components: List["gr.Markdown"] = []
-                step_gallery_components: List["gr.Gallery"] = []
-                for slot in range(MAX_UI_STEPS):
-                    with gr.Accordion(f"Step {slot + 1}", open=False):
-                        step_md = gr.Markdown("_No step data available._")
-                        gallery = gr.Gallery(
-                            label="Visual evidence",
+                with gr.Tabs():
+                    with gr.Tab("Chain of Thought"):
+                        chain_markdown = gr.Markdown("_No reasoning steps yet._")
+                        chain_prompt = gr.Markdown("```text\nAwaiting prompt...\n```")
+                    with gr.Tab("ROI Extraction"):
+                        roi_overview_image = gr.Image(label="Annotated image", value=None)
+                        roi_gallery = gr.Gallery(
+                            label="Evidence gallery",
                             columns=2,
                             height=280,
                             allow_preview=True,
                         )
-                        step_markdown_components.append(step_md)
-                        step_gallery_components.append(gallery)
+                        roi_prompt_markdown = gr.Markdown("```text\nAwaiting ROI prompts...\n```")
+                    with gr.Tab("Evidence Descriptions"):
+                        evidence_markdown = gr.Markdown("_No visual evidence collected._")
+                        evidence_prompt_markdown = gr.Markdown("```text\nAwaiting ROI prompts...\n```")
+                    with gr.Tab("Answer Synthesis"):
+                        answer_process_markdown = gr.Markdown("_No answer generated yet._")
+                        answer_prompt_markdown = gr.Markdown("```text\nAwaiting answer prompt...\n```")

         def _on_submit(state_data, image, question, model_id, max_steps, max_regions):
             pipeline_state = state_data if isinstance(state_data, PipelineState) else None
@@ -349,15 +447,33 @@ def build_demo(
                 int(max_regions),
                 model_id if model_id else None,
             )
-            answer_text, step_texts, gallery_payloads = payload
-            outputs: List[object] = [new_state, answer_text]
-            outputs.extend(step_texts)
-            outputs.extend(gallery_payloads)
-            return outputs
-
-        output_components = [state, answer_markdown]
-        output_components.extend(step_markdown_components)
-        output_components.extend(step_gallery_components)
+            return [
+                new_state,
+                payload["answer_markdown"],
+                payload["chain_markdown"],
+                payload["chain_prompt"],
+                payload["roi_overview"],
+                payload["roi_gallery"],
+                payload["roi_prompt"],
+                payload["evidence_markdown"],
+                payload["evidence_prompt"],
+                payload["answer_process_markdown"],
+                payload["answer_prompt"],
+            ]
+
+        output_components = [
+            state,
+            answer_markdown,
+            chain_markdown,
+            chain_prompt,
+            roi_overview_image,
+            roi_gallery,
+            roi_prompt_markdown,
+            evidence_markdown,
+            evidence_prompt_markdown,
+            answer_process_markdown,
+            answer_prompt_markdown,
+        ]

         run_button.click(
             fn=_on_submit,
corgi/pipeline.py CHANGED
@@ -1,14 +1,16 @@
 from __future__ import annotations

-from dataclasses import dataclass
-from typing import List, Protocol
+from dataclasses import dataclass, field
+from typing import List, Optional, Protocol

 from PIL import Image

 from .types import (
     GroundedEvidence,
+    PromptLog,
     ReasoningStep,
     evidences_to_serializable,
+    prompt_logs_to_serializable,
     steps_to_serializable,
 )

@@ -37,6 +39,13 @@ class SupportsQwenClient(Protocol):
     ) -> str:
         ...

+    def reset_logs(self) -> None:
+        ...
+
+    reasoning_log: Optional[PromptLog]
+    grounding_logs: List[PromptLog]
+    answer_log: Optional[PromptLog]
+

 @dataclass(frozen=True)
 class PipelineResult:
@@ -46,14 +55,30 @@ class PipelineResult:
     steps: List[ReasoningStep]
     evidence: List[GroundedEvidence]
     answer: str
+    reasoning_log: Optional[PromptLog] = None
+    grounding_logs: List[PromptLog] = field(default_factory=list)
+    answer_log: Optional[PromptLog] = None

     def to_json(self) -> dict:
-        return {
+        payload = {
             "question": self.question,
             "steps": steps_to_serializable(self.steps),
             "evidence": evidences_to_serializable(self.evidence),
             "answer": self.answer,
         }
+        reasoning_entries = (
+            prompt_logs_to_serializable([self.reasoning_log]) if self.reasoning_log else []
+        )
+        if reasoning_entries:
+            payload["reasoning_log"] = reasoning_entries[0]
+
+        payload["grounding_logs"] = prompt_logs_to_serializable(self.grounding_logs)
+
+        answer_entries = prompt_logs_to_serializable([self.answer_log]) if self.answer_log else []
+        if answer_entries:
+            payload["answer_log"] = answer_entries[0]
+
+        return payload


 class CoRGIPipeline:
@@ -71,6 +96,7 @@ class CoRGIPipeline:
         max_steps: int = 4,
         max_regions: int = 4,
     ) -> PipelineResult:
+        self._vlm.reset_logs()
         steps = self._vlm.structured_reasoning(image=image, question=question, max_steps=max_steps)
         evidences: List[GroundedEvidence] = []
         for step in steps:
@@ -86,7 +112,15 @@
                 continue
             evidences.extend(step_evs[:max_regions])
         answer = self._vlm.synthesize_answer(image=image, question=question, steps=steps, evidences=evidences)
-        return PipelineResult(question=question, steps=steps, evidence=evidences, answer=answer)
+        return PipelineResult(
+            question=question,
+            steps=steps,
+            evidence=evidences,
+            answer=answer,
+            reasoning_log=self._vlm.reasoning_log,
+            grounding_logs=list(self._vlm.grounding_logs),
+            answer_log=self._vlm.answer_log,
+        )


 __all__ = ["CoRGIPipeline", "PipelineResult"]
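
As a quick reference for the enriched serialization, the sketch below constructs a `PipelineResult` by hand and inspects `to_json()`; the field names come from the diff above, while the question/answer values are illustrative placeholders.

```python
# Sketch: PipelineResult.to_json() with the new prompt-log fields.
# Constructor arguments mirror the dataclass fields in the diff above;
# the concrete values here are placeholders only.
from corgi.pipeline import PipelineResult
from corgi.types import PromptLog

result = PipelineResult(
    question="What colour is the mug?",
    steps=[],
    evidence=[],
    answer="Red.",
    reasoning_log=PromptLog(prompt="List the reasoning steps...", response="1. ...", stage="reasoning"),
)

payload = result.to_json()
# "reasoning_log" appears because reasoning_log is set, "grounding_logs" is always
# present (an empty list here), and "answer_log" is omitted while answer_log is None.
print(sorted(payload))
```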
corgi/qwen_client.py CHANGED
@@ -8,7 +8,7 @@ from PIL import Image
 from transformers import AutoModelForImageTextToText, AutoProcessor

 from .parsers import parse_roi_evidence, parse_structured_reasoning
-from .types import GroundedEvidence, ReasoningStep
+from .types import GroundedEvidence, PromptLog, ReasoningStep


 DEFAULT_REASONING_PROMPT = (
@@ -117,6 +117,24 @@ class Qwen3VLClient:
     ) -> None:
         self.config = config or QwenGenerationConfig()
         self._model, self._processor = _load_backend(self.config.model_id)
+        self.reset_logs()
+
+    def reset_logs(self) -> None:
+        self._reasoning_log: Optional[PromptLog] = None
+        self._grounding_logs: List[PromptLog] = []
+        self._answer_log: Optional[PromptLog] = None
+
+    @property
+    def reasoning_log(self) -> Optional[PromptLog]:
+        return self._reasoning_log
+
+    @property
+    def grounding_logs(self) -> List[PromptLog]:
+        return list(self._grounding_logs)
+
+    @property
+    def answer_log(self) -> Optional[PromptLog]:
+        return self._answer_log

     def _chat(
         self,
@@ -162,6 +180,7 @@ class Qwen3VLClient:
     def structured_reasoning(self, image: Image.Image, question: str, max_steps: int) -> List[ReasoningStep]:
         prompt = DEFAULT_REASONING_PROMPT.format(max_steps=max_steps) + f"\nQuestion: {question}"
         response = self._chat(image=image, prompt=prompt)
+        self._reasoning_log = PromptLog(prompt=prompt, response=response, stage="reasoning")
         return parse_structured_reasoning(response, max_steps=max_steps)

     def extract_step_evidence(
@@ -177,6 +196,9 @@ class Qwen3VLClient:
         )
         response = self._chat(image=image, prompt=prompt, max_new_tokens=256)
         evidences = parse_roi_evidence(response, default_step_index=step.index)
+        self._grounding_logs.append(
+            PromptLog(prompt=prompt, response=response, step_index=step.index, stage="grounding")
+        )
         return evidences[:max_regions]

     def synthesize_answer(
@@ -192,6 +214,7 @@ class Qwen3VLClient:
             evidence=_format_evidence_for_prompt(evidences),
         )
         response = self._chat(image=image, prompt=prompt, max_new_tokens=256)
+        self._answer_log = PromptLog(prompt=prompt, response=response, stage="synthesis")
         return _strip_think_content(response)

corgi/types.py CHANGED
@@ -28,6 +28,16 @@ class GroundedEvidence:
     raw_source: Optional[Dict[str, object]] = None


+@dataclass(frozen=True)
+class PromptLog:
+    """Capture the prompt/response pair used at a given pipeline stage."""
+
+    prompt: str
+    response: Optional[str] = None
+    step_index: Optional[int] = None
+    stage: Optional[str] = None
+
+
 def steps_to_serializable(steps: List[ReasoningStep]) -> List[Dict[str, object]]:
     """Helper to convert steps into JSON-friendly dictionaries."""

@@ -59,3 +69,19 @@ def evidences_to_serializable(evidences: List[GroundedEvidence]) -> List[Dict[st
             item["raw_source"] = ev.raw_source
         serializable.append(item)
     return serializable
+
+
+def prompt_logs_to_serializable(logs: List[PromptLog]) -> List[Dict[str, object]]:
+    """Convert prompt logs into JSON-friendly structures."""
+
+    serializable: List[Dict[str, object]] = []
+    for log in logs:
+        item: Dict[str, object] = {"prompt": log.prompt}
+        if log.response is not None:
+            item["response"] = log.response
+        if log.step_index is not None:
+            item["step_index"] = log.step_index
+        if log.stage is not None:
+            item["stage"] = log.stage
+        serializable.append(item)
+    return serializable
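
For reference, a small usage sketch of the new `PromptLog` dataclass and `prompt_logs_to_serializable` helper defined above; fields left as `None` are simply omitted from the serialized dictionaries (the prompt/response strings below are illustrative).

```python
# Usage sketch for PromptLog serialization (definitions as in corgi/types.py above).
from corgi.types import PromptLog, prompt_logs_to_serializable

logs = [
    PromptLog(prompt="Locate the mug for step 2", response="bbox: 0.1, 0.2, 0.4, 0.5",
              step_index=2, stage="grounding"),
    PromptLog(prompt="Synthesize the final answer"),  # response/step_index/stage stay None
]

print(prompt_logs_to_serializable(logs))
# [{'prompt': 'Locate the mug for step 2', 'response': 'bbox: 0.1, 0.2, 0.4, 0.5',
#   'step_index': 2, 'stage': 'grounding'},
#  {'prompt': 'Synthesize the final answer'}]
```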