davanstrien HF Staff committed on
Commit
dbda7b0
·
1 Parent(s): c49cb47

Add markdown rendering support for VLM output

Browse files
Files changed (4) hide show
  1. css/styles.css +96 -0
  2. index.html +38 -5
  3. js/app.js +149 -0
  4. js/diff-utils.js +38 -0
css/styles.css CHANGED
@@ -194,4 +194,100 @@ button:focus, input:focus, select:focus {
194
  counter-increment: line;
195
  content: counter(line);
196
  @apply inline-block w-12 mr-4 text-right text-gray-400 dark:text-gray-600 select-none;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
197
  }
 
194
  counter-increment: line;
195
  content: counter(line);
196
  @apply inline-block w-12 mr-4 text-right text-gray-400 dark:text-gray-600 select-none;
197
+ }
198
+
199
+ /* Markdown content styling */
200
+ .markdown-content {
201
+ @apply text-gray-900 dark:text-gray-100;
202
+ }
203
+
204
+ .markdown-content h1 {
205
+ @apply text-2xl font-bold mb-4 text-gray-900 dark:text-gray-100;
206
+ }
207
+
208
+ .markdown-content h2 {
209
+ @apply text-xl font-bold mb-3 text-gray-900 dark:text-gray-100;
210
+ }
211
+
212
+ .markdown-content h3 {
213
+ @apply text-lg font-bold mb-2 text-gray-900 dark:text-gray-100;
214
+ }
215
+
216
+ .markdown-content h4 {
217
+ @apply text-base font-bold mb-2 text-gray-900 dark:text-gray-100;
218
+ }
219
+
220
+ .markdown-content p {
221
+ @apply mb-4 leading-relaxed;
222
+ }
223
+
224
+ .markdown-content ul {
225
+ @apply list-disc list-inside mb-4 pl-4;
226
+ }
227
+
228
+ .markdown-content ol {
229
+ @apply list-decimal list-inside mb-4 pl-4;
230
+ }
231
+
232
+ .markdown-content li {
233
+ @apply mb-1;
234
+ }
235
+
236
+ .markdown-content blockquote {
237
+ @apply border-l-4 border-gray-300 dark:border-gray-600 pl-4 my-4 italic text-gray-700 dark:text-gray-300;
238
+ }
239
+
240
+ .markdown-content code {
241
+ @apply bg-gray-100 dark:bg-gray-800 px-1 py-0.5 rounded text-sm font-mono;
242
+ }
243
+
244
+ .markdown-content pre {
245
+ @apply bg-gray-100 dark:bg-gray-800 p-4 rounded-lg overflow-x-auto mb-4;
246
+ }
247
+
248
+ .markdown-content pre code {
249
+ @apply bg-transparent p-0;
250
+ }
251
+
252
+ .markdown-content a {
253
+ @apply text-blue-600 dark:text-blue-400 underline hover:text-blue-800 dark:hover:text-blue-300;
254
+ }
255
+
256
+ .markdown-content table {
257
+ @apply w-full mb-4 border-collapse;
258
+ display: table !important;
259
+ }
260
+
261
+ .markdown-content th, .markdown-content td {
262
+ @apply border border-gray-300 dark:border-gray-600 px-3 py-2;
263
+ }
264
+
265
+ .markdown-content th {
266
+ @apply bg-gray-100 dark:bg-gray-800 font-bold;
267
+ }
268
+
269
+ /* Override prose styles for tables */
270
+ .prose table {
271
+ @apply w-full mb-4 border-collapse;
272
+ display: table !important;
273
+ }
274
+
275
+ .prose thead {
276
+ @apply bg-gray-50 dark:bg-gray-800;
277
+ }
278
+
279
+ .prose tbody {
280
+ @apply bg-white dark:bg-gray-900;
281
+ }
282
+
283
+ .prose th {
284
+ @apply px-3 py-3.5 text-left text-sm font-semibold text-gray-900 dark:text-gray-100 border border-gray-300 dark:border-gray-600;
285
+ }
286
+
287
+ .prose td {
288
+ @apply px-3 py-4 text-sm text-gray-900 dark:text-gray-100 border border-gray-300 dark:border-gray-600;
289
+ }
290
+
291
+ .markdown-content hr {
292
+ @apply border-t border-gray-300 dark:border-gray-600 my-6;
293
  }
index.html CHANGED
@@ -9,6 +9,7 @@
9
  <script src="https://unpkg.com/[email protected]"></script>
10
  <script src="https://unpkg.com/[email protected]/dist/cdn.min.js" defer></script>
11
  <script src="https://cdn.tailwindcss.com"></script>
 
12
 
13
  <!-- Tailwind Config -->
14
  <script>
@@ -98,8 +99,21 @@
98
  <option value="char">Character Diff</option>
99
  <option value="word">Word Diff</option>
100
  <option value="line">Line Diff</option>
 
101
  </select>
102
 
 
 
 
 
 
 
 
 
 
 
 
 
103
  <button
104
  @click="exportComparison()"
105
  class="p-2 text-gray-500 hover:text-gray-700 dark:text-gray-400 dark:hover:text-gray-200"
@@ -241,6 +255,14 @@
241
  <span x-text="wordStats.original || '-'"></span> → <span x-text="wordStats.improved || '-'"></span>
242
  </span>
243
  </div>
 
 
 
 
 
 
 
 
244
  </div>
245
  </div>
246
  </div>
@@ -287,9 +309,17 @@
287
  </div>
288
  </div>
289
  <div>
290
- <h3 class="text-sm font-medium text-gray-700 dark:text-gray-300 mb-3">Improved OCR</h3>
291
- <div class="prose prose-sm dark:prose-invert max-w-none">
292
- <pre class="whitespace-pre-wrap font-mono text-xs bg-gray-50 dark:bg-gray-800 text-gray-900 dark:text-gray-100 p-4 rounded-lg" x-text="getImprovedText()"></pre>
 
 
 
 
 
 
 
 
293
  </div>
294
  </div>
295
  </div>
@@ -300,8 +330,11 @@
300
  </div>
301
 
302
  <!-- Improved Only -->
303
- <div x-show="activeTab === 'improved'" class="prose prose-sm dark:prose-invert max-w-none">
304
- <pre class="whitespace-pre-wrap font-mono text-xs bg-gray-50 dark:bg-gray-800 text-gray-900 dark:text-gray-100 p-4 rounded-lg" x-text="getImprovedText()"></pre>
 
 
 
305
  </div>
306
  </div>
307
  </div>
 
9
  <script src="https://unpkg.com/[email protected]"></script>
10
  <script src="https://unpkg.com/[email protected]/dist/cdn.min.js" defer></script>
11
  <script src="https://cdn.tailwindcss.com"></script>
12
+ <script src="https://cdn.jsdelivr.net/npm/marked/marked.min.js"></script>
13
 
14
  <!-- Tailwind Config -->
15
  <script>
 
99
  <option value="char">Character Diff</option>
100
  <option value="word">Word Diff</option>
101
  <option value="line">Line Diff</option>
102
+ <option value="markdown" x-show="hasMarkdown">Markdown Diff</option>
103
  </select>
104
 
105
+ <button
106
+ x-show="hasMarkdown"
107
+ @click="renderMarkdown = !renderMarkdown"
108
+ :class="renderMarkdown ? 'bg-blue-100 dark:bg-blue-900 text-blue-700 dark:text-blue-300' : 'text-gray-500 hover:text-gray-700 dark:text-gray-400 dark:hover:text-gray-200'"
109
+ class="p-2 rounded-md transition-colors"
110
+ title="Toggle markdown rendering"
111
+ >
112
+ <svg class="w-5 h-5" fill="none" stroke="currentColor" viewBox="0 0 24 24">
113
+ <path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M9 12h6m-6 4h6m2 5H7a2 2 0 01-2-2V5a2 2 0 012-2h5.586a1 1 0 01.707.293l5.414 5.414a1 1 0 01.293.707V19a2 2 0 01-2 2z"></path>
114
+ </svg>
115
+ </button>
116
+
117
  <button
118
  @click="exportComparison()"
119
  class="p-2 text-gray-500 hover:text-gray-700 dark:text-gray-400 dark:hover:text-gray-200"
 
255
  <span x-text="wordStats.original || '-'"></span> → <span x-text="wordStats.improved || '-'"></span>
256
  </span>
257
  </div>
258
+ <div x-show="hasMarkdown" class="mt-2 flex items-center justify-center">
259
+ <span class="inline-flex items-center px-2.5 py-0.5 rounded-full text-xs font-medium bg-purple-100 dark:bg-purple-900 text-purple-800 dark:text-purple-200">
260
+ <svg class="w-3 h-3 mr-1" fill="none" stroke="currentColor" viewBox="0 0 24 24">
261
+ <path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M9 12h6m-6 4h6m2 5H7a2 2 0 01-2-2V5a2 2 0 012-2h5.586a1 1 0 01.707.293l5.414 5.414a1 1 0 01.293.707V19a2 2 0 01-2 2z"></path>
262
+ </svg>
263
+ Markdown Detected
264
+ </span>
265
+ </div>
266
  </div>
267
  </div>
268
  </div>
 
309
  </div>
310
  </div>
311
  <div>
312
+ <h3 class="text-sm font-medium text-gray-700 dark:text-gray-300 mb-3">
313
+ Improved OCR
314
+ <span x-show="renderMarkdown && hasMarkdown" class="ml-2 text-xs bg-blue-100 dark:bg-blue-900 text-blue-700 dark:text-blue-300 px-2 py-1 rounded">
315
+ Markdown
316
+ </span>
317
+ </h3>
318
+ <div class="max-w-none">
319
+ <div x-show="!renderMarkdown">
320
+ <pre class="whitespace-pre-wrap font-mono text-xs bg-gray-50 dark:bg-gray-800 text-gray-900 dark:text-gray-100 p-4 rounded-lg" x-text="getImprovedText()"></pre>
321
+ </div>
322
+ <div x-show="renderMarkdown" x-html="getImprovedTextRendered()" class="markdown-content"></div>
323
  </div>
324
  </div>
325
  </div>
 
330
  </div>
331
 
332
  <!-- Improved Only -->
333
+ <div x-show="activeTab === 'improved'" class="max-w-none">
334
+ <div x-show="!renderMarkdown">
335
+ <pre class="whitespace-pre-wrap font-mono text-xs bg-gray-50 dark:bg-gray-800 text-gray-900 dark:text-gray-100 p-4 rounded-lg" x-text="getImprovedText()"></pre>
336
+ </div>
337
+ <div x-show="renderMarkdown" x-html="getImprovedTextRendered()" class="markdown-content"></div>
338
  </div>
339
  </div>
340
  </div>
js/app.js CHANGED
@@ -24,6 +24,8 @@ document.addEventListener('alpine:init', () => {
24
  showAbout: false,
25
  showFlowView: false,
26
  showDock: false,
 
 
27
 
28
  // Flow view state
29
  flowItems: [],
@@ -48,6 +50,9 @@ document.addEventListener('alpine:init', () => {
48
  // API instance
49
  api: null,
50
 
 
 
 
51
  async init() {
52
  // Initialize API
53
  this.api = new DatasetAPI();
@@ -125,6 +130,9 @@ document.addEventListener('alpine:init', () => {
125
  this.loading = true;
126
  this.error = null;
127
 
 
 
 
128
  try {
129
  // Validate dataset
130
  await this.api.validateDataset(this.datasetId);
@@ -164,6 +172,10 @@ document.addEventListener('alpine:init', () => {
164
  this.currentSample = data.row;
165
  this.currentIndex = index;
166
 
 
 
 
 
167
  // Update diff when sample changes
168
  this.updateDiff();
169
 
@@ -215,6 +227,139 @@ document.addEventListener('alpine:init', () => {
215
  return this.currentSample[columns.improvedText] || 'No improved text found';
216
  },
217
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
218
  getImageData() {
219
  if (!this.currentSample) return null;
220
  const columns = this.api.detectColumns(null, this.currentSample);
@@ -252,6 +397,10 @@ document.addEventListener('alpine:init', () => {
252
  case 'line':
253
  this.diffHtml = createLineDiff(original, improved);
254
  break;
 
 
 
 
255
  }
256
  },
257
 
 
24
  showAbout: false,
25
  showFlowView: false,
26
  showDock: false,
27
+ renderMarkdown: false,
28
+ hasMarkdown: false,
29
 
30
  // Flow view state
31
  flowItems: [],
 
50
  // API instance
51
  api: null,
52
 
53
+ // Markdown cache
54
+ markdownCache: new Map(),
55
+
56
  async init() {
57
  // Initialize API
58
  this.api = new DatasetAPI();
 
130
  this.loading = true;
131
  this.error = null;
132
 
133
+ // Clear markdown cache when loading new dataset
134
+ this.markdownCache.clear();
135
+
136
  try {
137
  // Validate dataset
138
  await this.api.validateDataset(this.datasetId);
 
172
  this.currentSample = data.row;
173
  this.currentIndex = index;
174
 
175
+ // Check if improved text contains markdown
176
+ const improvedText = this.getImprovedText();
177
+ this.hasMarkdown = this.detectMarkdown(improvedText);
178
+
179
  // Update diff when sample changes
180
  this.updateDiff();
181
 
 
227
  return this.currentSample[columns.improvedText] || 'No improved text found';
228
  },
229
 
230
+ detectMarkdown(text) {
231
+ // Check for common markdown patterns
232
+ const markdownPatterns = [
233
+ /^#{1,6}\s/m, // Headers
234
+ /\*\*[^*]+\*\*/, // Bold
235
+ /\*[^*]+\*/, // Italic
236
+ /\[[^\]]+\]\([^)]+\)/, // Links
237
+ /^[-*+]\s/m, // Lists
238
+ /^\d+\.\s/m, // Numbered lists
239
+ /^>/m, // Blockquotes
240
+ /```[\s\S]*?```/, // Code blocks
241
+ /`[^`]+`/, // Inline code
242
+ /\|.*\|.*\|/m, // Tables (basic detection)
243
+ /<table>/i, // HTML tables
244
+ /<thead>/i // HTML table headers
245
+ ];
246
+
247
+ return markdownPatterns.some(pattern => pattern.test(text));
248
+ },
249
+
250
+ escapeHtml(text) {
251
+ const div = document.createElement('div');
252
+ div.textContent = text;
253
+ return div.innerHTML;
254
+ },
255
+
256
+ renderMarkdownText(text) {
257
+ if (!text || !this.renderMarkdown) return text;
258
+
259
+ // Check cache first
260
+ const cacheKey = `${this.currentIndex}_${text.substring(0, 100)}`;
261
+ if (this.markdownCache.has(cacheKey)) {
262
+ return this.markdownCache.get(cacheKey);
263
+ }
264
+
265
+ try {
266
+ // Configure marked options for security
267
+ const renderer = new marked.Renderer();
268
+
269
+ // Override link rendering to open in new tab and sanitize
270
+ const self = this;
271
+ renderer.link = function(href, title, text) {
272
+ // Basic URL sanitization
273
+ const safeHref = href.replace(/javascript:/gi, '').replace(/data:/gi, '');
274
+ const safeTitle = (title || '').replace(/"/g, '&quot;');
275
+ const safeText = self.escapeHtml(text);
276
+ return `<a href="${safeHref}" title="${safeTitle}" target="_blank" rel="noopener noreferrer" class="text-blue-600 dark:text-blue-400 underline">${safeText}</a>`;
277
+ };
278
+
279
+ // Override image rendering for safety
280
+ renderer.image = function(href, title, text) {
281
+ const safeHref = href.replace(/javascript:/gi, '').replace(/data:/gi, '');
282
+ const safeTitle = (title || '').replace(/"/g, '&quot;');
283
+ const safeAlt = self.escapeHtml(text);
284
+ return `<img src="${safeHref}" alt="${safeAlt}" title="${safeTitle}" class="max-w-full h-auto rounded">`;
285
+ };
286
+
287
+ // Override HTML rendering to prevent XSS but allow safe table elements
288
+ renderer.html = function(html) {
289
+ // Allow specific safe HTML tags for tables
290
+ const allowedTags = ['table', 'thead', 'tbody', 'tr', 'th', 'td'];
291
+ const tagPattern = new RegExp(`</?(?:${allowedTags.join('|')})(?:\\s[^>]*)?>`, 'gi');
292
+
293
+ // Check if the HTML contains only allowed tags
294
+ const strippedHtml = html.replace(tagPattern, '');
295
+ const hasDisallowedTags = /<[^>]+>/.test(strippedHtml);
296
+
297
+ if (!hasDisallowedTags) {
298
+ // Return the HTML if it only contains allowed table tags
299
+ return html;
300
+ }
301
+
302
+ // Strip all HTML by default
303
+ return '';
304
+ };
305
+
306
+ marked.setOptions({
307
+ renderer: renderer,
308
+ breaks: true, // Convert \n to <br>
309
+ gfm: true, // GitHub Flavored Markdown
310
+ pedantic: false,
311
+ smartLists: true,
312
+ smartypants: true,
313
+ headerIds: false, // Disable header IDs for security
314
+ mangle: false, // Don't mangle email addresses
315
+ sanitize: false, // We handle sanitization ourselves
316
+ tables: true // Enable table parsing
317
+ });
318
+
319
+ // Render markdown
320
+ let html = marked.parse(text);
321
+
322
+ // Add Tailwind classes to common elements
323
+ html = html.replace(/<h1>/g, '<h1 class="text-2xl font-bold mb-4 text-gray-900 dark:text-gray-100">');
324
+ html = html.replace(/<h2>/g, '<h2 class="text-xl font-bold mb-3 text-gray-900 dark:text-gray-100">');
325
+ html = html.replace(/<h3>/g, '<h3 class="text-lg font-bold mb-2 text-gray-900 dark:text-gray-100">');
326
+ html = html.replace(/<h4>/g, '<h4 class="text-base font-bold mb-2 text-gray-900 dark:text-gray-100">');
327
+ html = html.replace(/<p>/g, '<p class="mb-4 text-gray-900 dark:text-gray-100">');
328
+ html = html.replace(/<ul>/g, '<ul class="list-disc list-inside mb-4 text-gray-900 dark:text-gray-100">');
329
+ html = html.replace(/<ol>/g, '<ol class="list-decimal list-inside mb-4 text-gray-900 dark:text-gray-100">');
330
+ html = html.replace(/<li>/g, '<li class="mb-1">');
331
+ html = html.replace(/<blockquote>/g, '<blockquote class="border-l-4 border-gray-300 dark:border-gray-600 pl-4 my-4 italic text-gray-700 dark:text-gray-300">');
332
+ html = html.replace(/<code>/g, '<code class="bg-gray-100 dark:bg-gray-800 px-1 py-0.5 rounded text-sm font-mono">');
333
+ html = html.replace(/<pre>/g, '<pre class="bg-gray-100 dark:bg-gray-800 p-4 rounded-lg overflow-x-auto mb-4">');
334
+ html = html.replace(/<table>/g, '<table class="min-w-full divide-y divide-gray-300 dark:divide-gray-600 mb-4 border border-gray-300 dark:border-gray-600">');
335
+ html = html.replace(/<thead>/g, '<thead class="bg-gray-50 dark:bg-gray-800">');
336
+ html = html.replace(/<tbody>/g, '<tbody class="bg-white dark:bg-gray-900 divide-y divide-gray-200 dark:divide-gray-700">');
337
+ html = html.replace(/<tr>/g, '<tr class="hover:bg-gray-50 dark:hover:bg-gray-800">');
338
+ html = html.replace(/<th>/g, '<th class="px-3 py-3.5 text-left text-sm font-semibold text-gray-900 dark:text-gray-100 border border-gray-300 dark:border-gray-600">');
339
+ html = html.replace(/<td>/g, '<td class="px-3 py-4 text-sm text-gray-900 dark:text-gray-100 border border-gray-300 dark:border-gray-600">');
340
+
341
+ const result = `<div class="prose prose-sm dark:prose-invert max-w-none">${html}</div>`;
342
+
343
+ // Cache the result (limit cache size to prevent memory issues)
344
+ if (this.markdownCache.size > 50) {
345
+ // Remove oldest entries
346
+ const firstKey = this.markdownCache.keys().next().value;
347
+ this.markdownCache.delete(firstKey);
348
+ }
349
+ this.markdownCache.set(cacheKey, result);
350
+
351
+ return result;
352
+ } catch (error) {
353
+ console.error('Markdown rendering error:', error);
354
+ return text;
355
+ }
356
+ },
357
+
358
+ getImprovedTextRendered() {
359
+ const text = this.getImprovedText();
360
+ return this.renderMarkdownText(text);
361
+ },
362
+
363
  getImageData() {
364
  if (!this.currentSample) return null;
365
  const columns = this.api.detectColumns(null, this.currentSample);
 
397
  case 'line':
398
  this.diffHtml = createLineDiff(original, improved);
399
  break;
400
+ case 'markdown':
401
+ // Pass the render function bound to this context
402
+ this.diffHtml = createMarkdownDiff(original, improved, (text) => this.renderMarkdownText(text));
403
+ break;
404
  }
405
  },
406
 
js/diff-utils.js CHANGED
@@ -216,4 +216,42 @@ function calculateSimilarity(original, improved) {
216
  const maxLength = Math.max(original.length, improved.length);
217
 
218
  return Math.round((lcsLength / maxLength) * 100);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
219
  }
 
216
  const maxLength = Math.max(original.length, improved.length);
217
 
218
  return Math.round((lcsLength / maxLength) * 100);
219
+ }
220
+
221
/**
 * Build a two-column view: the original text as escaped plain text on the
 * left, the improved text rendered by `renderFunction` on the right.
 *
 * The improved text falls back to escaped plain text when no render function
 * is supplied, when the render function throws, or when it returns something
 * other than a string. A failing renderer therefore never breaks the whole
 * diff, and "undefined" is never injected into the markup.
 *
 * @param {string} original - Original text; shown HTML-escaped.
 * @param {string} improved - Improved (markdown) text.
 * @param {function(string): string} [renderFunction] - Turns `improved` into
 *     an HTML string. Its output is inserted as-is, so it must be safe HTML.
 * @returns {string} HTML for the comparison, or a "No text to compare"
 *     paragraph when either text is empty.
 */
function createMarkdownDiff(original, improved, renderFunction) {
    if (!original || !improved) {
        return '<p class="text-gray-500">No text to compare</p>';
    }

    const improvedAsPlainText = () =>
        `<pre class="whitespace-pre-wrap font-mono text-xs">${escapeHtml(improved)}</pre>`;

    let improvedHtml;
    if (typeof renderFunction === 'function') {
        try {
            const rendered = renderFunction(improved);
            improvedHtml = typeof rendered === 'string' ? rendered : improvedAsPlainText();
        } catch (error) {
            console.error('Markdown diff rendering error:', error);
            improvedHtml = improvedAsPlainText();
        }
    } else {
        improvedHtml = improvedAsPlainText();
    }

    return [
        '<div class="grid grid-cols-2 gap-6">',

        // Original text (plain)
        '<div>',
        '<h4 class="text-sm font-medium text-gray-700 dark:text-gray-300 mb-3">Original (Plain Text)</h4>',
        '<div class="font-mono text-xs bg-gray-50 dark:bg-gray-800 text-gray-900 dark:text-gray-100 p-4 rounded-lg overflow-x-auto">',
        `<pre class="whitespace-pre-wrap">${escapeHtml(original)}</pre>`,
        '</div>',
        '</div>',

        // Improved text (rendered markdown)
        '<div>',
        '<h4 class="text-sm font-medium text-gray-700 dark:text-gray-300 mb-3">Improved (Rendered Markdown)</h4>',
        '<div class="bg-gray-50 dark:bg-gray-800 p-4 rounded-lg overflow-x-auto">',
        improvedHtml,
        '</div>',
        '</div>',

        '</div>'
    ].join('');
}